summaryrefslogtreecommitdiff
path: root/split.py
diff options
context:
space:
mode:
Diffstat (limited to 'split.py')
-rwxr-xr-xsplit.py125
1 files changed, 125 insertions, 0 deletions
diff --git a/split.py b/split.py
new file mode 100755
index 0000000..f88d05e
--- /dev/null
+++ b/split.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+
+# Delete `import ray` and initting if not using modin. #
+# TODO: add try: except: around the `import ray`, `import modin.pandas`.
+import ray
+import logging
+import os
+os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"
+
+ray.init(log_to_driver=False, logging_level=logging.FATAL)
+
+import modin.pandas as pd
+
+from typing import List
+from enum import Enum
+from dataclasses import dataclass
+from utilfn import *
+import time
+
+
def query_split(df: pd.DataFrame) -> List[str]:
    """Interactively ask the user which columns of *df* to split on.

    Prints a 1-based numbered menu of the column labels, reads one line from
    standard input and returns the labels whose numbers were chosen.  Numbers
    may be separated by commas, semicolons, periods, tabs or spaces.

    The returned list follows the column order of *df*, not the order in
    which the numbers were typed.  Tokens that do not match a menu number are
    reported and ignored.

    Raises:
        SystemExit: if the user answers ``quit`` or ``exit`` (any case,
            surrounding whitespace ignored).
    """
    # Iterating a DataFrame yields its column labels; ask for them directly.
    columns = list(df.columns)
    for number, label in enumerate(columns, start=1):
        print(f"{number}: {label}")

    # Strip *before* the quit check so that e.g. " quit" is recognised too.
    sel = input("Select the fields to split on, delimited by commas: ").strip()
    if sel.lower() in ("quit", "exit"):
        print("Quitting...")
        # `quit()` only exists when the `site` module is loaded.
        raise SystemExit

    # Normalise every accepted delimiter to ';' in one pass, then drop the
    # empty tokens produced by runs such as ", ".
    delimiters = str.maketrans(",.\t ", ";;;;")
    chosen = {token for token in sel.translate(delimiters).split(";") if token}

    ret: List[str] = [
        label
        for number, label in enumerate(columns, start=1)
        if str(number) in chosen
    ]

    unknown = sorted(chosen - {str(n) for n in range(1, len(columns) + 1)})
    if unknown:
        print(f"Ignoring unrecognised selections: {unknown}")

    print(f"Selections are: {ret}")
    return ret
+
def make_valid_fn(fn: str) -> str:
    """Return *fn* with characters that are unsafe in a file name replaced by ``-``.

    Besides the path separators and ``:``/``;``, the remaining characters
    that Windows reserves in file names (``< > " | ? *``) are replaced as
    well, because data values are embedded in the generated names.  Every
    other character, including spaces and dots, is kept as is.
    """
    # Possibly add spaces into this as well?
    unsafe = '<>:"/\\|?*;'
    # One translation pass instead of a chain of `.replace()` calls.
    return fn.translate(str.maketrans(dict.fromkeys(unsafe, "-")))
+
+
def split(filename: str, df: pd.DataFrame, splits: List[str],
          out_filetype: Filetype,
          update_user: bool = True,
          show_timings: bool = True):
    """Split *df* by the distinct values of the *splits* columns and save each part.

    Args:
        filename: base name of the output files; it is passed through
            ``make_valid_fn`` and each split value is appended as
            ``" - <value>"``.
        df: the input dataframe.
        splits: the column labels to split on, applied in order so that every
            later column subdivides the parts produced by the earlier ones.
        out_filetype: format handed to ``save_df`` (either Excel or Csv).
        update_user: when true, print progress while splitting and saving.
        show_timings: forwarded to ``PrintOptions`` together with
            *update_user*.

    Raises:
        SystemExit: if *splits* is empty or *filename* is the empty string.

    Only combinations of values that actually occur are written.  Rows whose
    split column is missing (NaN) never compare equal to a value and so end
    up in no output file.
    """
    print_options = PrintOptions(update_user, show_timings)

    ### Checks ###
    if not splits:
        print("No values selected to split on! Quitting...")
        raise SystemExit
    if filename == "":
        print("No filename provided! Quitting...")
        raise SystemExit

    # Each entry is a (name, dataframe) pair.
    parents: List[tuple] = [(make_valid_fn(filename), df)]

    ### Split the file into individual dataframes for each split ###
    begin_calc_split_time = time.time()
    for outer_index, entry in enumerate(splits, start=1):
        # With each iteration, the children become the next parents.
        children = []
        for inner_index, (name, parent) in enumerate(parents, start=1):
            if update_user:
                print_noendl(f"Calculating splits for ({outer_index} of {len(splits)}) {entry}: entry {inner_index} of {len(parents)} ")

            # Use the values present in *this* parent (not in the whole
            # frame) so that no empty part is created for a combination
            # that does not occur.  Index with the label itself so that
            # non-string column labels keep working.
            column = parent[entry]
            for val in column.unique().tolist():
                view = parent[column == val]
                if view.empty:
                    # e.g. NaN, which is never equal to itself.
                    continue
                children.append((f'{name} - {val}', view))

            if update_user:
                reprint_noendl('')
        parents = children

    cond_print(print_options, "Calculating splits - done!", f"Took {delta_now(begin_calc_split_time)}s.", suffix = (' ' * 79) + '\n')

    ### Save all the dataframes ###
    time_begin_all_saving = time.time()
    for i, (name, v) in enumerate(parents, start=1):
        # Split values may contain characters that are unsafe in file names.
        name = make_valid_fn(name)
        if update_user:
            # No extension is shown: the actual one depends on out_filetype.
            print_noendl(f"({i}/{len(parents)}): Saving {name}... ")

        time_begin_save = time.time()
        save_df(v, out_filetype, name)
        cond_print(print_options, f"saved", f'in {delta_now(time_begin_save)}s', suffix=".\n")

    cond_print(print_options, "", f"Took {delta_now(time_begin_all_saving)}s to save all files.\n")
+
+
def main():
    """Run the interactive workflow: open a file, ask how to split it, save the parts.

    The dataframe comes from the ``df`` attribute of whatever
    ``ask_open_file`` returns; the output name, the output format and the
    split columns are then asked for in that order and handed to ``split``.
    """
    reply = ask_open_file()
    df = reply.df
    # `df.shape` also works for a frame without rows, where `df.iloc[0]`
    # would raise IndexError.
    n_rows, n_cols = df.shape
    print(f"Dataframe has {n_rows} rows and {n_cols} columns.")

    out = input("Enter the name of the output file (w/o extension): ")

    ft = ask_excel_or_csv()

    splits = query_split(df)

    split(out, df, splits, ft)


if __name__ == "__main__":
    main()