diff options
Diffstat (limited to 'split.py')
-rwxr-xr-x | split.py | 125 |
1 files changed, 125 insertions, 0 deletions
#!/usr/bin/env python3
"""Interactively split a table into one output file per value combination.

The user picks an input file, an output base name, an output format and one
or more columns.  The dataframe is then partitioned by the distinct values of
each chosen column in turn, and every resulting partition is saved under a
name of the form ``<base> - <value1> - <value2> ...``.
"""

# Delete `import ray` and the `ray.init(...)` call if not using modin.
# TODO: add try/except around `import ray` and `import modin.pandas`.
import ray
import logging
import os
import re
import sys

# Set before `modin.pandas` is imported below (ordering kept as-is).
os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"

ray.init(log_to_driver=False, logging_level=logging.FATAL)

import modin.pandas as pd

from typing import List, Tuple
from enum import Enum
from dataclasses import dataclass
from utilfn import *
import time

# Characters accepted as separators between column numbers typed by the user.
_SELECTION_DELIMITERS = re.compile(r"[;,.\t ]")

# Characters replaced by "-" when building an output file name.
_FILENAME_TRANSLATION = str.maketrans({":": "-", ";": "-", "/": "-", "\\": "-"})


def query_split(df: pd.DataFrame) -> List[str]:
    """Ask the user which columns of *df* to split on.

    Prints every column with a 1-based number, reads one line of input and
    returns the labels of the columns whose numbers were entered, in column
    order.  Numbers may be separated by commas, semicolons, periods, tabs or
    spaces; tokens that match no column number are ignored.  Entering
    ``quit`` or ``exit`` terminates the program.
    """
    columns = list(df.columns)
    for number, column in enumerate(columns, start=1):
        print(f"{number}: {column}")

    sel = input("Select the fields to split on, delimited by commas: ")
    if sel.strip().lower() in ("quit", "exit"):
        print("Quitting...")
        sys.exit()

    chosen = set(_SELECTION_DELIMITERS.split(sel.strip()))
    ret = [
        column
        for number, column in enumerate(columns, start=1)
        if str(number) in chosen
    ]
    print(f"Selections are: {ret}")
    return ret


def make_valid_fn(fn: str) -> str:
    """Return *fn* with ``:``, ``;``, ``/`` and ``\\`` each replaced by ``-``."""
    # Possibly add spaces into this as well?
    return fn.translate(_FILENAME_TRANSLATION)


def split(filename: str, df: pd.DataFrame, splits: List[str],
          out_filetype: Filetype,
          update_user: bool = True,
          show_timings: bool = True) -> None:
    """Partition *df* by the columns in *splits* and save every partition.

    Args:
        filename: base name of the output files; each partition's value is
            appended as ``" - <value>"`` once per split column.
        df: the input dataframe.
        splits: the columns to split on, applied in order.
        out_filetype: output format, passed through to ``save_df``.
        update_user: whether to print progress messages.
        show_timings: forwarded to ``PrintOptions`` together with
            ``update_user``.

    Exits the program if *splits* or *filename* is empty.
    """
    print_options = PrintOptions(update_user, show_timings)

    ### Checks ###
    if not splits:
        print("No values selected to split on! Quitting...")
        sys.exit()
    if not filename:
        print("No filename provided! Quitting...")
        sys.exit()

    parents: List[Tuple[str, pd.DataFrame]] = [(make_valid_fn(filename), df)]

    ### Split the file into individual dataframes for each split ###
    begin_calc_split_time = time.time()
    for outer_index, entry in enumerate(splits):
        children: List[Tuple[str, pd.DataFrame]] = []
        for inner_index, (name, parent) in enumerate(parents):
            if update_user:
                print_noendl(f"Calculating splits for ({outer_index + 1} of {len(splits)}) {entry}: entry {inner_index + 1} of {len(parents)} ")

            # Take the distinct values from this parent only, so that no
            # empty partition is produced for a value combination that does
            # not occur in the data.
            # NOTE(review): a missing value (NaN) never compares equal, so
            # its rows end up in an empty partition - confirm whether such
            # rows should be kept.
            for val in parent[entry].unique().tolist():
                # With each iteration over `splits`, the children become the
                # next parents.
                view = parent[parent[entry] == val]
                children.append((f'{name} - {val}', view))

            if update_user:
                reprint_noendl('')
        parents = children

    cond_print(print_options, "Calculating splits - done!", f"Took {delta_now(begin_calc_split_time)}s.", suffix = (' ' * 79) + '\n')

    ### Save all the dataframes ###
    time_begin_all_saving = time.time()
    for i, (name, v) in enumerate(parents, start=1):
        name = make_valid_fn(name)
        if update_user:
            # No extension is shown: the format depends on out_filetype.
            print_noendl(f"({i}/{len(parents)}): Saving {name}... ")

        time_begin_save = time.time()
        save_df(v, out_filetype, name)
        cond_print(print_options, "saved", f'in {delta_now(time_begin_save)}s', suffix=".\n")

    cond_print(print_options, "", f"Took {delta_now(time_begin_all_saving)}s to save all files.\n")


def main():
    """Run the interactive prompts and perform the split."""
    reply = ask_open_file()
    df = reply.df
    # len(df.columns) also works for a dataframe with no rows.
    print(f"Dataframe has {len(df)} rows and {len(df.columns)} columns.")

    out = input("Enter the name of the output file (w/o extension): ")

    ft = ask_excel_or_csv()

    splits = query_split(df)

    split(out, df, splits, ft)


if __name__ == "__main__":
    main()