#!/usr/bin/env python3
"""Interactively split a dataframe into one output file per value combination.

The user picks a source file, an output base name, an output format and the
columns to split on; every combination of values in those columns is then
written to its own file.
"""

# Delete `import ray` and the `ray.init(...)` call if not using modin.
#
# TODO: add try: except: around the `import ray`, `import modin.pandas`.
import ray
import logging
import os
import sys

# These two statements must run before modin is imported.
os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"
ray.init(log_to_driver=False, logging_level=logging.FATAL)

import modin.pandas as pd
from typing import List, Tuple
from enum import Enum
from dataclasses import dataclass
from utilfn import *
import time

# Every character accepted as a delimiter between selected field numbers is
# normalised to ";" before splitting.
_SELECTION_SEPARATORS = str.maketrans({ch: ";" for ch in ",.\t "})

# Characters replaced with "-" when building an output file name.
# Possibly add spaces into this as well?
_FILENAME_REPLACEMENTS = str.maketrans({ch: "-" for ch in ":;/\\"})


def _is_missing(value) -> bool:
    """Return True when *value* is a scalar missing marker (NaN, None, NaT...)."""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values make `pd.isna` return an array whose truth value
        # is ambiguous; such a value is not a missing marker.
        return False


def query_split(df: pd.DataFrame) -> List[str]:
    """Ask the user which columns of *df* to split on.

    The columns are listed with 1-based numbers and the user enters the
    numbers of the wanted columns. Commas, periods, semicolons, tabs and
    spaces are all accepted as delimiters. Entering "quit" or "exit" ends
    the program.

    Returns:
        The labels of the selected columns, in column order.
    """
    columns = list(df.columns)
    for number, label in enumerate(columns, start=1):
        print(f"{number}: {label}")

    sel = input("Select the fields to split on, delimited by commas: ")
    if sel.strip().lower() in ("quit", "exit"):
        print("Quitting...")
        sys.exit()

    chosen = set(sel.strip().translate(_SELECTION_SEPARATORS).split(";"))
    ret: List[str] = [
        label
        for number, label in enumerate(columns, start=1)
        if str(number) in chosen
    ]

    print(f"Selections are: {ret}")
    return ret


def make_valid_fn(fn: str) -> str:
    """Return *fn* with ":", ";", "/" and "\\" each replaced by "-"."""
    return fn.translate(_FILENAME_REPLACEMENTS)


def split(filename: str, df: pd.DataFrame, splits: List[str],
          out_filetype: Filetype, update_user: bool = True,
          show_timings: bool = True):
    """Split *df* on the given columns and save every resulting piece.

    Args:
        filename: the output base filename; each piece is named
            "<filename> - <value> - <value> ...".
        df: the input dataframe.
        splits: the columns to split on, applied in order.
        out_filetype: either Excel or Csv format to save in.
        update_user: when true, print progress while working.
        show_timings: forwarded to `PrintOptions` together with
            *update_user*.

    Exits the program when *splits* or *filename* is empty.
    """
    print_options = PrintOptions(update_user, show_timings)

    ### Checks ###
    if not splits:
        print("No values selected to split on! Quitting...")
        sys.exit()

    if filename == "":
        print("No filename provided! Quitting...")
        sys.exit()

    parents: List[Tuple[str, pd.DataFrame]] = [(make_valid_fn(filename), df)]

    ### Split the file into individual dataframes for each split ###
    begin_calc_split_time = time.time()
    for outer_index, entry in enumerate(splits):
        # NOTE(review): values come from the whole frame, so a combination
        # that never occurs inside a given parent still yields an (empty)
        # child that is saved later - confirm whether that is wanted.
        values = df[entry].unique().tolist()
        children: List[Tuple[str, pd.DataFrame]] = []
        for inner_index, (name, parent) in enumerate(parents):
            if update_user:
                print_noendl(f"Calculating splits for ({outer_index + 1} of {len(splits)}) {entry}: entry {inner_index + 1} of {len(parents)} ")

            # Index with the label itself, exactly like `df[entry]` above, so
            # non-string column labels work too.
            column = parent[entry]
            for val in values:
                # `column == val` never matches a missing value, so rows with
                # a missing split value need the explicit `isna()` mask or
                # they would be dropped from every output.
                mask = column.isna() if _is_missing(val) else column == val
                children.append((f'{name} - {val}', parent[mask]))

            if update_user:
                reprint_noendl('')

        # With each iteration, the children become the next parents.
        parents = children

    cond_print(print_options, "Calculating splits - done!",
               f"Took {delta_now(begin_calc_split_time)}s.",
               suffix=(' ' * 79) + '\n')

    ### Save all the dataframes ###
    time_begin_all_saving = time.time()
    for i, (name, v) in enumerate(parents):
        name = make_valid_fn(name)
        if update_user:
            # No extension is shown: the format depends on `out_filetype`.
            print_noendl(f"({i + 1}/{len(parents)}): Saving {name}... ")

        time_begin_save = time.time()
        save_df(v, out_filetype, name)
        cond_print(print_options, "saved",
                   f'in {delta_now(time_begin_save)}s', suffix=".\n")

    cond_print(print_options, "",
               f"Took {delta_now(time_begin_all_saving)}s to save all files.\n")


def main():
    """Run the interactive flow: open a file, ask for options, split, save."""
    reply = ask_open_file()
    df = reply.df
    # `len(df.columns)` also works for a frame with no rows, where
    # `df.iloc[0]` would raise IndexError.
    print(f"Dataframe has {len(df)} rows and {len(df.columns)} columns.")

    out = input("Enter the name of the output file (w/o extension): ")
    ft = ask_excel_or_csv()
    splits = query_split(df)
    split(out, df, splits, ft)


if __name__ == "__main__":
    main()