diff options
-rw-r--r-- | GUIDELINES.md | 7 | ||||
-rwxr-xr-x | split.py | 125 | ||||
-rw-r--r-- | utilfn.py | 122 | ||||
-rw-r--r-- | utiltypes.py | 8 |
4 files changed, 262 insertions, 0 deletions
diff --git a/GUIDELINES.md b/GUIDELINES.md
new file mode 100644
index 0000000..8e9a1ee
--- /dev/null
+++ b/GUIDELINES.md
@@ -0,0 +1,7 @@
+Naming Conventions
+------------------
+Classes: AllCaps
+Enums: class EnumClass(Enum): Opt1; Opt2
+Local vars: local_variable
+
+Use 4 spaces not tabs.
diff --git a/split.py b/split.py
new file mode 100755
index 0000000..f88d05e
--- /dev/null
+++ b/split.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+
+# Delete `import ray` and the `ray.init(...)` call below if not using modin.
+# TODO: add try: except: around the `import ray`, `import modin.pandas`.
+import ray
+import logging
+import os
+os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"
+
+ray.init(log_to_driver=False, logging_level=logging.FATAL)
+
+import modin.pandas as pd
+
+from typing import List, Tuple
+from enum import Enum
+from dataclasses import dataclass
+from utilfn import *
+import time
+
+# Lists the columns of `df`, asks which to split on; returns the chosen labels.
+def query_split(df: pd.DataFrame) -> List[str]:
+    ret: List[str] = []
+    header = list(df.columns)  # column labels, in display order
+    for idx, val in enumerate(header):
+        print(f"{idx + 1}: {val}")
+    sel = input("Select the fields to split on, delimited by commas: ")
+    if sel.lower().strip() in [ "quit", "exit" ]:
+        print("Quitting...")
+        quit()
+
+    sellist = sel.strip()\
+        .replace(",", ";")\
+        .replace(".", ";")\
+        .replace("\t", ";")\
+        .replace(" ", ";")\
+        .split(";")
+
+    for idx, val in enumerate(header):
+        if str(idx + 1) in sellist:
+            # The 1-based number shown to the user was typed, so add it.
+            ret.append(val)
+    print(f"Selections are: {ret}")
+    return ret
+
+def make_valid_fn(fn: str) -> str:
+    # Swap `:`, `;`, `/` and `\` for `-`. Possibly add spaces into this as well?
+    return fn.replace(":", "-") \
+        .replace(";", "-")\
+        .replace("/", "-")\
+        .replace("\\", "-")
+
+# Splits `df` once per column in `splits` and saves every resulting piece.
+def split(filename: str, df: pd.DataFrame, splits: List[str],
+          out_filetype: Filetype,
+          update_user: bool = True,
+          show_timings: bool = True):
+    # filename: base name for the output files (extension is added on save).
+    # df: the input dataframe.
+    # splits: the columns to split on, applied in order.
+    # out_filetype: Filetype.Excel or Filetype.Csv - the format to save in.
+    print_options = PrintOptions(update_user, show_timings)
+
+    ### Checks ###
+    if not splits:
+        print("No values selected to split on! Quitting...")
+        quit()
+    if filename == "":
+        print("No filename provided! Quitting...")
+        quit()
+
+    parents: List[Tuple[str, pd.DataFrame]] = [(make_valid_fn(filename), df)]
+    children = []
+
+    ### Split the file into individual dataframes for each split ###
+    begin_calc_split_time = time.time()
+    for outer_index, entry in enumerate(splits):
+        values = df[entry].unique().tolist()
+        for inner_index, (name, parent) in enumerate(parents):
+            if update_user:
+                print_noendl(f"Calculating splits for ({outer_index + 1} of {len(splits)}) {entry}: entry {inner_index + 1} of {len(parents)} ")
+            for val in values:
+                # For each value which is different, create the new index and
+                # add to children - with each iteration, the children become
+                # the next parents.
+                key = entry  # use the label as-is, matching `df[entry]` above
+                view = parent[parent[key] == val]
+                children.append((f'{name} - {val}', view))
+
+            if update_user:
+                reprint_noendl('')
+        parents = children
+        children = []
+
+    cond_print(print_options, "Calculating splits - done!", f"Took {delta_now(begin_calc_split_time)}s.", suffix = (' ' * 79) + '\n')
+
+    ### Save all the dataframes ###
+    time_begin_all_saving = time.time()
+    for i, (name, v) in enumerate(parents):
+        name = make_valid_fn(name)
+        if update_user:
+            print_noendl(f"({i + 1}/{len(parents)}): Saving {name}.{'xlsx' if out_filetype == Filetype.Excel else 'csv'}... ")
+
+        time_begin_save = time.time()
+        save_df(v, out_filetype, name)
+        cond_print(print_options, f"saved", f'in {delta_now(time_begin_save)}s', suffix=".\n")
+
+    cond_print(print_options, "", f"Took {delta_now(time_begin_all_saving)}s to save all files.\n")
+
+# Asks for the input file, output name, output format and columns, then splits.
+def main():
+    reply = ask_open_file()
+    df = reply.df
+    print(f"Dataframe has {len(df)} rows and {len(df.columns)} columns.")
+
+    out = input("Enter the name of the output file (w/o extension): ")
+
+    ft = ask_excel_or_csv()
+
+    splits = query_split(df)
+
+    split(out, df, splits, ft)
+
+if __name__ == "__main__":
+    main()
diff --git a/utilfn.py b/utilfn.py
new file mode 100644
index 0000000..e062d1b
--- /dev/null
+++ b/utilfn.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# A set of functions and types to be included for general use.
+from enum import Enum
+from typing import *
+import time
+import modin.pandas as pd
+from dataclasses import dataclass
+
+# Local imports
+from utiltypes import *
+
+
+# Returns the seconds elapsed since `t`, rounded to `dp` places; use e.g. with print(f"{delta_now(time_begin)}")
+def delta_now(t: float, dp: int = 2) -> float:
+    return round(time.time() - t, dp)
+
+def ask_file(prompt: str = "What file should be opened: ") -> str:
+    filename = input(prompt)
+    return filename
+
+def open_file(file: str) -> pd.DataFrame:
+    if file.endswith(".csv"):
+        return pd.read_csv(file)
+    else:
+        return pd.read_excel(file)
+
+# Saves `df` to `{name}.xlsx` or `{name}.csv`; save errors propagate to the caller.
+# TODO: add error handling.
+def save_df(df: pd.DataFrame, filetype: Filetype, name: str, sheet_name = None):
+    if sheet_name is None:
+        sheet_name = name[:31]  # Excel limits sheet names to 31 characters
+
+    if filetype == Filetype.Excel:
+        df.to_excel(f"{name}.xlsx", sheet_name = f"{sheet_name}", index = False)
+    else:
+        df.to_csv(f"{name}.csv", index = False)
+
+@dataclass
+class PrintOptions:
+    update_user: bool
+    show_timings: bool
+
+def print_noendl(s: str):
+    print(s, end = '', flush = True)
+
+def reprint_noendl(s: str):
+    print(f'\r{s}', end = '', flush = True)
+
+# Prints `s` (plus `timing_str` when timings are shown), then `suffix`. Please supply your own \n if you need it.
+def cond_print(print_options: PrintOptions, s: str, timing_str: Optional[str] = None, suffix = ""):
+    if timing_str and print_options.show_timings:
+        print(s, timing_str, end = "", flush = True)
+    elif print_options.update_user:
+        print(s, end = "", flush = True)
+    print(suffix, end = "", flush = True)
+
+@dataclass
+class AskOpenFileReply:
+    df: pd.DataFrame
+    fname: str
+    sheet: str
+    time_to_open_file: float
+
+# Asks for a filename, and opens it; keeps asking until a file opens or the user types quit/exit.
+def ask_open_file(show_timings: bool = True) -> AskOpenFileReply:
+    while True:
+        try:
+            file = ask_file()
+            if file.lower().strip() in [ "quit", "exit" ]:
+                print("Quitting...")
+                quit()
+
+            print("Opening file... ", end="", flush=True)
+            # TODO: add in to open a specific sheet; prompt if more than one
+            # sheet. At present it just opens the first one. Use optparam
+            # sheet_name=, with either str of sheet name or int index (deflt 0)
+            start_time = time.time()
+            df = open_file(file)
+            time_taken = time.time() - start_time
+            if show_timings: print(f" opened in {round(time_taken, 2)}s!")
+            else: print( " opened!")
+            break
+        except Exception as exc:
+            print(" File failed to open or could not be found.", flush=True)
+            print(f"Details: {exc}")
+
+    # TODO: set `sheet` to the opened sheet's name once sheet selection exists.
+    reply = AskOpenFileReply(df = df, fname = file, sheet = "", time_to_open_file = time_taken)
+    return reply
+
+def ask_excel_or_csv(given: Optional[str] = None, prompt: bool = True) -> Filetype:
+    # prompt: if False, throw if given not valid instead of prompting the user.
+    exit_options = [ "exit", "quit" ]
+    excel_options = [ "xl", "excel", "xlsx", ".xlsx" ]
+    csv_options = [ "csv", ".csv" ]
+
+    if given:
+        given = given.lower().strip()
+        if given in exit_options:
+            quit()
+        elif given in excel_options:
+            return Filetype.Excel
+        elif given in csv_options:
+            return Filetype.Csv
+        # Do not add an else clause as we need to handle the case when `given`
+        # is `None` in the below prompt.
+
+    if not prompt:
+        # TODO: change this to be a more specific exception type.
+        raise Exception(f"ask_excel_or_csv: passed {given} as parameter with prompt == False")
+
+    while True:
+        answer = input("Should I output as `.xlsx` or `.csv` (`csv` is much faster)? [csv, excel, quit] ")
+        answer = answer.lower().strip()
+        if answer in exit_options:
+            quit()
+        elif answer in csv_options:
+            return Filetype.Csv
+        elif answer in excel_options:
+            return Filetype.Excel
+        else:
+            print("Invalid input - I want one of `csv`, `excel`, or `quit`.")
diff --git a/utiltypes.py b/utiltypes.py
new file mode 100644
index 0000000..69ed918
--- /dev/null
+++ b/utiltypes.py
@@ -0,0 +1,8 @@
+from enum import Enum
+from typing import *
+from dataclasses import dataclass
+
+class Filetype(Enum):
+    Excel = 0
+    Csv = 1
+
|