Commit allHEAD master

author: George Abbott <george@gabbott.dev> 2023-11-03 21:26:37 +0000
committer: George Abbott <george@gabbott.dev> 2023-11-03 21:26:37 +0000
commit: f76a2b4e8dffe2044cd7532ca40554267f0d5454 (patch)
tree: b25fca668b027869c2f7030025217fadcc7b1df6
4 files changed, 262 insertions, 0 deletions
diff --git a/GUIDELINES.md b/GUIDELINES.md
new file mode 100644
index 0000000..8e9a1ee
--- /dev/null
+++ b/GUIDELINES.md
@@ -0,0 +1,7 @@
+Naming Conventions
+------------------
+Classes:	CapWords, e.g. AllCaps
+Enums:		class EnumClass(Enum): Opt1; Opt2
+Local vars:	local_variable
+
+Use 4 spaces not tabs.
diff --git a/split.py b/split.py
new file mode 100755
index 0000000..f88d05e
--- /dev/null
+++ b/split.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+
+# Delete `import ray` and the `ray.init(...)` call below if not using modin.
+# TODO: add try: except: around the `import ray`, `import modin.pandas`.
+import ray
+import logging
+import os
+os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"
+
+ray.init(log_to_driver=False, logging_level=logging.FATAL)
+
+import modin.pandas as pd
+
+from typing import List
+from enum import Enum
+from dataclasses import dataclass
+from utilfn import *
+import time
+
+
+def query_split(df: pd.DataFrame) -> List[str]:
+    """Ask the user which columns of `df` to split on.
+
+    Prints each column name with a 1-based index, reads one line of input and
+    returns the names of the columns whose index was entered. Indices may be
+    separated by commas, full stops, tabs or spaces; tokens that match no
+    column index are ignored. The result follows column order, not the order
+    in which the indices were typed.
+
+    Entering `quit` or `exit` (any case, surrounding whitespace ignored)
+    prints a message and terminates the program via `quit()`.
+    """
+    # Iterating a DataFrame yields its column labels; ask for them directly
+    # rather than iterating a one-row slice.
+    columns = list(df.columns)
+    for idx, val in enumerate(columns):
+        print(f"{idx + 1}: {val}")
+
+    # Strip first so that " quit " is recognised, as ask_open_file does.
+    sel = input("Select the fields to split on, delimited by commas: ").strip()
+    if sel.lower() in ("quit", "exit"):
+        print("Quitting...")
+        quit()
+
+    # Normalise every accepted delimiter to ';' and split once.
+    sellist = set(
+        sel.replace(",", ";")
+           .replace(".", ";")
+           .replace("\t", ";")
+           .replace(" ", ";")
+           .split(";")
+    )
+
+    ret: List[str] = [
+        val for idx, val in enumerate(columns) if str(idx + 1) in sellist
+    ]
+    print(f"Selections are: {ret}")
+    return ret
+
+def make_valid_fn(fn: str) -> str:
+    """Return `fn` with every ':', ';', '/' and backslash replaced by '-'."""
+    # Possibly add spaces into this as well?
+    unsafe_to_dash = str.maketrans({":": "-", ";": "-", "/": "-", "\\": "-"})
+    return fn.translate(unsafe_to_dash)
+
+
+def split(filename: str, df: pd.DataFrame, splits: List[str],
+        out_filetype: Filetype,
+        update_user: bool = True,
+        show_timings: bool = True):
+    """Split `df` by the columns in `splits` and save each piece to a file.
+
+    The frame is partitioned on the first column in `splits`, each partition
+    is partitioned again on the next column, and so on. Every final partition
+    is saved through `save_df` under the name
+    `<filename> - <value 1> - <value 2> ...`, passed through `make_valid_fn`.
+
+    Args:
+        filename: base name for the output files, without extension.
+        df: the input dataframe.
+        splits: the columns to split on, in order.
+        out_filetype: either Excel or Csv format to save in.
+        update_user: print progress messages while working.
+        show_timings: include timings in the summary messages.
+
+    Quits the program (via `quit()`) when `splits` or `filename` is empty.
+    Rows whose value in a split column is missing end up in no output file,
+    because a missing value never compares equal to anything.
+    """
+    print_options = PrintOptions(update_user, show_timings)
+
+    ### Checks ###
+    if not splits:
+        print("No values selected to split on! Quitting...")
+        quit()
+    if filename == "":
+        print("No filename provided! Quitting...")
+        quit()
+
+    parents: List[Tuple[str, pd.DataFrame]] = [(make_valid_fn(filename), df)]
+
+    ### Split the file into individual dataframes for each split ###
+    begin_calc_split_time = time.time()
+    for outer_index, entry in enumerate(splits):
+        # With each iteration, the children become the next parents.
+        children = []
+        for inner_index, (name, parent) in enumerate(parents):
+            if update_user:
+                print_noendl(f"Calculating splits for ({outer_index + 1} of {len(splits)}) {entry}: entry {inner_index + 1} of {len(parents)} ")
+            # Take the distinct values from this parent only: values that
+            # occur elsewhere in `df` would just produce empty output files.
+            values = parent[entry].dropna().unique().tolist()
+            for val in values:
+                # Index with `entry` itself so non-string column labels work.
+                view = parent[parent[entry] == val]
+                children.append((f'{name} - {val}', view))
+
+            if update_user:
+                reprint_noendl('')
+        parents = children
+
+    cond_print(print_options, "Calculating splits - done!", f"Took {delta_now(begin_calc_split_time)}s.", suffix = (' ' * 79) + '\n')
+
+    ### Save all the dataframes ###
+    # Mirror save_df's choice of extension so the progress line is accurate.
+    extension = "xlsx" if out_filetype == Filetype.Excel else "csv"
+    time_begin_all_saving = time.time()
+    for i, (name, v) in enumerate(parents):
+        # Split values may contain characters that are unsafe in filenames.
+        name = make_valid_fn(name)
+        if update_user:
+            print_noendl(f"({i + 1}/{len(parents)}): Saving {name}.{extension}... ")
+
+        time_begin_save = time.time()
+        save_df(v, out_filetype, name)
+        cond_print(print_options, f"saved", f'in {delta_now(time_begin_save)}s', suffix=".\n")
+
+    cond_print(print_options, "", f"Took {delta_now(time_begin_all_saving)}s to save all files.\n")
+
+
+def main():
+    """Run the interactive workflow: open a file, pick options, split it.
+
+    Asks in turn for the input file, the output base name, the output format
+    and the columns to split on, then hands everything to `split`.
+    """
+    reply = ask_open_file()
+    df = reply.df
+    # Count columns via df.columns: df.iloc[0] raises IndexError when the
+    # file has no data rows.
+    print(f"Dataframe has {len(df)} rows and {len(df.columns)} columns.")
+
+    out = input("Enter the name of the output file (w/o extension): ")
+
+    ft  = ask_excel_or_csv()
+
+    splits = query_split(df)
+
+    split(out, df, splits, ft)
+
+if __name__ == "__main__":
+    main()
diff --git a/utilfn.py b/utilfn.py
new file mode 100644
index 0000000..e062d1b
--- /dev/null
+++ b/utilfn.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# A set of functions and types to be included for general use. 
+from enum import Enum
+from typing import * 
+import time
+import modin.pandas as pd
+from dataclasses import dataclass
+
+# Local imports
+from utiltypes import *
+
+
+def delta_now(t: float, dp: int = 2) -> float:
+    """Return the seconds elapsed since `t`, rounded to `dp` decimal places.
+
+    `t` is an earlier `time.time()` reading. The result is meant for display,
+    e.g. print(f"{delta_now(time_begin)}").
+    """
+    elapsed = time.time() - t
+    return round(elapsed, dp)
+
+def ask_file(prompt: str = "What file should be opened: ") -> str:
+    """Show `prompt` and return the line the user types, unmodified."""
+    return input(prompt)
+
+def open_file(file: str) -> pd.DataFrame:
+    """Read `file` into a dataframe.
+
+    Names ending in `.csv` (in any letter case) are read with `pd.read_csv`;
+    everything else is handed to `pd.read_excel`. Nothing is caught here, so
+    exceptions raised by the reader propagate to the caller.
+    """
+    # Compare case-insensitively so that e.g. `DATA.CSV` is not sent to the
+    # Excel reader.
+    if file.lower().endswith(".csv"):
+        return pd.read_csv(file)
+    return pd.read_excel(file)
+
+# TODO: add error handling.
+def save_df(df: pd.DataFrame, filetype: Filetype, name: str, sheet_name = None):
+    """Write `df` to `<name>.xlsx` or `<name>.csv` without the index column.
+
+    Args:
+        df: the dataframe to save.
+        filetype: `Filetype.Excel` writes an Excel workbook; any other value
+            writes CSV.
+        name: output path without extension.
+        sheet_name: worksheet name for Excel output. Defaults to `name`,
+            cut to 31 characters because Excel does not allow longer
+            worksheet names. An explicitly passed name is used as given.
+
+    Returns None. Nothing is caught here, so a failed write raises whatever
+    the underlying writer raises.
+    """
+    if sheet_name is None:
+        sheet_name = name[:31]
+
+    if filetype == Filetype.Excel:
+        df.to_excel(f"{name}.xlsx", sheet_name = f"{sheet_name}", index = False)
+    else:
+        df.to_csv(f"{name}.csv", index = False)
+
+@dataclass
+class PrintOptions:
+    """Flags read by `cond_print` to decide what to write."""
+    # When True, cond_print prints its message even if no timing is shown.
+    update_user: bool
+    # When True, cond_print prints the message followed by the timing string
+    # whenever a timing string is supplied.
+    show_timings: bool
+
+def print_noendl(s: str):
+    """Print `s` without a trailing newline and flush stdout immediately."""
+    print(s, sep = '', end = '', flush = True)
+
+def reprint_noendl(s: str):
+    """Print a carriage return followed by `s`, with no newline, and flush.
+
+    The carriage return moves the cursor back to the start of the line, so
+    `s` is written over whatever the current line already shows.
+    """
+    print('\r', s, sep = '', end = '', flush = True)
+
+def cond_print(print_options: PrintOptions, s: str, timing_str: str = None, suffix = ""):
+    """Print `s`, optionally with a timing, according to `print_options`.
+
+    If `timing_str` is given and `show_timings` is set, `s` and `timing_str`
+    are printed separated by a space. Otherwise `s` alone is printed when
+    `update_user` is set. `suffix` is always printed last. No newline is ever
+    added: please supply your own in `suffix` or the strings if you need it.
+    """
+    if timing_str and print_options.show_timings:
+        pieces = (s, timing_str)
+    elif print_options.update_user:
+        pieces = (s,)
+    else:
+        pieces = None
+
+    if pieces is not None:
+        print(*pieces, end = "", flush = True)
+    print(suffix, end = "", flush = True)
+
+@dataclass
+class AskOpenFileReply:
+    """Result returned by `ask_open_file`."""
+    # The dataframe read from the chosen file.
+    df: pd.DataFrame
+    # The filename exactly as the user typed it.
+    fname: str
+    # Sheet that was opened; ask_open_file currently always stores "".
+    sheet: str
+    # Seconds spent inside open_file, unrounded.
+    time_to_open_file: float
+
+def ask_open_file(show_timings: bool = True) -> AskOpenFileReply:
+    """Ask for a filename and open it, re-asking until a file opens.
+
+    Typing `quit` or `exit` terminates the program via `quit()`. Any
+    exception raised while opening the file is reported and the user is
+    asked again.
+
+    Args:
+        show_timings: include the time taken in the "opened" message.
+
+    Returns:
+        An AskOpenFileReply holding the dataframe, the filename as typed, an
+        empty sheet name and the unrounded seconds spent opening the file.
+    """
+    while True:
+        # Prompt outside the try block: input() raises EOFError once stdin is
+        # exhausted, and treating that as a failed open would retry forever.
+        file = ask_file()
+        if file.lower().strip() in [ "quit", "exit" ]:
+            print("Quitting...")
+            quit()
+
+        print("Opening file... ", end="", flush=True)
+        # TODO: add in to open a specific sheet; prompt if more than one
+        # sheet. At present it just opens the first one. Use optparam
+        # sheet_name=, with either str of sheet name or int index (deflt 0)
+        start_time = time.time()
+        try:
+            df = open_file(file)
+        except Exception as exc:
+            print(" File failed to open or could not be found.", flush=True)
+            print(f"Details: {exc}")
+            continue
+        time_taken = time.time() - start_time
+
+        if show_timings: print(f" opened in {round(time_taken, 2)}s!")
+        else:            print( " opened!")
+        break
+
+    # TODO: make `sheet` sheet value when I add this logic.
+    reply = AskOpenFileReply(df = df, fname = file, sheet = "", time_to_open_file = time_taken)
+    return reply
+
+def ask_excel_or_csv(given: str = None, prompt: bool = True) -> Filetype:
+    """Resolve an output format from `given`, or by asking the user.
+
+    Args:
+        given: an optional pre-supplied answer such as "csv" or "xlsx". It is
+            lower-cased and stripped before being matched.
+        prompt: if False, throw if `given` is not valid instead of prompting
+            the user.
+
+    Returns:
+        `Filetype.Excel` or `Filetype.Csv`.
+
+    Raises:
+        Exception: when `prompt` is False and `given` is missing or invalid.
+
+    An exit word ("exit" or "quit"), whether given or typed, terminates the
+    program via `quit()`.
+    """
+    exit_options  = [ "exit", "quit" ]
+    excel_options = [ "xl", "excel", "xlsx", ".xlsx" ]
+    csv_options   = [ "csv", ".csv" ]
+
+    if given:
+        given = given.lower().strip()
+        if given in exit_options:
+            quit()
+        elif given in excel_options:
+            return Filetype.Excel
+        elif given in csv_options:
+            return Filetype.Csv
+        # Do not add an else clause as we need to handle the case when `given`
+        # is `None` in the below prompt.
+
+    if not prompt:
+        # TODO: change this to be a more specific exception type.
+        raise Exception(f"ask_excel_or_csv: passed {given} as parameter with prompt == False")
+
+    # Ask on every pass of the loop so that an invalid answer is followed by
+    # a fresh prompt rather than re-checking the same answer endlessly.
+    while True:
+        answer = input("Should I output as `.xlsx` or `.csv` (`csv` is much faster)? [csv, excel, quit] ")
+        answer = answer.lower().strip()
+        if answer in exit_options:
+            quit()
+        elif answer in csv_options:
+            return Filetype.Csv
+        elif answer in excel_options:
+            return Filetype.Excel
+        else:
+            print("Invalid input - I want one of `csv`, `excel`, or `quit`.")
diff --git a/utiltypes.py b/utiltypes.py
new file mode 100644
index 0000000..69ed918
--- /dev/null
+++ b/utiltypes.py
@@ -0,0 +1,8 @@
+from enum import Enum
+from typing import *
+from dataclasses import dataclass
+
+class Filetype(Enum):
+    """Output file formats a dataframe can be saved in."""
+    Excel = 0  
+    Csv   = 1
+
author	George Abbott <george@gabbott.dev>	2023-11-03 21:26:37 +0000
committer	George Abbott <george@gabbott.dev>	2023-11-03 21:26:37 +0000
commit	f76a2b4e8dffe2044cd7532ca40554267f0d5454 (patch)
tree	b25fca668b027869c2f7030025217fadcc7b1df6