summaryrefslogtreecommitdiff
path: root/split.py
blob: f88d05e52fd2879b3555dbd2c457232adde8b911 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3


# Remove `import ray` and the `ray.init(...)` call below if modin is not being used.
# TODO: wrap `import ray` and `import modin.pandas` in try/except.
import ray
import logging
import os
os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"

ray.init(log_to_driver=False, logging_level=logging.FATAL)

import modin.pandas as pd

from typing import List
from enum import Enum
from dataclasses import dataclass
from utilfn import *
import time


def query_split(df: pd.DataFrame) -> List[str]:
    """Interactively ask which columns of ``df`` to split on.

    Prints every column name with a 1-based index, reads one line from
    standard input and returns the chosen column names.

    The selection is a list of 1-based indices separated by any mix of
    commas, periods, semicolons, tabs or spaces.  Indices that match no
    column are ignored, and the result follows the column order of ``df``
    rather than the order in which the indices were typed.

    Entering ``quit`` or ``exit`` (any case, surrounding whitespace
    ignored) terminates the program.

    Args:
        df: The dataframe whose columns are offered for selection.

    Returns:
        The selected column names, in dataframe column order.

    Raises:
        SystemExit: If the user asks to quit.
    """
    # Only the column labels are needed; there is no reason to slice a row
    # out of the frame just to iterate over its labels.
    columns = list(df.columns)
    for idx, val in enumerate(columns, start=1):
        print(f"{idx}: {val}")

    # Strip first so that " quit " is recognised as a quit request too.
    sel = input("Select the fields to split on, delimited by commas: ").strip()
    if sel.lower() in ("quit", "exit"):
        print("Quitting...")
        # `quit()` is an interactive-shell helper injected by `site`;
        # raising SystemExit directly has the same effect for callers.
        raise SystemExit

    # Normalise every accepted delimiter to ';' in one pass, then tokenise.
    # Empty tokens (from doubled delimiters) never match an index.
    separators = str.maketrans({",": ";", ".": ";", "\t": ";", " ": ";"})
    chosen = set(sel.translate(separators).split(";"))

    ret: List[str] = [
        val for idx, val in enumerate(columns, start=1) if str(idx) in chosen
    ]
    print(f"Selections are: {ret}")
    return ret

def make_valid_fn(fn: str) -> str:
    """Return ``fn`` with ``:``, ``;``, ``/`` and ``\\`` each replaced by ``-``."""
    # Spaces are currently left untouched; they could be added to the table.
    replacements = str.maketrans({":": "-", ";": "-", "/": "-", "\\": "-"})
    return fn.translate(replacements)


def split(filename: str, df: pd.DataFrame, splits: List[str], 
        out_filetype: Filetype, 
        update_user: bool = True, 
        show_timings: bool = True):
    """Split ``df`` on the given columns and save one file per combination.

    The frame is partitioned on the first column in ``splits``; every
    resulting piece is then partitioned on the next column, and so on.
    Each final piece is passed to ``save_df`` under a name built from
    ``filename`` followed by `` - <value>`` for every split level.

    Only value combinations that actually occur in the data produce a
    piece, and rows whose split value is missing (NaN/None) are kept
    together in their own piece instead of being dropped.

    Args:
        filename: Base name of the output files (without extension).
        df: The input dataframe.
        splits: The columns to split on, applied in order.
        out_filetype: Either Excel or Csv format to save in.
        update_user: Whether to print progress while working.
        show_timings: Forwarded to ``PrintOptions`` together with
            ``update_user``; presumably toggles the timing text emitted by
            ``cond_print`` -- confirm against ``utilfn``.

    Raises:
        SystemExit: If ``splits`` is empty or ``filename`` is the empty
            string.
    """
    print_options = PrintOptions(update_user, show_timings)

    ### Checks ###
    # `quit()` is an interactive-shell helper; raise SystemExit directly.
    if not splits:
        print("No values selected to split on! Quitting...")
        raise SystemExit
    if filename == "":
        print("No filename provided! Quitting...")
        raise SystemExit

    # Each item is a (name, dataframe) pair; the children produced at one
    # split level become the parents of the next level.
    parents = [(make_valid_fn(filename), df)]

    ### Split the file into individual dataframes for each split ### 
    begin_calc_split_time = time.time()
    for outer_index, entry in enumerate(splits, start=1):
        children = []
        for inner_index, (name, parent) in enumerate(parents, start=1):
            if update_user:
                print_noendl(f"Calculating splits for ({outer_index} of {len(splits)}) {entry}: entry {inner_index} of {len(parents)} ")

            # Index with the label itself (not its string form) so that
            # non-string column labels work, and take the distinct values
            # from this parent only so that combinations which never occur
            # do not turn into empty output files.
            column = parent[entry]
            for val in column.unique().tolist():
                # `== NaN` is never true, so missing values need isna().
                mask = column.isna() if pd.isna(val) else column == val
                children.append((f'{name} - {val}', parent[mask]))

            if update_user: 
                reprint_noendl('')
        parents = children

    cond_print(print_options, "Calculating splits - done!", f"Took {delta_now(begin_calc_split_time)}s.", suffix = (' ' * 79) + '\n')

    ### Save all the dataframes ### 
    time_begin_all_saving = time.time()
    for i, (name, v) in enumerate(parents, start=1):
        # Values appended to the name may contain path separators etc.
        name = make_valid_fn(name)
        if update_user:
            # NOTE(review): the ".xlsx" suffix is printed regardless of
            # out_filetype -- confirm what save_df actually writes.
            print_noendl(f"({i}/{len(parents)}): Saving {name}.xlsx... ")

        time_begin_save = time.time()
        save_df(v, out_filetype, name)
        cond_print(print_options, "saved", f'in {delta_now(time_begin_save)}s', suffix=".\n")

    cond_print(print_options, "", f"Took {delta_now(time_begin_all_saving)}s to save all files.\n")


def main():
    """Run the interactive workflow: open a file, ask for options, split it.

    Asks the user for an input file, reports its size, then prompts for the
    output base name, the output format and the columns to split on before
    handing everything to ``split``.
    """
    reply = ask_open_file()
    df = reply.df
    # Count columns through the column index; `df.iloc[0]` raises
    # IndexError when the dataframe has no rows.
    print(f"Dataframe has {len(df)} rows and {len(df.columns)} columns.")

    out = input("Enter the name of the output file (w/o extension): ")

    ft = ask_excel_or_csv()

    splits = query_split(df)

    split(out, df, splits, ft)

# Run the interactive workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()