Update last_db.txt, ReadMe.MD, Splitter_Advanced.py, test.py files

0140bd77 · Taraneh Tavakoli · 0140bd77 · 0140bd77 · 0140bd77 · 0140bd77
Commit 0140bd77 authored 3 years ago by Taraneh Tavakoli
--- a/ReadMe.MD
+++ b/ReadMe.MD
+You need to open all python files (the query_history_log, loading data and utils are in the splitting Data project) All functions have been defined in the Splitter_Advanced folder and you need to run the test in order to observe the results.
\ No newline at end of file
--- a/Splitter_Advanced.py
+++ b/Splitter_Advanced.py
+import math
+import random
+import pandas
+import mysql.connector
+import sqlalchemy as db
+import pandas as pd
+import pymysql
+
+import mysql.connector
+
+
+def selector(df, column_name,threshold):
+    count = df[column_name].value_counts(normalize=False)
+    ids = count.index.tolist()
+    print ('######################')
+    print (count)
+
+    for id in ids:
+
+        new_ids = ids.copy()
+        new_ids.remove(id)
+
+        ref, temp = 0 , 0
+        ref_id =  id
+        for test in new_ids :
+            temp = count.loc[id] + count.loc[test]
+            if temp > ref and temp <= threshold:
+                ref = temp
+                ref_id = test
+        if id == ref_id:
+            temp = count.loc[id]
+            ids.remove(id)
+            print ( 'values with id {} have a length of {} , no suitable pair found'.format(id,temp))
+
+        else:
+            ids.remove(id)
+            ids.remove(ref_id)
+            print ('combination of values with ids {} and {} have a sum of {}, good pair'.format(id,ref_id,ref))
+
+        print("new ids = ",ids)
+
+
+def get_combinations(df,column_name, threshold: int ):
+    count = df[column_name].value_counts(normalize=False)
+    ids = count.index.tolist()
+
+    closest, ref, temp = 0, 0, 0
+
+    closest_ids = []
+
+    while closest + count.loc[ids[-1]] <= threshold:
+        ref = closest
+        ref_id, ref_id_2 = None, None
+        for id in ids:
+            new_ids = ids.copy()
+            new_ids.remove(id)
+
+            temp = count.loc[id] + closest
+
+            if temp > ref and temp <= threshold:
+                ref += count.loc[id]
+                ref_id = id
+
+        if ref_id == None:
+            ref = 0
+
+        else:
+            closest = ref
+            ids.remove(ref_id)
+            closest_ids.append(ref_id)
+            if len(ids) == 0:
+                print("No more values to add, exiting loop")
+                break
+
+
+    x = [count.loc[id] for id in closest_ids]
+    print("Closest combination to {} is given by the sum of {} which is {}".format(threshold, closest_ids, sum(x)))
+    selected = []
+    for column in closest_ids:
+        temp = df.loc[(df[column_name] == column)]
+
+        df = df.drop(index=temp.index.tolist())
+
+        if len(selected) == 0:
+
+            selected = temp
+
+        else:
+
+            selected = pd.concat([selected , temp],axis =  0,ignore_index = True)
+
+    # print (selected)
+
+    return selected , df
+
+
+
+
+
+def create_db(db_name,host =  'localhost' , user = "root" , password = ""):
+    mydb = mysql.connector.connect(
+      host=host,
+      user=user,
+      password=password
+    )
+
+    mycursor = mydb.cursor()
+
+    mycursor.execute("CREATE DATABASE {}".format(db_name))
+    print ("Successfully created Database {}".format(db_name))
+    mycursor.close()
+
+def to_sql( df , table_name: str,con: str):
+    engine = db.create_engine(con)
+    df.to_sql(name = table_name, con=engine, if_exists = "append",chunksize= 10,index = False)
+
+    print ('DB updated successfully')
+def get_db_name(filename):
+    with open(filename) as f:
+        lines = f.read().replace('\n', '')
+    return lines
+
+def save_lastdb(filename,db_name):
+    with open(filename, 'r+') as f:
+
+        f.seek(0)
+        f.write(db_name)
+        f.truncate()
+    print("Saved last database name successfully")
+
+def calculate_size(con,table):
+
+    try:
+        engine = db.create_engine(con)
+
+        dbConnection    = engine.connect()
+
+        frame  = pd.read_sql("select * from " +table, dbConnection);
+
+        print(frame.head())
+
+        print("shape: ",frame.shape)
+        return frame.shape[0]
+
+    except db.exc.ProgrammingError:
+        return 0
+
+def database(dataframe,threshold,column_name,host =  'localhost' , user_ = "root" , password = ""):
+    table_name = "mytable"
+
+    db_name = get_db_name ("last_db.txt")
+
+    print (db_name)
+    con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
+    if db_name is None or db_name == '':
+        db_name = "DB_0"
+        create_db(db_name=db_name)
+        print ("{} non existent , Successfully created")
+        last_db = 0
+        con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
+
+    else:
+        last_db = calculate_size(con=con, table=table_name)
+
+
+
+
+    counter =int(db_name.split("_")[1])
+
+
+
+
+    print ("{} has {} values already existing can update with {} values".format(db_name,last_db,threshold-last_db))
+    if len(dataframe) <= threshold - last_db :
+
+        if len(dataframe) <=  threshold - last_db:
+            print ('{} has {} values, new data doesnt exceed threshold, updating {}'.format(db_name,last_db,db_name))
+            to_sql(df = dataframe,table_name= table_name,con=con)
+            last_db +=  len(dataframe)
+    else:
+
+        print('threshold exceeded, splitting data')
+
+        df = dataframe.copy()
+        last_db = calculate_size(con=con, table=table_name)
+        if df[column_name].value_counts(normalize=False)[-1] >  threshold - last_db:
+            counter += 1
+            db_name = "DB_{}".format(counter)
+            print("{} created ".format(db_name))
+            create_db(db_name=db_name)
+            last_db = 0
+        while len(df) > 0:
+            con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
+            thresh = min(threshold , threshold - last_db)
+            selected, df = get_combinations(df, 'user_id', thresh)
+
+            to_sql(df=selected, table_name=table_name, con=con)
+
+            if len (df) > 0 :
+                counter += 1
+                db_name = "DB_{}".format(counter)
+                print("{} created ".format(db_name))
+                create_db(db_name=db_name)
+                last_db = 0
+    # db_name = "db_0"
+    # con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
+    # new = pd.read_sql('SELECT * from {}'.format(table_name),con)
+    # print (new.head)
+    save_lastdb(filename='last_db.txt', db_name = db_name)
+    print ("ALL DBs SUCCESSFULLY UPDATED LAST DB NAME SAVED")
--- a/last_db.txt
+++ b/last_db.txt
+
--- a/test.py
+++ b/test.py
+import pandas as pd
+from utils import generate_windows_dataset
+import Splitter_Advanced
+windows = generate_windows_dataset()
+
+df=pd.DataFrame(windows)
+Splitter_Advanced.selector(df,'user_id', 1000)
+Splitter_Advanced.database(df,1000,'user_id')