Skip to content
Snippets Groups Projects
Commit 0140bd77 authored by Taraneh Tavakoli's avatar Taraneh Tavakoli
Browse files

Update last_db.txt, ReadMe.MD, Splitter_Advanced.py, test.py files

parents
No related branches found
No related tags found
No related merge requests found
You need to open all python files (the query_history_log, loading data and utils are in the splitting Data project) All functions have been defined in the Splitter_Advanced folder and you need to run the test in order to observe the results.
\ No newline at end of file
import math
import random
import pandas
import mysql.connector
import sqlalchemy as db
import pandas as pd
import pymysql
import mysql.connector
def selector(df, column_name,threshold):
count = df[column_name].value_counts(normalize=False)
ids = count.index.tolist()
print ('######################')
print (count)
for id in ids:
new_ids = ids.copy()
new_ids.remove(id)
ref, temp = 0 , 0
ref_id = id
for test in new_ids :
temp = count.loc[id] + count.loc[test]
if temp > ref and temp <= threshold:
ref = temp
ref_id = test
if id == ref_id:
temp = count.loc[id]
ids.remove(id)
print ( 'values with id {} have a length of {} , no suitable pair found'.format(id,temp))
else:
ids.remove(id)
ids.remove(ref_id)
print ('combination of values with ids {} and {} have a sum of {}, good pair'.format(id,ref_id,ref))
print("new ids = ",ids)
def get_combinations(df,column_name, threshold: int ):
count = df[column_name].value_counts(normalize=False)
ids = count.index.tolist()
closest, ref, temp = 0, 0, 0
closest_ids = []
while closest + count.loc[ids[-1]] <= threshold:
ref = closest
ref_id, ref_id_2 = None, None
for id in ids:
new_ids = ids.copy()
new_ids.remove(id)
temp = count.loc[id] + closest
if temp > ref and temp <= threshold:
ref += count.loc[id]
ref_id = id
if ref_id == None:
ref = 0
else:
closest = ref
ids.remove(ref_id)
closest_ids.append(ref_id)
if len(ids) == 0:
print("No more values to add, exiting loop")
break
x = [count.loc[id] for id in closest_ids]
print("Closest combination to {} is given by the sum of {} which is {}".format(threshold, closest_ids, sum(x)))
selected = []
for column in closest_ids:
temp = df.loc[(df[column_name] == column)]
df = df.drop(index=temp.index.tolist())
if len(selected) == 0:
selected = temp
else:
selected = pd.concat([selected , temp],axis = 0,ignore_index = True)
# print (selected)
return selected , df
def create_db(db_name,host = 'localhost' , user = "root" , password = ""):
mydb = mysql.connector.connect(
host=host,
user=user,
password=password
)
mycursor = mydb.cursor()
mycursor.execute("CREATE DATABASE {}".format(db_name))
print ("Successfully created Database {}".format(db_name))
mycursor.close()
def to_sql( df , table_name: str,con: str):
engine = db.create_engine(con)
df.to_sql(name = table_name, con=engine, if_exists = "append",chunksize= 10,index = False)
print ('DB updated successfully')
def get_db_name(filename):
with open(filename) as f:
lines = f.read().replace('\n', '')
return lines
def save_lastdb(filename,db_name):
with open(filename, 'r+') as f:
f.seek(0)
f.write(db_name)
f.truncate()
print("Saved last database name successfully")
def calculate_size(con,table):
try:
engine = db.create_engine(con)
dbConnection = engine.connect()
frame = pd.read_sql("select * from " +table, dbConnection);
print(frame.head())
print("shape: ",frame.shape)
return frame.shape[0]
except db.exc.ProgrammingError:
return 0
def database(dataframe,threshold,column_name,host = 'localhost' , user_ = "root" , password = ""):
table_name = "mytable"
db_name = get_db_name ("last_db.txt")
print (db_name)
con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
if db_name is None or db_name == '':
db_name = "DB_0"
create_db(db_name=db_name)
print ("{} non existent , Successfully created")
last_db = 0
con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
else:
last_db = calculate_size(con=con, table=table_name)
counter =int(db_name.split("_")[1])
print ("{} has {} values already existing can update with {} values".format(db_name,last_db,threshold-last_db))
if len(dataframe) <= threshold - last_db :
if len(dataframe) <= threshold - last_db:
print ('{} has {} values, new data doesnt exceed threshold, updating {}'.format(db_name,last_db,db_name))
to_sql(df = dataframe,table_name= table_name,con=con)
last_db += len(dataframe)
else:
print('threshold exceeded, splitting data')
df = dataframe.copy()
last_db = calculate_size(con=con, table=table_name)
if df[column_name].value_counts(normalize=False)[-1] > threshold - last_db:
counter += 1
db_name = "DB_{}".format(counter)
print("{} created ".format(db_name))
create_db(db_name=db_name)
last_db = 0
while len(df) > 0:
con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
thresh = min(threshold , threshold - last_db)
selected, df = get_combinations(df, 'user_id', thresh)
to_sql(df=selected, table_name=table_name, con=con)
if len (df) > 0 :
counter += 1
db_name = "DB_{}".format(counter)
print("{} created ".format(db_name))
create_db(db_name=db_name)
last_db = 0
# db_name = "db_0"
# con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
# new = pd.read_sql('SELECT * from {}'.format(table_name),con)
# print (new.head)
save_lastdb(filename='last_db.txt', db_name = db_name)
print ("ALL DBs SUCCESSFULLY UPDATED LAST DB NAME SAVED")
import pandas as pd
from utils import generate_windows_dataset
import Splitter_Advanced
windows = generate_windows_dataset()
df=pd.DataFrame(windows)
Splitter_Advanced.selector(df,'user_id', 1000)
Splitter_Advanced.database(df,1000,'user_id')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment