Skip to content
Snippets Groups Projects
Forked from an inaccessible project.
Splitter_Advanced.py 5.87 KiB
import math
import random
import pandas
import mysql.connector
import sqlalchemy as db
import pandas as pd
import pymysql

import mysql.connector


def selector(df, column_name,threshold):
    count = df[column_name].value_counts(normalize=False)
    ids = count.index.tolist()
    print ('######################')
    print (count)

    for id in ids:

        new_ids = ids.copy()
        new_ids.remove(id)

        ref, temp = 0 , 0
        ref_id =  id
        for test in new_ids :
            temp = count.loc[id] + count.loc[test]
            if temp > ref and temp <= threshold:
                ref = temp
                ref_id = test
        if id == ref_id:
            temp = count.loc[id]
            ids.remove(id)
            print ( 'values with id {} have a length of {} , no suitable pair found'.format(id,temp))

        else:
            ids.remove(id)
            ids.remove(ref_id)
            print ('combination of values with ids {} and {} have a sum of {}, good pair'.format(id,ref_id,ref))

        print("new ids = ",ids)


def get_combinations(df,column_name, threshold: int ):
    count = df[column_name].value_counts(normalize=False)
    ids = count.index.tolist()

    closest, ref, temp = 0, 0, 0

    closest_ids = []

    while closest + count.loc[ids[-1]] <= threshold:
        ref = closest
        ref_id, ref_id_2 = None, None
        for id in ids:
            new_ids = ids.copy()
            new_ids.remove(id)

            temp = count.loc[id] + closest

            if temp > ref and temp <= threshold:
                ref += count.loc[id]
                ref_id = id

        if ref_id == None:
            ref = 0

        else:
            closest = ref
            ids.remove(ref_id)
            closest_ids.append(ref_id)
            if len(ids) == 0:
                print("No more values to add, exiting loop")
                break


    x = [count.loc[id] for id in closest_ids]
    print("Closest combination to {} is given by the sum of {} which is {}".format(threshold, closest_ids, sum(x)))
    selected = []
    for column in closest_ids:
        temp = df.loc[(df[column_name] == column)]

        df = df.drop(index=temp.index.tolist())

        if len(selected) == 0:

            selected = temp

        else:

            selected = pd.concat([selected , temp],axis =  0,ignore_index = True)

    # print (selected)

    return selected , df





def create_db(db_name,host =  'localhost' , user = "root" , password = ""):
    mydb = mysql.connector.connect(
      host=host,
      user=user,
      password=password
    )

    mycursor = mydb.cursor()

    mycursor.execute("CREATE DATABASE {}".format(db_name))
    print ("Successfully created Database {}".format(db_name))
    mycursor.close()

def to_sql( df , table_name: str,con: str):
    engine = db.create_engine(con)
    df.to_sql(name = table_name, con=engine, if_exists = "append",chunksize= 10,index = False)

    print ('DB updated successfully')
def get_db_name(filename):
    with open(filename) as f:
        lines = f.read().replace('\n', '')
    return lines

def save_lastdb(filename,db_name):
    with open(filename, 'r+') as f:

        f.seek(0)
        f.write(db_name)
        f.truncate()
    print("Saved last database name successfully")

def calculate_size(con,table):

    try:
        engine = db.create_engine(con)

        dbConnection    = engine.connect()

        frame  = pd.read_sql("select * from " +table, dbConnection);

        print(frame.head())

        print("shape: ",frame.shape)
        return frame.shape[0]

    except db.exc.ProgrammingError:
        return 0

def database(dataframe,threshold,column_name,host =  'localhost' , user_ = "root" , password = ""):
    table_name = "mytable"

    db_name = get_db_name ("last_db.txt")

    print (db_name)
    con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
    if db_name is None or db_name == '':
        db_name = "DB_0"
        create_db(db_name=db_name)
        print ("{} non existent , Successfully created")
        last_db = 0
        con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name

    else:
        last_db = calculate_size(con=con, table=table_name)




    counter =int(db_name.split("_")[1])




    print ("{} has {} values already existing can update with {} values".format(db_name,last_db,threshold-last_db))
    if len(dataframe) <= threshold - last_db :

        if len(dataframe) <=  threshold - last_db:
            print ('{} has {} values, new data doesnt exceed threshold, updating {}'.format(db_name,last_db,db_name))
            to_sql(df = dataframe,table_name= table_name,con=con)
            last_db +=  len(dataframe)
    else:

        print('threshold exceeded, splitting data')

        df = dataframe.copy()
        last_db = calculate_size(con=con, table=table_name)
        if df[column_name].value_counts(normalize=False)[-1] >  threshold - last_db:
            counter += 1
            db_name = "DB_{}".format(counter)
            print("{} created ".format(db_name))
            create_db(db_name=db_name)
            last_db = 0
        while len(df) > 0:
            con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
            thresh = min(threshold , threshold - last_db)
            selected, df = get_combinations(df, 'user_id', thresh)

            to_sql(df=selected, table_name=table_name, con=con)

            if len (df) > 0 :
                counter += 1
                db_name = "DB_{}".format(counter)
                print("{} created ".format(db_name))
                create_db(db_name=db_name)
                last_db = 0
    # db_name = "db_0"
    # con = "mysql+pymysql://" + user_ + ":" + password + "@" + host + "/" + db_name
    # new = pd.read_sql('SELECT * from {}'.format(table_name),con)
    # print (new.head)
    save_lastdb(filename='last_db.txt', db_name = db_name)
    print ("ALL DBs SUCCESSFULLY UPDATED LAST DB NAME SAVED")