"""Helpers for filtering, splitting and resampling the class column of a DataFrame."""
import statistics

import numpy as np
import pandas as pd


def create_dict(df, classColumnName):
    """Count how many rows carry each class label.

    Args:
        df: DataFrame containing the class column.
        classColumnName: Label of the column holding the class values.

    Returns:
        A dict mapping each distinct class value to its number of rows.
        Missing values are not counted (``value_counts`` skips them).
    """
    return df[classColumnName].value_counts().to_dict()


def remove_weak_classes(df, classColumnName, threshold):
    """Drop the rows whose class occurs fewer than ``threshold`` times.

    Args:
        df: DataFrame containing the class column.
        classColumnName: Label of the column holding the class values.
        threshold: Minimum number of rows a class needs in order to be kept.

    Returns:
        A new DataFrame with only the rows of sufficiently frequent classes.
        Row order and index labels of ``df`` are preserved, and ``df`` itself
        is left untouched.
    """
    class_counts = create_dict(df, classColumnName)
    strong_classes = [
        label for label, count in class_counts.items() if count >= threshold
    ]
    # Select the kept rows directly. Removing the weak rows through
    # concat + drop_duplicates(keep=False) would also delete legitimate
    # duplicate rows that belong to strong classes.
    return df[df[classColumnName].isin(strong_classes)].copy()


def _split_categories(value):
    """Return the non-empty ``;``-separated parts of ``value``."""
    return [part for part in value.split(';') if part]


def split_class(df, columnProcessed):
    """Expand rows holding several ``;``-separated classes into one row each.

    Args:
        df: DataFrame to expand.
        columnProcessed: Label of the column whose string cells hold one or
            more class names separated by ``;``.

    Returns:
        A new DataFrame, indexed 0..n-1, with one row per (original row,
        class) pair. Empty parts are ignored, so a row whose cell contains no
        non-empty class (e.g. ``""`` or ``";"``) produces no output row.

    Raises:
        AttributeError: If a cell of ``columnProcessed`` is not a string
            (for example a missing value).
    """
    categories = df[columnProcessed].map(_split_categories)
    expanded = df.copy()
    expanded[columnProcessed] = categories
    # Rows without any category must vanish; explode() would otherwise turn
    # their empty list into a NaN row.
    expanded = expanded[categories.map(len).astype(bool)]
    return expanded.explode(columnProcessed).reset_index(drop=True)


def get_median_dict(dict):
    """Return the median of the values of a mapping.

    Args:
        dict: Mapping whose values are numeric (e.g. the result of
            ``create_dict``). The parameter name shadows the builtin but is
            kept so that existing keyword callers keep working.

    Returns:
        The median of ``dict.values()``.

    Raises:
        statistics.StatisticsError: If the mapping is empty.
    """
    return statistics.median(dict.values())


def resample_classes(df, classColumnName, numberOfInstances):
    """Randomly keep at most ``numberOfInstances`` rows of every class.

    Rows are drawn without replacement using NumPy's global random state, so
    ``np.random.seed`` controls the outcome. Classes that have no more than
    ``numberOfInstances`` rows keep all of them (in shuffled order).

    Args:
        df: DataFrame containing the class column.
        classColumnName: Label of the column holding the class values.
        numberOfInstances: Maximum number of rows to keep per class.

    Returns:
        A DataFrame with a two-level index: the position of the class among
        the sorted classes, then the original row label. For an empty input
        an empty frame with the columns of ``df`` is returned.
    """
    sampled = []
    for _, group in df.groupby(classColumnName):
        size = min(len(group), numberOfInstances)
        # Sample by position rather than by label: label-based selection
        # returns extra rows whenever the index contains duplicate labels.
        positions = np.random.choice(len(group), size, replace=False)
        sampled.append(group.iloc[positions])
    if not sampled:
        return df.iloc[:0].copy()
    return pd.concat(sampled, keys=list(range(len(sampled))))