"""Miscellaneous helpers: Geonames loading, accuracy plots, embedding export
and a small task chronometer."""
import os
import time

import pandas as pd

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; the other helpers do not need it
    plt = None


def read_geonames(file):
    """
    Return a dataframe that contains Geonames data.

    The file is read as a header-less, tab-separated table without any
    quoting. Only the empty string is treated as a missing value, so
    literal values such as "NA" are kept as text. Malformed lines are
    skipped with a warning.

    Parameters
    ----------
    file : str
        path of the Geonames Csv file

    Returns
    -------
    pd.DataFrame
        geonames data, one column per Geonames field
    """
    dtypes_dict = {
        0: int,    # geonameid
        1: str,    # name
        2: str,    # asciiname
        3: str,    # alternatenames
        4: float,  # latitude
        5: float,  # longitude
        6: str,    # feature class
        7: str,    # feature code
        8: str,    # country code
        9: str,    # cc2
        10: str,   # admin1 code
        11: str,   # admin2 code
        12: str,   # admin3 code
        13: str,   # admin4 code
        14: int,   # population
        15: str,   # elevation
        16: int,   # dem (digital elevation model)
        17: str,   # timezone
        18: str,   # modification date yyyy-MM-dd
    }
    rename_cols = {
        0: "geonameid",
        1: "name",
        2: "asciiname",
        3: "alternatenames",
        4: "latitude",
        5: "longitude",
        6: "feature_class",
        7: "feature_code",
        8: "country_code",
        9: "cc2",
        10: "admin1_code",
        11: "admin2_code",
        12: "admin3_code",
        13: "admin4_code",
        14: "population",
        15: "elevation",
        16: "dem",
        17: "timezone",
        18: "modification_date",
    }
    read_options = dict(
        sep="\t",
        header=None,
        quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary text
        dtype=dtypes_dict,
        na_values="",
        keep_default_na=False,
    )
    try:
        # pandas >= 1.3; "warn" skips the bad line and reports it, which is
        # what error_bad_lines=False did with its default warn_bad_lines.
        data = pd.read_csv(file, on_bad_lines="warn", **read_options)
    except TypeError:
        # Older pandas does not know on_bad_lines.
        data = pd.read_csv(file, error_bad_lines=False, **read_options)
    data.rename(columns=rename_cols, inplace=True)
    return data


def plot_accuracy_from_history(model_name, history_data, output_layer_name,
                               outpu_filename, parameter_string,
                               output_dirname="outputs", validation=True,
                               show=False):
    """
    Plot the accuracy curve(s) of one output layer and save them as a PNG.

    The image is written to
    ``<output_dirname>/<model_name>_<parameter_string>_<output_layer_name>.png``.
    The current figure is cleared first so that successive calls do not
    draw on top of each other.

    Parameters
    ----------
    model_name : str
        name of the model, used in the image filename
    history_data : pd.DataFrame
        training history; must contain the column
        ``<output_layer_name>_accuracy`` and, when `validation` is true,
        ``val_<output_layer_name>_accuracy``
    output_layer_name : str
        name of the output layer whose accuracy is plotted
    outpu_filename : str
        unused; kept so that existing callers keep working
    parameter_string : str
        description of the parameters, used in the image filename
    output_dirname : str, optional
        directory the image is saved in (created if missing)
    validation : bool, optional
        also plot the validation accuracy
    show : bool, optional
        display the figure after saving it

    Raises
    ------
    ImportError
        if matplotlib is not installed
    """
    if plt is None:
        raise ImportError("matplotlib is required to plot accuracy curves")

    # Start from an empty figure: without this, curves of previous calls
    # would still be present in the saved image.
    plt.clf()
    plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values,
             label="Train Data")
    if validation:
        plt.plot(
            history_data['val_{0}_accuracy'.format(output_layer_name)].values,
            label="Test Data")
    plt.title('Layer {0} accuracy'.format(output_layer_name))
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.ylim((0, 1.1))  # 1.1 keeps a curve at accuracy = 1 visible
    plt.legend()

    os.makedirs(output_dirname, exist_ok=True)
    image_name = "{0}_{1}_{2}.png".format(
        model_name, parameter_string, output_layer_name)
    plt.savefig(os.path.join(output_dirname, image_name))
    if show:
        plt.show()


def save_embedding(model, tokenizer, layer_idx, fn):
    """
    Write the embedding vector of every word of the tokenizer to a text file.

    Each line holds a word followed by the components of its vector,
    separated by single spaces. The embedding matrix is the first array
    returned by ``model.get_weights()``; row ``i`` is the vector of the word
    whose index in ``tokenizer.word_index`` is ``i``.

    Parameters
    ----------
    model : object
        model exposing ``get_weights()``
    tokenizer : object
        tokenizer exposing a ``word_index`` mapping (word -> row index)
    layer_idx : int
        unused: the first weight array of the model is always exported
        (NOTE(review): confirm whether a per-layer lookup was intended)
    fn : str
        path of the output file
    """
    embedding_matrix = model.get_weights()[0]
    # Explicit encoding: words may contain non-ASCII characters.
    with open(fn, 'w', encoding='utf-8') as f:
        for word, row_idx in tokenizer.word_index.items():
            # str() rather than repr(): NumPy >= 2 scalars repr as
            # "np.float32(...)", which would corrupt the file.
            components = [str(value) for value in embedding_matrix[row_idx]]
            f.write(' '.join([word] + components))
            f.write('\n')


class Chronometer():
    """Measure the duration of named tasks.

    Several tasks can run at the same time, each identified by its name.
    """

    def __init__(self):
        # task name -> clock reading taken when the task was started
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError(
                "A running task exists with the name {0}!".format(task_name))
        # perf_counter is monotonic, so the measured duration is not
        # affected by system clock adjustments.
        self.__task_begin_timestamp[task_name] = time.perf_counter()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        The task is forgotten afterwards, so its name can be reused.

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exist with the id `task_name`
        """
        stopped_at = time.perf_counter()
        try:
            started_at = self.__task_begin_timestamp.pop(task_name)
        except KeyError:
            raise ValueError(
                "The {0} task does not exist!".format(task_name)) from None
        return stopped_at - started_at


if __name__ == "__main__":
    # Small demo: two overlapping tasks, expected output ~3 s and ~6 s.
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))
    time.sleep(3)
    print(chrono.stop("test2"))