Forgot to add helpers.py

c92b911a · Jacques Fize · 34f50041 · c92b911a
Commit c92b911a authored 5 years ago by Jacques Fize
--- a/helpers.py
+++ b/helpers.py
+import os
+import time
+
+import pandas as pd
+
+import matplotlib.pyplot as plt
+
def read_geonames(file):
    """
    Return a dataframe that contains Geonames data.

    The file is parsed as a headerless, tab-separated table with quoting
    disabled. Only empty fields are treated as missing values, so literal
    strings such as the country code "NA" are kept as-is. Lines with too
    many fields are skipped (with a warning) instead of aborting the load.

    Parameters
    ----------
    file : str
        path of the Geonames Csv file

    Returns
    -------
    pd.DataFrame
        geonames data, with one named column per Geonames field
    """
    # Local imports: only this function needs them.
    import csv
    import inspect

    # (column name, dtype) pairs, in the order the fields appear in the file.
    columns = [
        ("geonameid", int),
        ("name", str),
        ("asciiname", str),
        ("alternatenames", str),
        ("latitude", float),
        ("longitude", float),
        ("feature_class", str),
        ("feature_code", str),
        ("country_code", str),
        ("cc2", str),
        ("admin1_code", str),
        ("admin2_code", str),
        ("admin3_code", str),
        ("admin4_code", str),
        ("population", int),
        ("elevation", str),
        ("dem", int),  # digital elevation model
        ("timezone", str),
        ("modification_date", str),  # yyyy-MM-dd
    ]
    dtypes_dict = {idx: dtype for idx, (_, dtype) in enumerate(columns)}
    rename_cols = {idx: name for idx, (name, _) in enumerate(columns)}

    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="warn"` is its equivalent (skip the line, emit a warning).
    # Fall back to the legacy flag only on pandas versions that predate it.
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        bad_lines_kwargs = {"on_bad_lines": "warn"}
    else:
        bad_lines_kwargs = {"error_bad_lines": False}

    data = pd.read_csv(
        file,
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,  # quote characters are ordinary data here
        dtype=dtypes_dict,
        na_values="",
        keep_default_na=False,  # keep strings like "NA" (Namibia) intact
        **bad_lines_kwargs
    )
    data.rename(columns=rename_cols, inplace=True)
    return data
+
def plot_accuracy_from_history(model_name, history_data, output_layer_name, outpu_filename, parameter_string, output_dirname="outputs", validation=True, show=False):
    """
    Plot the per-epoch accuracy of one output layer and save it as a PNG.

    The image is written to
    ``<output_dirname>/<model_name>_<parameter_string>_<output_layer_name>.png``.

    Parameters
    ----------
    model_name : str
        name of the model, used as prefix of the image filename
    history_data : mapping
        training history indexed by metric name; each entry must expose
        ``.values`` (e.g. a pandas DataFrame). It must contain
        ``<output_layer_name>_accuracy`` and, if `validation` is True,
        ``val_<output_layer_name>_accuracy``
    output_layer_name : str
        name of the output layer whose accuracy is plotted
    outpu_filename : str
        unused; kept only so existing callers keep working
    parameter_string : str
        description of the run parameters, inserted in the image filename
    output_dirname : str, optional
        directory where the image is saved (created if missing),
        by default "outputs"
    validation : bool, optional
        also plot the validation accuracy, by default True
    show : bool, optional
        display the figure after saving it, by default False
    """
    train_key = '{0}_accuracy'.format(output_layer_name)
    # Resolve the series first so a missing metric does not leak a figure.
    train_accuracy = history_data[train_key].values
    validation_accuracy = None
    if validation:
        validation_accuracy = history_data['val_{0}_accuracy'.format(output_layer_name)].values

    # Draw on a dedicated figure: reusing pyplot's implicit current figure
    # made curves from earlier calls pile up in later images.
    fig, ax = plt.subplots()
    ax.plot(train_accuracy, label="Train Data")
    if validation_accuracy is not None:
        ax.plot(validation_accuracy, label="Test Data")
    ax.set_title('Layer {0} accuracy'.format(output_layer_name))
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Epoch')
    ax.set_ylim((0, 1.1))  # 1.1 keeps a curve at accuracy = 1 visible
    ax.legend()

    if output_dirname:
        os.makedirs(output_dirname, exist_ok=True)
    image_name = "{0}_{1}_{2}.png".format(model_name, parameter_string, output_layer_name)
    fig.savefig(os.path.join(output_dirname, image_name))
    if show:
        plt.show()
    plt.close(fig)
+
+
def save_embedding(model, tokenizer, layer_idx, fn):
    """
    Write the embedding vector of every tokenizer word to a text file.

    Each output line has the form ``<word> <v1> <v2> ... <vn>``, with the
    values separated by single spaces.

    Parameters
    ----------
    model : object
        model exposing ``get_weights()``; the first returned array is used
        as the embedding matrix
    tokenizer : object
        tokenizer exposing ``word_index``, a mapping from word to the row
        index of its vector in the embedding matrix
    layer_idx : int
        unused; kept only so existing callers keep working
        NOTE(review): presumably meant to select the embedding layer --
        confirm before wiring it in.
    fn : str
        path of the output file
    """
    embedding_matrix = model.get_weights()[0]
    # Explicit UTF-8: vocabulary words may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with open(fn, 'w', encoding='utf-8') as f:
        for word, row_idx in tokenizer.word_index.items():
            # str() instead of repr(): with NumPy >= 2, repr() of a scalar
            # yields e.g. "np.float32(0.5)", which would corrupt the file.
            values = [str(value) for value in embedding_matrix[row_idx]]
            f.write(' '.join([word] + values) + '\n')
+
+
+
class Chronometer:
    """
    Measure the duration of named tasks.

    Several tasks can run at the same time; each one is identified by the
    name given to `start` and is forgotten once `stop` has returned its
    duration, so the same name can then be reused.
    """

    def __init__(self):
        # task name -> perf_counter() reading taken when the task started
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        # perf_counter() is monotonic: unlike time.time(), it cannot jump
        # when the system clock is adjusted while a task is running.
        self.__task_begin_timestamp[task_name] = time.perf_counter()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exist with the id `task_name`
        """
        end = time.perf_counter()
        try:
            # pop() both fetches the start reading and forgets the task.
            begin = self.__task_begin_timestamp.pop(task_name)
        except KeyError:
            raise ValueError("The {0} task does not exist!".format(task_name)) from None
        return end - begin
+
if __name__ == "__main__":
    # Manual smoke test: run two overlapping tasks and print how long each
    # one lasted.
    demo_tasks = ("test", "test2")
    timer = Chronometer()
    for task in demo_tasks:
        timer.start(task)
    for task in demo_tasks:
        time.sleep(3)
        print(timer.stop(task))
\ No newline at end of file