Skip to content
Snippets Groups Projects
Commit e3962fca authored by Jacques Fize's avatar Jacques Fize
Browse files

Update Readme + Add script for running multiple training sessions with different parameters

parent 33651e79
No related branches found
No related tags found
No related merge requests found
......@@ -56,6 +56,27 @@ To train the network with default parameter use the following command :
python3 combination_embeddings.py -i <geoname data filename> <hierarchy geonames data filename>
### Train the network with different parameters
We built a small module that lets you run the network training with different parameter values. To do so, use the `GridSearchModel` class in `lib.run`. You can find
an example in the following code:
```python
from lib.run import GridSearchModel
from collections import OrderedDict
# Each keyword maps a parameter name to the list of values to try for it;
# one command line is generated per task of the grid.
grid = GridSearchModel(\
"python3 combination_embeddings.py",
**OrderedDict({ # We use an OrderedDict since the order of parameters is important
"rel":["-i","-a","-c"], # no dash in the key: only the value (here a flag) is written on the command line
"-n":[4], # option-style key: written as "-n 4"
"geoname_fn":"../data/geonamesData/US_FR.txt".split(), # .split() wraps the single path in a list
"hierarchy_fn":"../data/geonamesData/hierarchy.txt".split(),
"store_true":["rel"] # NOTE(review): GridSearchModel as shown in this commit treats this like any other parameter -- confirm the intended effect
}.items()))
grid.run() # executes the generated commands one after the other
```
### Available parameters
......@@ -67,7 +88,7 @@ To train the network with default parameter use the following command :
| --wikipedia-cooc-fn | File that contains the co-occurrence data |
| --cooc-sample-size- | Number of co-occurrence relations selected for each location in the co-occurrence data |
| --adjacency-iteration | Number of iterations in the adjacency extraction process |
| -n,--ngram-size | ngram size |
| -n,--ngram-size | ngram size x |
| -t,--tolerance-value | K-value in the computation of the accuracy@k |
| -e,--epochs | number of epochs |
| -d,--dimension | size of the ngram embeddings |
......
......@@ -31,6 +31,8 @@ from tqdm import tqdm
import logging
from helpers import parse_title_wiki
logging.getLogger('gensim').setLevel(logging.WARNING)
def get_new_ids(cooc_data,id_first_value):
"""
Return new ids from cooccurrence data
......
import itertools
import subprocess
import time

import numpy as np
class Chronometer:
    """
    Measure the execution time of named blocks of code.

    Several tasks can be timed concurrently as long as they use different
    names. Durations are computed with a monotonic clock, so they are not
    affected by adjustments of the system clock.

    >>> import time
    >>> chrono = Chronometer()
    >>> chrono.start("task1")
    >>> time.sleep(1)
    >>> duration = chrono.stop("task1")
    >>> print(duration) # Should display a value close to 1.0
    """

    def __init__(self):
        # Maps the name of each running task to its start timestamp.
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError(
                "A running task exists with the name {0}!".format(task_name)
            )
        self.__task_begin_timestamp[task_name] = time.monotonic()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Once stopped, the task is forgotten: the same name can be started
        again afterwards.

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exist with the id `task_name`
        """
        end_timestamp = time.monotonic()
        try:
            # pop() both reads and forgets the start timestamp in one lookup.
            begin_timestamp = self.__task_begin_timestamp.pop(task_name)
        except KeyError:
            raise ValueError(
                "The {0} task does not exist!".format(task_name)
            ) from None
        return end_timestamp - begin_timestamp
class Run(object):
    """
    Define a task to execute. A task here is associated to a command line. A task is defined by two entities :
     * base_command : runnable
     * kwargs : parameters associated to the command

    Parameters formatting follows the `argparse` format:
     * "-i" or "--input" : optional parameter
     * "input" : required parameter

    >>> task1 = Run("task1","echo",text="hello word")
    >>> task1.run()

    With optional parameter, we have to use a trick ;)
    >>> task1 = Run("task1","echo",**{"text":"hello word","-e":"args"})
    >>> task1.run()

    To save the output, indicate an output filename when the task is run :
    >>> task1.run("output_file.txt")
    """

    def __init__(self, task_name, base_command, **kwargs):
        """
        Constructor

        Parameters
        ----------
        task_name : str
            name of the task, used to time it and in the final report
        base_command : str
            command base
        **kwargs : dict
            parameters
        """
        self.chrono = Chronometer()
        self.task_name = task_name
        self.base_command = base_command
        self.run_args = kwargs

    def get_command(self):
        """
        Return the shell command build on the task attributes (basic command and parameters)

        Parameters are appended in insertion order. A key starting with a
        dash is written as "<key> <value>"; for any other key only the value
        is written.

        Returns
        -------
        str
            command
        """
        parts = [self.base_command]
        for key, value in self.run_args.items():
            # Only a leading dash marks an option: a positional name such as
            # "geoname-fn" must not be emitted on the command line.
            if key.startswith("-"):
                parts.append("{0} {1}".format(key, value))
            else:
                parts.append("{0}".format(value))
        return " ".join(parts)

    def add_parameter(self, key, value):
        """
        Add a parameter to the task (or replace its value if the key exists)

        Parameters
        ----------
        key : str
            key
        value : object
            value
        """
        self.run_args[key] = value

    def run(self, log_filename=None):
        """
        Run the task and print how long it took

        The command is split on whitespace and executed without a shell.

        Parameters
        ----------
        log_filename : str, optional
            log filename, by default None. When given, the file is opened in
            write mode (previous content is lost) and receives the standard
            output of the command; otherwise the standard output is discarded.
        """
        command = self.get_command().split()
        self.chrono.start(self.task_name)
        try:
            if log_filename:
                # The context manager guarantees the log file is closed.
                with open(log_filename, 'w') as log_file:
                    subprocess.run(command, stdout=log_file)
            else:
                # We don't care of the output (if so, we use the log_filename argument)
                subprocess.run(command, stdout=subprocess.DEVNULL)
        finally:
            # Always stop the chronometer, so a failed launch does not leave
            # the task marked as running and block a later retry.
            duration = self.chrono.stop(self.task_name)
        print("RUN {0} finished in {1}seconds OR {2}minutes OR {3}hours".format(
            self.task_name, duration, duration / 60, (duration / 60) / 60
        ))

    def __repr__(self):
        return "; ".join(["{0}={1}".format(k, v) for k, v in self.run_args.items()])
class GridSearchModel:
    """
    Define a set of model executions based on a set of parameters and their values variations.
    One task is created for every combination of parameter values (Cartesian product).

    For the parameters format, please check the `Run` documentations.

    >>> grid = GridSearchModel("ls",test=["-l", "-h","-lh"])
    >>> grid.run()
    """

    def __init__(self, command_base, **kwargs):
        """
        Constructor

        Parameters
        ----------
        command_base : str
            command base
        **kwargs : dict
            parameters; each key is associated with the sequence of values
            to try for that parameter
        """
        self.parameters = kwargs
        self.cpt = 0

        names = list(self.parameters)
        # itertools.product enumerates every combination exactly once, with
        # the last parameter varying fastest.
        combinations = list(itertools.product(*self.parameters.values()))
        self.number_of_combination = len(combinations)

        self.tasks = []
        for index, combination in enumerate(combinations):
            new_task = Run(str(index), command_base)
            # Parameters are added in declaration order, which is also the
            # order in which they appear on the command line.
            for key, val in zip(names, combination):
                new_task.add_parameter(key, val)
            self.tasks.append(new_task)

    def __repr__(self):
        return "\n".join([repr(task) for task in self.tasks])

    def run(self, log_filename=None):
        """
        Run all the tasks defined, one after the other

        Parameters
        ----------
        log_filename : str, optional
            log filename, by default None. The same filename is handed to
            every task; as `Run.run` opens it in write mode, each task
            overwrites the output logged by the previous one.
        """
        for task in self.tasks:
            task.run(log_filename=log_filename)
if __name__ == "__main__":
    # Demo: build a small grid around `ls` and display the parameters of
    # every generated task without executing anything.
    demo_parameters = {"test": ["-l", "-h", "-lh"], "rel": ["-i"]}
    demo_grid = GridSearchModel("ls", **demo_parameters)
    print(demo_grid)
    # demo_grid.run()
......@@ -121,4 +121,57 @@ class MetaDataSerializer(object):
"index_fn" : self.index_fn,
"keras_model_fn" : self.keras_model_fn,
"train_test_history_fn" : self.train_test_history_fn
},open(fn,'w'))
\ No newline at end of file
},open(fn,'w'))
import time
class Chronometer:
    """
    Measure the execution time of named tasks.

    A task is started with `start` and its duration, in seconds, is returned
    by `stop`. Durations are computed with a monotonic clock, so they are not
    affected by adjustments of the system clock.
    """

    def __init__(self):
        # Maps the name of each running task to its start timestamp.
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError(
                "A running task exists with the name {0}!".format(task_name)
            )
        self.__task_begin_timestamp[task_name] = time.monotonic()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Once stopped, the task is forgotten: the same name can be started
        again afterwards.

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exist with the id `task_name`
        """
        end_timestamp = time.monotonic()
        try:
            # pop() both reads and forgets the start timestamp in one lookup.
            begin_timestamp = self.__task_begin_timestamp.pop(task_name)
        except KeyError:
            raise ValueError(
                "The {0} task does not exist!".format(task_name)
            ) from None
        return end_timestamp - begin_timestamp
\ No newline at end of file
This diff is collapsed.
from lib.run import GridSearchModel
from collections import OrderedDict
# Flags selecting the relation types; the Wikipedia co-occurrence flag also
# carries the path of its data file.
WIKIPEDIA_COOC = "-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt"
rels = ["-i", "-a", WIKIPEDIA_COOC]

# Every single flag plus every unordered pair of distinct flags.
# Currently unused: the "rel" values below are listed by hand (see "#[comb]").
comb = []
for position, rel in enumerate(rels):
    comb.append(rel)
    comb.extend(rel + " " + rel2 for rel2 in rels[position + 1:])

grid = GridSearchModel(
    "python3 combination_embeddings.py",
    **OrderedDict({  # insertion order is the order of the parameters on the command line
        "rel": [
            WIKIPEDIA_COOC,
            WIKIPEDIA_COOC + " -i",
            WIKIPEDIA_COOC + " -a",
            WIKIPEDIA_COOC + " -a -i",
        ],  #[comb],
        "-n": [4],
        "--ngram-word2vec-iter": [1],
        "-e": [100],
        "geoname_fn": "../data/geonamesData/US_FR.txt".split(),
        "hierarchy_fn": "../data/geonamesData/hierarchy.txt".split(),
        # NOTE(review): GridSearchModel as shown in this commit treats this
        # like any other parameter -- confirm the intended effect.
        "store_true": ["rel"],
    }.items()))

print("########### THE FOLLOWING COMMAND(S) WILL BE EXECUTED ###########" )
# A plain loop: the printing is a side effect, no list needs to be built.
for task in grid.tasks:
    print(task.get_command())
print("#################################################################")

grid.run("log_RUN_TEXAS_IDFrance.txt")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment