diff --git a/README.md b/README.md
index b3c253ffd9be685ce217823a42146aa5d06a8da0..b8721bee30842a0eeee746e36fb0978885f84c03 100644
--- a/README.md
+++ b/README.md
@@ -96,8 +96,8 @@ The data preparation is divided into three steps. First, we retrieve required da
 
 1. First, you must download the Wikipedia corpus from which you want to extract co-occurrences : [English Wikipedia Corpus](https://dumps.wikimedia.org/enwiki/20200201/enwiki-20200201-pages-articles.xml.bz2)
 2. Parse the corpus with Gensim script using the following command : `python3 -m gensim.scripts.segment_wiki -i -f <wikicorpus> -o <1stoutputname>.json.gz`
-3. Build a page of interest file that contains a list of Wikipedia pages. Use the script `extract_pages_of_interest.py` for that. You can find [here](https://projet.liris.cnrs.fr/hextgeo/files/pages_of_interest/place_en_fr_page_clean.csv) a page of interest file that contains places that appears in both FR and EN wikipedia.
-4. Then using and index that contains pages of interest run the command : `python3 script/get_cooccurrence.py <page_of_interest_file> <2noutputname> -c <1stoutputname>.json.gz`
+3. Build a page of interest file that contains a list of Wikipedia pages. Use the script `extract_pages_of_interest.py` for that. You can find [here](https://projet.liris.cnrs.fr/hextgeo/files/pages_of_interest/place_en_fr_page_clean.csv) a page of interest file that contains places that appear in FR or EN Wikipedia.
+4. Then, using the page of interest file, run the command : `python3 script/get_cooccurrence.py <page_of_interest_file> <final_output_name> -c <1stoutputname>.json.gz`
 
 ### Generate dataset
 
@@ -113,6 +113,7 @@ Use the following command to generate the datasets for training your model.
 | --adj-nside | Healpix resolution where places within are considered adjacent |
 | --split-nside | Size of the zone where the train/test split are done |
 | --split-method | [per_pair\|per_entity] Split each dataset based on places (place cannot exists in both train and test) or pairs(place can appears in train and test) |
+| --no-sampling | Disable sampling when generating pairs |
 
 ### If you're in a hurry
 
@@ -123,7 +124,7 @@ French (also GB,US) Geonames, French (also GB,US) Wikipedia co-occurrence data, 
 
 To train the first model use the following command :
 
-    python3 train_geocoder_v2.py <dataset_name> <inclusion_dataset> <adjacent_dataset> <cooccurrence_dataset> [-i | -a | -w ]+ [optional args]
+    python3 train_geocoder.py <dataset_name> <inclusion_dataset> <adjacent_dataset> <cooccurrence_dataset> [-i | -a | -w ]+ [optional args]
 
 | Parameter | Description |
 |-----------------------|---------------------------------------------------------------------------------|
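For orientation, the sketch below (not the repository's code) shows one way the segmented dump from step 2 can be consumed: stream the gzipped JSON-lines output and count links between pages of interest. The file names, the assumption that the CSV's first column holds page titles, and the handling of `interlinks` (whose layout varies across gensim versions) are placeholders; the real extraction is implemented in `script/get_cooccurrence.py`.

```python
# A rough sketch, NOT the repo's implementation, of consuming the segment_wiki
# output. Assumptions: file names are placeholders, the pages-of-interest CSV
# stores titles in its first column, and "interlinks" is either a dict or a
# list of (target, anchor) pairs depending on the gensim version.
import gzip
import json
from collections import Counter

import pandas as pd

titles = set(pd.read_csv("place_en_fr_page_clean.csv").iloc[:, 0])
cooccurrences = Counter()

with gzip.open("enwiki-segmented.json.gz", "rt", encoding="utf-8") as dump:
    for line in dump:  # segment_wiki writes one JSON-serialized article per line
        article = json.loads(line)
        if article["title"] not in titles:
            continue
        # dict() normalizes both interlink layouts to {target: anchor_text};
        # "interlinks" is present because the dump was segmented with `-i`.
        for target in dict(article.get("interlinks", {})):
            if target in titles:  # keep links whose both ends are pages of interest
                cooccurrences[(article["title"], target)] += 1

print(cooccurrences.most_common(10))
```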
self.output.write("{0},{1},{2},{3},{4},{5}\n".format(self.epoch,end_time,logs["loss"],logs["val_loss"],logs["compute_metric"],logs["val_compute_metric"])) self.output.flush() self.epoch += 1