Skip to content
Snippets Groups Projects
Commit 70d5b190 authored by Jacques Fize's avatar Jacques Fize
Browse files

Final Debug (problem split)

parent 8e0560b4
No related branches found
No related tags found
No related merge requests found
...@@ -166,5 +166,4 @@ To train the network with default parameter use the following command : ...@@ -166,5 +166,4 @@ To train the network with default parameter use the following command :
| -t,--tolerance-value | K-value in the computation of the accuracy@k | | -t,--tolerance-value | K-value in the computation of the accuracy@k |
| -e,--epochs | number of epochs | | -e,--epochs | number of epochs |
| -d,--dimension | size of the ngram embeddings | | -d,--dimension | size of the ngram embeddings |
| -m,--model | Neural Network architecture used |
| --admin_code_1 | (Optional) If you wish to train the network on a specific region | | --admin_code_1 | (Optional) If you wish to train the network on a specific region |
...@@ -97,7 +97,8 @@ logging.basicConfig( ...@@ -97,7 +97,8 @@ logging.basicConfig(
) )
chrono = Chronometer() chrono = Chronometer()
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-n 4 -t 0.002 -e 100 -m LSTM -a data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
.parse_args("-n 4 -t 0.002 -e 20 -a -w -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
# Initialisee CONSTANTS # Initialisee CONSTANTS
GEONAME_FN = args.geoname_input GEONAME_FN = args.geoname_input
...@@ -106,6 +107,7 @@ NGRAM_SIZE = args.ngram_size ...@@ -106,6 +107,7 @@ NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = 3
# check for output dir # check for output dir
if not os.path.exists("outputs/"): if not os.path.exists("outputs/"):
...@@ -180,8 +182,8 @@ if args.wikipedia_cooc: ...@@ -180,8 +182,8 @@ if args.wikipedia_cooc:
filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
train_indices.union(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)])) train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
test_indices.union(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)])) test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
logging.info("Merged with Geonames data !") logging.info("Merged with Geonames data !")
...@@ -189,7 +191,7 @@ if args.wikipedia_cooc: ...@@ -189,7 +191,7 @@ if args.wikipedia_cooc:
logging.info("Extracting cooccurrence relationships") logging.info("Extracting cooccurrence relationships")
cpt=0 cpt=0
for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
for inter in row.interlinks.split("|"): for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
cpt+=1 cpt+=1
rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
logging.info("Extract {0} cooccurrence relationships !".format(cpt)) logging.info("Extract {0} cooccurrence relationships !".format(cpt))
...@@ -283,13 +285,15 @@ logging.info("Data prepared !") ...@@ -283,13 +285,15 @@ logging.info("Data prepared !")
# OUTPUT FN BASE # OUTPUT FN BASE
name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn)
if args.adjacency: if args.adjacency:
name+="_A" name += "_A"
if args.inclusion: if args.inclusion:
name+="_I" name += "_I"
if args.wikipedia_cooc:
name += "_C"
# NGRAM EMBDEDDING # NGRAM EMBDEDDING
logging.info("Generating N-GRAM Embedding...") logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=5) embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50)
logging.info("Embedding generated !") logging.info("Embedding generated !")
# DEEP MODEL # DEEP MODEL
......
...@@ -7,12 +7,11 @@ ...@@ -7,12 +7,11 @@
{ "short": "-i", "long": "--inclusion", "action": "store_true" }, { "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" }, { "short": "-a", "long": "--adjacency", "action": "store_true" },
{ "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
{"long": "--adjacency-iteration", "type":"int","default":5}, {"long": "--adjacency-iteration", "type":"int","default":1},
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
{ "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" },
{ "long": "--admin_code_1", "default": "None" } { "long": "--admin_code_1", "default": "None" }
] ]
} }
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment