diff --git a/README.md b/README.md index 5ef0cb2e04685d0af1d2f6ee39ebffbe64af57da..3eee5842dba0dccf55dd541f5655c33ea92186ee 100644 --- a/README.md +++ b/README.md @@ -166,5 +166,4 @@ To train the network with default parameter use the following command : | -t,--tolerance-value | K-value in the computation of the accuracy@k | | -e,--epochs | number of epochs | | -d,--dimension | size of the ngram embeddings | -| -m,--model | Neural Network architecture used | | --admin_code_1 | (Optional) If you wish to train the network on a specificate region | diff --git a/combination_embeddings.py b/combination_embeddings.py index d22359e2a932af51805490be2496c4d8564dbc10..9e2f03f7e8898bb61bc7697e5d368754030ac936 100644 --- a/combination_embeddings.py +++ b/combination_embeddings.py @@ -97,7 +97,8 @@ logging.basicConfig( ) chrono = Chronometer() -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-n 4 -t 0.002 -e 100 -m LSTM -a data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) +args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ + .parse_args("-n 4 -t 0.002 -e 20 -a -w -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) # Initialisee CONSTANTS GEONAME_FN = args.geoname_input @@ -106,6 +107,7 @@ NGRAM_SIZE = args.ngram_size ACCURACY_TOLERANCE = args.tolerance_value EPOCHS = args.epochs ITER_ADJACENCY = args.adjacency_iteration +COOC_SAMPLING_NUMBER = 3 # check for output dir if not os.path.exists("outputs/"): @@ -180,8 +182,8 @@ if args.wikipedia_cooc: filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices.union(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)])) - test_indices.union(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)])) + train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) logging.info("Merged with Geonames data !") @@ -189,7 +191,7 @@ if args.wikipedia_cooc: logging.info("Extracting cooccurrence relationships") cpt=0 for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in row.interlinks.split("|"): + for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): cpt+=1 rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) logging.info("Extract {0} cooccurrence relationships !".format(cpt)) @@ -283,13 +285,15 @@ logging.info("Data prepared !") # OUTPUT FN BASE name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) if args.adjacency: - name+="_A" + name += "_A" if args.inclusion: - name+="_I" + name += "_I" +if args.wikipedia_cooc: + name += "_C" # NGRAM EMBDEDDING logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=5) +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50) logging.info("Embedding generated !") # DEEP MODEL diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index 93662e1e63e5f2d95f679d0e49e1d0f64518dc64..a2fd9f120b3e791f17948eba7d02b8e2a34116e3 100644 --- a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -7,12 +7,11 @@ { "short": "-i", "long": "--inclusion", "action": "store_true" }, { "short": "-a", "long": "--adjacency", "action": "store_true" }, { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, - {"long": "--adjacency-iteration", "type":"int","default":5}, + {"long": "--adjacency-iteration", "type":"int","default":1}, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, - { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }, { "long": "--admin_code_1", "default": "None" } ] } \ No newline at end of file