Change file organisation

196cc045 · Jacques Fize · 1576e3e5 · 196cc045 · 196cc045
Commit 196cc045 authored 4 years ago by Jacques Fize
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -384,4 +384,5 @@ model.save(MODEL_OUTPUT_FN)
 # Erase Model Checkpoint file
 if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(MODEL_OUTPUT_FN + ".part")
+    import shutil
\ No newline at end of file
+    shutil.rmtree(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
--- a/train_test_split_geonames.py
+++ b/train_test_split_geonames.py
@@ -12,9 +12,8 @@ logging.basicConfig(
    )
 from sklearn.model_selection import train_test_split
-from shapely.geometry import Point
-from lib.geo import Grid
+from lib.geo import latlon2healpix
 from helpers import read_geonames
 from tqdm import tqdm 
@@ -32,47 +31,25 @@ FEATURE_CLASSES = args.feature_classes
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
-geoname_data["geometry"] = geoname_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
-geoname_data = gpd.GeoDataFrame(geoname_data)
 logging.info("Geonames data loaded!")
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places
-# World Shape bounds
+filtered["cat"] = filtered.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1)
-world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
-world["nn"] = 1
-dissolved = world.dissolve(by="nn").iloc[0].geometry
-#Creating Grid
-logging.info("Initializing Grid (360,180)...")
-g = Grid(*dissolved.bounds,[360,180])
-logging.info("Fit Data to the Grid...")
-g.fit_data(filtered)
-logging.info("Placing place into the grid...")
-[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))]
-#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
-logging.info("Associate a cell number to each place in the Geoname Dataframe")
-def foo(g,id_):
-    for ix,cell in enumerate(g.cells):
-        if id_ in cell.list_object:
-            return ix
-filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x))
 # TRAIN AND TEST SPLIT
 logging.info("Split Between Train and Test")
 #  Cell can be empty
-i=0
+cat_unique = filtered.cat.unique()
+ci=0
 while 1:
-    if len(filtered[filtered.cat == i])> 1:
+    if len(filtered[filtered.cat == cat_unique[ci]])> 1:
-        X_train,X_test = train_test_split(filtered[filtered.cat == i])
+        X_train,X_test = train_test_split(filtered[filtered.cat == cat_unique[ci]])
        break
-    i+=1
+    ci+=1
-for i in range(i+1,len(g.cells)):
+for i in cat_unique[ci:] :
    try:
        x_train,x_test = train_test_split(filtered[filtered.cat == i])
        X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test))
@@ -80,12 +57,9 @@ for i in range(i+1,len(g.cells)):
        pass #print("Error",len(filtered[filtered.cat == i]))
-del X_train["geometry"]
-del X_train["nn"]
 del X_train["cat"]
 del X_test["cat"]
-del X_test["geometry"]
-del X_test["nn"]
 # SAVING THE DATA
 logging.info("Saving Output !")
 X_train.to_csv(GEONAME_FN+"_train.csv")