diff --git a/combination_embeddings.py b/combination_embeddings.py
index 094578e8f491da71627ae823879f8f052987a916..0f4bd44ba6f1e5a2d3d02e14fdfba53e2c8fd457 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -384,4 +384,11 @@ model.save(MODEL_OUTPUT_FN)
 
 #Â Erase Model Checkpoint file
 if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
+    import shutil
+    checkpoint_path = MODEL_OUTPUT_FN + ".part"
+    # The leftover checkpoint may be a directory or a single file; handle
+    # both so cleanup never fails on the kind the old code used to remove.
+    if os.path.isdir(checkpoint_path):
+        shutil.rmtree(checkpoint_path)
+    else:
+        os.remove(checkpoint_path)
diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py
index 585d26722b3fbe52bd33c564dfa4c187281f48cd..6098c50cbf8f56f2f123c8cc68aaecbe2b105e49 100644
--- a/train_test_split_geonames.py
+++ b/train_test_split_geonames.py
@@ -12,9 +12,8 @@ logging.basicConfig(
 )
 
 from sklearn.model_selection import train_test_split
-from shapely.geometry import Point
 
-from lib.geo import Grid
+from lib.geo import latlon2healpix
 from helpers import read_geonames
 from tqdm import tqdm
 
@@ -32,47 +31,26 @@ FEATURE_CLASSES = args.feature_classes
 
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
-geoname_data["geometry"] = geoname_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
-geoname_data = gpd.GeoDataFrame(geoname_data)
 logging.info("Geonames data loaded!")
 
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places
 
-#Â World Shape bounds
-world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
-world["nn"] = 1
-dissolved = world.dissolve(by="nn").iloc[0].geometry
-
-#Creating Grid
-logging.info("Initializing Grid (360,180)...")
-g = Grid(*dissolved.bounds,[360,180])
-logging.info("Fit Data to the Grid...")
-g.fit_data(filtered)
-logging.info("Placing place into the grid...")
-[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))]
-
-#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
-logging.info("Associate a cell number to each place in the Geoname Dataframe")
-def foo(g,id_):
-    for ix,cell in enumerate(g.cells):
-        if id_ in cell.list_object:
-            return ix
-
-filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x))
-
+filtered["cat"] = filtered.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1)
 # TRAIN AND TEST SPLIT
 logging.info("Split Between Train and Test")
 
 # Cell can be empty
-i=0
+cat_unique = filtered.cat.unique()
+ci=0
 while 1:
-    if len(filtered[filtered.cat == i])> 1:
-        X_train,X_test = train_test_split(filtered[filtered.cat == i])
+    if len(filtered[filtered.cat == cat_unique[ci]])> 1:
+        X_train,X_test = train_test_split(filtered[filtered.cat == cat_unique[ci]])
         break
-    i+=1
+    ci+=1
 
-for i in range(i+1,len(g.cells)):
+# Resume after the seed category: its rows are already in X_train/X_test.
+for i in cat_unique[ci+1:]:
     try:
         x_train,x_test = train_test_split(filtered[filtered.cat == i])
         X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test))
@@ -80,12 +58,9 @@ for i in range(i+1,len(g.cells)):
         pass
         #print("Error",len(filtered[filtered.cat == i]))
 
-del X_train["geometry"]
-del X_train["nn"]
 del X_train["cat"]
 del X_test["cat"]
-del X_test["geometry"]
-del X_test["nn"]
+
 # SAVING THE DATA
 logging.info("Saving Output !")
 X_train.to_csv(GEONAME_FN+"_train.csv")