Commit af07d919 authored by Nelly Barret

[REVERT] revert to old version

parent 7b948e7d
Showing with 1703 additions and 1403 deletions
......@@ -20,7 +20,7 @@ def generate_charts():
data.generate_from_cities(cities)
lists = get_selected_indicators_lists(10)
for j, env in enumerate(["social"]):
dataset = Dataset(data, env, selected_indicators=lists["10"][env], train_size=0.8, test_size=0.2, _type="unsupervised")
dataset = Dataset(data, env, selected_indicators=lists["10"][env], train_size=0.8, test_size=0.2)
dataset.init_all_in_one()
algo = Chart(name='chart', dataset=dataset, number_of_iris=len(cities))
algo.compute_trendline()
......
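For context, a minimal sketch of the reverted call sequence in generate_charts() above. Data, Dataset, Chart and get_selected_indicators_lists are the project symbols visible in this diff, so this is illustrative rather than standalone:

# Hypothetical driver mirroring the reverted generate_charts();
# all names below are project classes/helpers shown elsewhere in this commit.
def generate_social_chart(cities):
    data = Data()
    data.generate_from_cities(cities)
    lists = get_selected_indicators_lists(10)
    # the revert drops the _type="unsupervised" keyword argument
    dataset = Dataset(data, "social", selected_indicators=lists["10"]["social"],
                      train_size=0.8, test_size=0.2)
    dataset.init_all_in_one()
    chart = Chart(name="chart", dataset=dataset, number_of_iris=len(cities))
    chart.compute_trendline()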
......@@ -80,45 +80,45 @@ def similarity(data1, data2, max_distance, nb_points):
class Chart(Method):
def __init__(self, name, dataset, number_of_iris=16):
def __init__(self, name, dataset, number_of_iris=12):
Method.__init__(self, name, dataset)
self.chart = None
self.dataset = dataset
# set all NaN values as 0
self.dataset.data = self.dataset.data.fillna(0)
self.number_of_iris = number_of_iris if number_of_iris % 2 == 0 else 12
# self.number_of_iris = number_of_iris if number_of_iris % 2 == 0 else 12
self.iris_per_line = 2
self.step = 0
self.max_value = 0
def compute_trendline(self):
# for indicator in self.dataset.selected_indicators:
fig, axs = plt.subplots(int(self.number_of_iris / 2), self.iris_per_line, figsize=(15, 15)) # rows, columns
# fig, axs = plt.subplots(int(self.number_of_iris / 2), self.iris_per_line, figsize=(15, 15)) # rows, columns
i, j, z = 0, 0, 1 # i and j are indices to plot sub-figures and z is the counter to place figures
all_relevant_indicators = []
n = 0
for index, row in self.dataset.data.head(self.number_of_iris).iterrows(): # head(self.number_of_iris).
for index, row in self.dataset.data.iterrows(): # head(self.number_of_iris).
data = []
list_indicators = self.dataset.selected_indicators if self.dataset.selected_indicators is not None else self.dataset.indicators
for indicator in list_indicators:
data.append(row[indicator])
max_value = self.dataset.data.head(self.number_of_iris)[self.dataset.selected_indicators].values.max() # .head(self.number_of_iris)
logging.debug("max value is %.4f", max_value)
x = np.arange(0, len(data))
y = data
# max_value = self.dataset.data[self.dataset.selected_indicators].values.max() # .head(self.number_of_iris)
# logging.debug("max value is %.4f", max_value)
# x = np.arange(0, len(data))
# y = data
mean_of_data = round(sum(data) / len(data), 3)
logging.debug("y = %s", y)
f = interp1d(x, y)
axs[i, j].axis(ymin=0, ymax=max_value)
axs[i, j].set_xticks(np.arange(0, len(data)))
self.max_value = max_value
self.step = max_value / 5
axs[i, j].set_yticks(np.arange(0, max_value, step=self.step))
axs[i, j].plot(x, data, 'o', x, f(x), '-')
for k, v in enumerate(data):
# label = # "{:.1E}".format(v)
axs[i, j].annotate(round(v, 2), (k, v), )
title = str(row['CODE']) + " - " + str(self.dataset.env) + " -" + str(mean_of_data)
# logging.debug("y = %s", y)
# f = interp1d(x, y)
# axs[i, j].axis(ymin=0, ymax=max_value)
# axs[i, j].set_xticks(np.arange(0, len(data)))
# self.max_value = max_value
# self.step = max_value / 5
# axs[i, j].set_yticks(np.arange(0, max_value, step=self.step))
# axs[i, j].plot(x, data, 'o', x, f(x), '-')
# for k, v in enumerate(data):
# # label = # "{:.1E}".format(v)
# axs[i, j].annotate(round(v, 2), (k, v), )
# title = str(row['CODE']) + " - " + str(self.dataset.env) + " -" + str(mean_of_data)
relevant_indicators = []
for ind in range(len(list_indicators)):
indicator = list_indicators[ind]
......@@ -132,15 +132,15 @@ class Chart(Method):
all_relevant_indicators = union(all_relevant_indicators, relevant_indicators)
n += 1
axs[i, j].set_title(title)
if z < self.iris_per_line:
z += 1
j += 1
else:
z = 1
i += 1
j = 0
fig.show()
# axs[i, j].set_title(title)
# if z < self.iris_per_line:
# z += 1
# j += 1
# else:
# z = 1
# i += 1
# j = 0
# fig.show()
print(all_relevant_indicators)
print(len(all_relevant_indicators))
# self.compute_similarity()
......
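The core of compute_trendline() is a per-IRIS linear interpolation over the selected indicator values. A self-contained sketch of that plotting idea, on synthetic data with only matplotlib and scipy:

# Standalone miniature of the trendline plot: raw indicator values as points,
# a piecewise-linear interpolation through them, each point labelled.
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d

data = [0.2, 0.5, 0.1, 0.9, 0.4]   # example indicator values for one IRIS
x = np.arange(len(data))
f = interp1d(x, data)              # linear interpolation, as in the hunk above
fig, ax = plt.subplots()
ax.axis(ymin=0, ymax=max(data))
ax.set_xticks(x)
ax.plot(x, data, 'o', x, f(x), '-')
for k, v in enumerate(data):
    ax.annotate(round(v, 2), (k, v))  # label each point with its value
plt.show()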
......@@ -10,7 +10,7 @@ import warnings
from area import area
from predihood import model
from predihood.cleaning import clean
from predihood.cleansing import clean
from predihood.config import FILE_CLEANED_DATA, FOLDER_DATASETS, ENVIRONMENT_VARIABLES, FILE_GROUPING
from predihood.utility_functions import address_to_code, append_indicator, append_target
......@@ -348,13 +348,13 @@ if __name__ == '__main__':
data = Data()
cities = {
# "tassin": ["maisons", "résidentiel", "espaces verts", "périurbain", "ouest lyon", "moyen-sup"],
# "107 rue jean voillot villeurbanne": ["grands ensembles", "autres activités", "urbanisé", "urbain", "est lyon", "popu"],
"tassin": ["maisons", "résidentiel", "espaces verts", "périurbain", "ouest lyon", "moyen-sup"],
"107 rue jean voillot villeurbanne": ["grands ensembles", "autres activités", "urbanisé", "urbain", "est lyon", "popu"],
"saint cyr au mont d'or": ["maisons", "résidentiel", "arboré", "périurbain", "ouest lyon", "sup"],
# "doua villeurbanne": ["immeubles", "autres activités", "urabnisé", "central", "est-lyon", "moyen-inf"],
"doua villeurbanne": ["immeubles", "autres activités", "urabnisé", "central", "est-lyon", "moyen-inf"],
"part dieu lyon": ["immeubles", "commerçant", "urbanisé", "centrtal", "centre lyon", "moyen"],
# "dompierre sur besbre": ["maisons", "résidentiel", "arboré", "rural", "est moulins", "sup"],
# "rue de la favorite lyon": ["mixte", "commercant", "urbanisé", "urbain", "nord-ouest lyon", "moyen"],
# "lezoux": ["maisons", "résidentiel", "arboré", "rural", "est clermont-ferrand", "moyen-sup"]
"dompierre sur besbre": ["maisons", "résidentiel", "arboré", "rural", "est moulins", "sup"],
"rue de la favorite lyon": ["mixte", "commercant", "urbanisé", "urbain", "nord-ouest lyon", "moyen"],
"lezoux": ["maisons", "résidentiel", "arboré", "rural", "est clermont-ferrand", "moyen-sup"]
}
data.generate_from_cities(cities)
import logging
import os
import warnings
log_format = "[%(levelname)s] - %(filename)s::%(lineno)d \t %(message)s"
logging.basicConfig(level='DEBUG', format=log_format)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from predihood.classes.Method import Method
from predihood.config import ENVIRONMENT_VARIABLES, FILE_CLEANED_DATA, FOLDER_DISTRIBUTION
from predihood.utility_functions import sim, auto_label
warnings.simplefilter(action='ignore', category=FutureWarning)
class MethodCleansing(Method):
def __init__(self, name, dataset):
"""
Constructor of the MethodCleansing class. Initialize attributes.
"""
Method.__init__(self, name, dataset)
self.values_by_env = {} # dict to store values for each environment variable, e.g. [Maisons, Maison]
self.columns_dep = [] # departure columns, e.g. Abatiment, Ausage,...
self.columns_arr = [] # arrival columns, e.g. Nbatiment, Nusage, ...
self.columns = {
"occup": {
"name_dep": "Aoccup",
"data_dep": self.dataset.Aoccup,
"name_arr": "Noccup",
"data_arr": self.dataset.Noccup
}
}
for env in ENVIRONMENT_VARIABLES:
temp = {
"name_dep": "A" + env,
"name_arr": "N" + env,
"data_dep": self.dataset["A" + str(env)],
"data_arr": self.dataset["N" + str(env)]
}
self.columns[env] = temp
for env in self.columns: self.columns_dep.append(self.columns[env]["data_dep"])
for env in self.columns: self.columns_arr.append(self.columns[env]["data_arr"])
# define outliers (to be removed)
self.outliers = ['Oui', 'Moyenne-sup', 'Location']
# plot variables
self.before = {} # departure values for each EV
self.after = {} # arrival values for each EV
self.labels = {} # labels for EV, e.g. maisons, immeubles, grand ensemble...
def clean(self):
"""
Clean data from bad naming conventions.
"""
logging.info(
"The data needs to be cleaned. For each list, write the correct word. For each environment variable, you will get its number of corrections and its error rate.")
# 1. getting wrong values in a dict ordered by env variable
self.values_by_env = {}
for col_dep, col_arr in zip(self.columns_dep, self.columns_arr):
col_name = col_dep.name[1:]
self.values_by_env[col_name] = []
for val in col_dep.unique(): # get possible values for the current column
index = sim(val, self.values_by_env[col_name])
# if the value is similar to an existing group, add it; otherwise start a new group with it
if index >= 0:
self.values_by_env[col_name][index].append(val)
elif index == -1:
self.values_by_env[col_name].append([val])
for val in col_arr.unique():
index = sim(val, self.values_by_env[col_name])
if index >= 0:
self.values_by_env[col_name][index].append(val)
elif index == -1:
self.values_by_env[col_name].append([val])
# 2. renaming these wrong values in data
for key, value in self.values_by_env.items():
col_nameA = "A" + key
col_nameN = "N" + key
nb_replacement_dep = 0
nb_replacement_arr = 0
for i in range(len(value)):
if len(value[i]) > 1:
arr_without_duplicates = list(dict.fromkeys(value[i]))
chosen_label = input(str(arr_without_duplicates) + ": ")
for label in value[i]:
if label != chosen_label: # if label == chosen_label: skip it because no replacement is needed
nb_replacement_dep += pd.Series(self.dataset[col_nameA] == label).sum()
nb_replacement_arr += pd.Series(self.dataset[col_nameN] == label).sum()
self.dataset.loc[self.dataset[col_nameA] == label, col_nameA] = chosen_label
self.dataset.loc[self.dataset[col_nameN] == label, col_nameN] = chosen_label
size = int(self.dataset.count()["A" + key]) + int(self.dataset.count()["N" + key])
mean_error = ((nb_replacement_dep + nb_replacement_arr) / size) * 100
logging.debug(
"%d IRIS have been corrected for the environment variable %s, corresponding to an error rate of %.0f %%",
(nb_replacement_dep + nb_replacement_arr), key, mean_error)
# 3. removing outliers from data
count = 0
for outlier in self.outliers:
self.dataset.drop(self.dataset[self.dataset.eq(outlier).any(axis=1)].index, inplace=True)
count += 1
logging.debug("%d outliers removed", count)
# 4. save data
self.dataset.to_csv(FILE_CLEANED_DATA, index=False, encoding='utf-8')
logging.info("Cleaned data is in %s", FILE_CLEANED_DATA)
def create_before_after_labels(self, name_dep, name_arr):
"""
Creates the arrays 'before', 'after' and 'labels' from data.
:param name_dep: the name of the departure column, e.g. Aoccup, Abatiment, Ausage...
:param name_arr: the name of the arrival column, e.g. Noccup, Nbatiment, Nusage...
"""
all_repartition = {}
self.before = {}
self.after = {}
for status, value in self.dataset[name_dep].value_counts().items():
if name_dep == "Ageo": # if geo, get only the geo position (South, East, ..) and not the city
status = status.split(" ")[0]
if status in self.before:
self.before[status] += value
else:
self.before[status] = value
else:
self.before[status] = value # self.dataset[values_before].value_counts()[status]
for status, value in self.dataset[name_arr].value_counts().items():
if name_arr == "Ngeo": # if geo, get only the geo position (South, East, ..) and not the city
status = status.split(" ")[0]
if status in self.after:
self.after[status] += value
else:
self.after[status] = value
else:
self.after[status] = value # self.dataset[values_after].value_counts()[status]
# 2. merge before and after data in the same dict
for status in self.before:
all_repartition[status] = [self.before[status], 0]
for status in self.after:
if status not in all_repartition:
all_repartition[status] = [0, self.after[status]]
else:
all_repartition[status][1] = self.after[status]
# 3. convert dict in 3 arrays
self.before = []
self.after = []
self.labels = []
for key in all_repartition:
if not isinstance(key, float): # to remove nan values
self.before.append(all_repartition[key][0])
self.after.append(all_repartition[key][1])
self.labels.append(key)
def create_bar_chart(self, name, title):
"""
Plot before/after charts.
:param name: the name of the target to plot, i.e. environment variable, e.g. usage, batiment, ...
:param title: the title of the plot.
"""
x = np.arange(len(self.labels)) # the label locations
width = 0.35
fig, ax = plt.subplots()
ax.bar(x - width / 2, [154 for _ in range(len(self.labels))], width=width, color="#DCDCDC") # grey bar
bef = ax.bar(x - width / 2, self.before, width=width, label='Avant') # before data
ax.bar(x + width / 2, [154 for _ in range(len(self.labels))], width=width, color="#DCDCDC") # grey bar
aft = ax.bar(x + width / 2, self.after, width=width, label='Après') # after data
ax.set_ylabel('Nombre de personnes')
plt.xticks(x, self.labels, rotation='vertical')
auto_label(bef, ax)
auto_label(aft, ax)
plt.tight_layout()
ax.legend()
filename = os.path.join(FOLDER_DISTRIBUTION, "distribution_" + name + ".png")
fig.savefig(filename)
ax.set_title(title)
plt.show()
def to_chart(self, env, name, title):
"""
Create before/after data and plot it.
:param env: the target to plot, i.e. the environment variable, e.g. usage, paysage...
:param name: the name to save the file.
:param title: the title of the plot.
"""
self.create_before_after_labels(self.columns[env]["name_dep"], self.columns[env]["name_arr"])
self.create_bar_chart(name, title)
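clean() leans on the project helper sim() to group near-duplicate labels before asking the user for the canonical spelling. sim()'s implementation is not part of this diff, so here is a plausible stand-in built on difflib; the 0.8 similarity threshold is an assumption:

# Hypothetical sim(): returns the index of the group `val` resembles, or -1.
from difflib import SequenceMatcher

def sim(val, groups, threshold=0.8):
    for i, group in enumerate(groups):
        for g in group:
            if SequenceMatcher(None, str(val).lower(), str(g).lower()).ratio() >= threshold:
                return i
    return -1

values_by_env = []
for val in ["Maison", "Maisons", "maison", "Immeubles"]:
    index = sim(val, values_by_env)
    if index >= 0:
        values_by_env[index].append(val)   # near-duplicate of an existing group
    elif index == -1:
        values_by_env.append([val])        # new group
print(values_by_env)  # [['Maison', 'Maisons', 'maison'], ['Immeubles']]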
......@@ -36,7 +36,7 @@ class MethodPrediction(Method):
def predict(self, iris_code=None):
"""
Predict environment variables for the given iris. The environment variable to predict is stored in the dataset as "env" variable
Predict environment variables for the given iris.
"""
iris_object = model.get_iris_from_code(iris_code)
iris_area = area(model.get_coords_from_code(iris_code)) / 1000000
......@@ -69,7 +69,6 @@ class MethodPrediction(Method):
df = pd.DataFrame([iris_indicators_values], columns=iris_indicators_names)
self.prediction = self.classifier.predict(df)[0]
print(self.prediction)
def plot(self):
max_depths = np.linspace(1, 32, 32, endpoint=True)
......
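MethodPrediction.predict() wraps one IRIS's indicator values in a single-row DataFrame before calling the fitted classifier. A minimal, self-contained illustration with toy data; the indicator names are taken from the lists later in this diff and the labels from the cities dict above:

# One-row prediction: same column names as the training frame.
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

X_train = pd.DataFrame({"DENSITY": [100, 5000, 200], "AREA": [12.0, 1.5, 9.0]})
y_train = ["rural", "urbain", "rural"]
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

iris_row = pd.DataFrame([[4500, 2.0]], columns=["DENSITY", "AREA"])  # one IRIS
print(clf.predict(iris_row)[0])  # urbain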
......@@ -62,11 +62,10 @@ class MethodSelection(Method):
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
self.best_indicators = []
print(upper)
for i in range(len(upper.columns)):
column = upper.columns[i]
for k, value in upper[column].items():
if value == 1 and column not in self.best_indicators: # and (column, k) not in self.best_indicators and (k, column) not in self.best_indicators:
if value == 1 and column not in self.best_indicators:
self.best_indicators.append(column)
if TITLES: plt.title("Matrice de corrélation : filtrage = " + (
......
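The MethodSelection hunk keeps the upper triangle of the correlation matrix and flags one column out of every perfectly correlated pair. A runnable miniature of that filter; it uses np.isclose where the hunk tests value == 1 exactly, which is fragile with floats:

# Column b is an exact multiple of a, so it is flagged; c is kept.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6], "c": [5, 1, 4]})
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
best_indicators = []
for column in upper.columns:
    for k, value in upper[column].items():
        if np.isclose(value, 1) and column not in best_indicators:
            best_indicators.append(column)
print(best_indicators)  # ['b']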
This diff is collapsed.
This diff is collapsed.
import logging
log_format = "[%(levelname)s] - %(filename)s::%(lineno)d \t %(message)s"
logging.basicConfig(level='DEBUG', format=log_format)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from predihood.classes.MethodCleansing import MethodCleansing
from predihood.config import FILE_DATA_HIL, FOLDER_FEATURE_SELECTION, TITLES
def clean():
data = pd.read_excel(FILE_DATA_HIL)
# rename columns because of bad naming convention
data.columns = ["id", "nom", "num_HiL", "sexe", "age", "nb_enfant",
"rev_mensuel", "loyer_charges_mensuel", "rev_fiscal",
"Aadresse", "Apays", "Aoccup", "Aloyer",
"Nadresse", "Npays", "Noccup",
"motif",
"Acontexte", "Atype", "Abatiment", "Ausage", "Apaysage", "Amorpho", "Ageo", "Asocial",
"Ncontexte", "Nbatiment", "Nusage", "Npaysage", "Nmorpho", "Ngeo", "Nsocial"]
data.head()
cleansing = MethodCleansing("cleansing", data)
cleansing.clean()
logging.info("Many plots have be generated in " + FOLDER_FEATURE_SELECTION)
# 1. distribution between women and men
women_men = data.iloc[:, 3].value_counts()  # male/female split
labels = 'Femme', 'Homme'
sizes = [women_men['Femme'], women_men['Homme']]
colors = ['salmon', 'lightblue']
plt.pie(sizes, labels=labels, colors=colors, autopct='%i%%', shadow=True, startangle=90)
plt.axis('equal')
if TITLES: plt.title("Distribution selon le genre")
plt.show()
# 2. distribution between ages
data_temp = data
data_temp = data_temp.dropna()
ages_plot = []
total_plot = []
min_age = int(min(data_temp.iloc[:, 4]))
max_age = int(max(data_temp.iloc[:, 4]))
for counter in range(min_age, max_age + 1):
total_plot.append(data_temp.loc[data_temp.age == float(counter), 'age'].count())
ages_plot.append(counter)
mean = np.average(ages_plot, weights=total_plot)
logging.debug("mean age: %.1f", mean)
plt.bar(ages_plot, total_plot)
plt.axvline(x=mean, color='red')  # mean line
plt.xlabel("Âge (en années)")
plt.ylabel("Nombre de personnes")
if TITLES: plt.title('Distribution selon l\'âge')
plt.show()
ages = data_temp.iloc[:, 4]
plt.hist(ages, facecolor='gray', align='mid')
plt.xlabel("Âge (en années)")
plt.ylabel("Nombre de personnes")
if TITLES: plt.title("Distribution selon l'âge")
plt.show()
# 3. distribution between incomes
incomes = data.iloc[:, 6]
plt.hist(incomes, facecolor='gray', align='mid')
if TITLES: plt.title("Distribution selon le revenu le mensuel")
plt.xlabel("Revenu mensuel (en euros)")
plt.ylabel("Nombre de personnes")
plt.show()
# 4. distribution between reasons of transfer
transfers = data.iloc[:, 16].value_counts()
labels = transfers.index.tolist()
sizes = [transfers[i] for i in range(len(transfers))]
plt.pie(sizes, labels=labels, autopct='%.2f', shadow=True, startangle=90)
plt.axis('equal')
if TITLES: plt.title("Distribution selon le motif de mutation")
plt.show()
# 5. distribution between geographic positions
geo = pd.concat([data.iloc[:, 23], data.iloc[:, 30]], ignore_index=True) # .values_count()
split_geo = [geo[i].split()[0] if not isinstance(geo[i], float) else "" for i in range(len(geo))]
set_geo = set(split_geo)
uniques = [split_geo.count(elem) for elem in set_geo]
labels = set_geo
plt.pie(uniques, labels=labels, autopct='%.2f', shadow=True, startangle=90)
if TITLES: plt.title("Distribution selon la position géographique")
plt.show()
# 6. evolution between before and after transfer for each environment variable
cleansing.to_chart('occup', 'status', 'Évolution des statuts avant et après la mutation')
cleansing.to_chart('batiment', 'building_type', 'Évolution des types de bâtiments avant et après la mutation')
cleansing.to_chart('usage', 'building_usage', 'Évolution des usages de bâtiments avant et après la mutation')
cleansing.to_chart('paysage', 'landscapes', 'Évolution des paysages avant et après la mutation')
cleansing.to_chart('morpho', 'morpho', 'Évolution des morphologies avant et après la mutation')
cleansing.to_chart('social', 'social', 'Évolution des classes sociales avant et après la mutation')
cleansing.to_chart('geo', 'geo', 'Évolution des positions géographiques avant et après la mutation')
if __name__ == '__main__':
clean()
,nelly,MacBook-Pro.local,11.06.2020 17:30,file:///Users/nelly/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
,nelly,MacBook-Pro.local,29.05.2020 15:26,file:///Users/nelly/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
P14_RP,DENSITY,C14_MEN,C14_ACTOCC1564,P14_MEN,P14_LOG,DEC_Q314,DEC_D214,P14_RP_ACHTOT,P14_NBPI_RP_ANEM0509,P14_ANEM_RP_PROP,P14_POP1564,P14_RP_SDB,DEC_D114,P14_NPER_RP_PROP,DEC_D314,C14_PMEN_MENFAM,P14_POP30P,P14_NBPI_RPMAISON,P14_NBPI_RP_ANEM0204,C14_ACT1564,AREA,P14_POP,DEC_D414,P14_NBPI_RPAPPART,P14_POP15P,P14_ACTOCC1564,P14_PMEN,P14_ANEM_RP_LOCHLMV,P14_RPAPPART_ACHTOT,P14_NSCOL15P,DEC_D714,P14_RPAPPART,DEC_D614,DEC_D814,DEC_MED14,P14_NBPI_RP_ANEM10P,P14_PMEN_ANEM10P,P14_NBPI_RP,P14_ANEM_RP,DEC_Q114,P14_ACTOCC15P,C14_PMEN_MENCOUPAENF,P14_NPER_RP,P14_POPMEN15P,P14_MAISON,P14_ACT1564,P14_ANEM_RP_LOC,C14_POP15P,P14_RSECOCC,P14_APPART,P14_POP_FR,DEC_D914,P14_POP2064,C14_ACTOCC15P,C14_PMEN
P14_LOG,DENSITY,P14_RP_SDB,P14_POP15P,DEC_D914,P14_ACTOCC15P,C14_ACTOCC1564,DEC_MED14,P14_MAISON,DEC_D814,DEC_D614,P14_POP30P,P14_POP_FR,P14_NBPI_RPMAISON,P14_NBPI_RP_ANEM10P,DEC_D414,DEC_D114,P14_ANEM_RP_LOCHLMV,P14_PMEN_ANEM10P,C14_PMEN_MENCOUPAENF,P14_RPAPPART_ACHTOT,P14_APPART,P14_NBPI_RP,P14_RPAPPART,C14_MEN,P14_ANEM_RP_PROP,DEC_Q314,DEC_D714,P14_PMEN,P14_POP2064,P14_NPER_RP_PROP,P14_MEN,P14_POP,C14_ACTOCC15P,P14_NPER_RP,DEC_D314,P14_POPMEN15P,P14_RP,P14_NSCOL15P,C14_PMEN_MENFAM,P14_POP1564,P14_NBPI_RP_ANEM0509,AREA,C14_POP15P,P14_ACTOCC1564,P14_ANEM_RP_LOC,C14_ACT1564,C14_PMEN,P14_NBPI_RPAPPART,P14_RP_ACHTOT,P14_ANEM_RP,DEC_Q114,P14_ACT1564,DEC_D214,P14_RSECOCC,P14_NBPI_RP_ANEM0204
......@@ -96,12 +96,10 @@ def run_algorithm():
@app.route('/predict_iris', methods=["GET"])
def predict_iris():
iris_code_to_predict = request.args['iris_code']
clf_name = request.args['algorithm_name']
clf = get_classifier(clf_name)
data = Data(normalize="density", filter=True)
data.init_all_in_one()
predictions = predict_one_iris(iris_code_to_predict, data, clf, 0.8, 0.2, False) # clf
predictions = predict_one_iris(iris_code_to_predict, data, KNeighborsClassifier(n_neighbors=30), 0.8, 0.2, False)
return {"predictions": predictions}
......
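After this revert, /predict_iris ignores any classifier choice and always uses KNeighborsClassifier(n_neighbors=30); the algorithm_name parameter is gone. A hedged client-side sketch, assuming the Flask app is served locally on port 5000 (the IRIS code is the one used in __main__ below):

# Calling the reverted endpoint; only iris_code is sent now.
import requests

resp = requests.get("http://localhost:5000/predict_iris",
                    params={"iris_code": "692440102"})
print(resp.json()["predictions"])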
......@@ -347,11 +347,11 @@ def expe4(data, clf, train_size, test_size, remove_outliers=False):
results[env] = OrderedDict()
logging.debug("--- %s ---", env)
dataset = Dataset(data_not_filtered, env, selected_indicators=data_not_filtered.indicators, train_size=train_size, test_size=test_size, outliers=remove_outliers, _type='supervised')
dataset = Dataset(data_not_filtered, env, selected_indicators=data_not_filtered.indicators, train_size=train_size, test_size=test_size, outliers=remove_outliers)
dataset.init_all_in_one()
mean_classifier = 0.0
algo = MethodPrediction(name="", dataset=dataset, classifier=clf)
algo = MethodPrediction(name="", dataset=dataset, classifier=clf, _type='supervised')
algo.fit()
algo.compute_performance()
results[env]["accuracy_none"] = algo.accuracy
......@@ -359,9 +359,9 @@ def expe4(data, clf, train_size, test_size, remove_outliers=False):
logging.debug("accuracy for %s without filtering: %f", env, algo.accuracy)
predictions[env] = []
for top_k, lst in lists.items():
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers, _type='supervised')
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers)
dataset.init_all_in_one()
algo2 = MethodPrediction(name='', dataset=dataset, classifier=clf)
algo2 = MethodPrediction(name='', dataset=dataset, classifier=clf, _type='supervised')
# logging.debug("size of X_train: %d", len(algo2.dataset.X_train.columns))
# logging.debug(algo2.dataset.X_train.columns)
algo2.fit()
......@@ -426,20 +426,19 @@ def predict_one_iris(iris_code, data, clf, train_size, test_size, remove_outlier
for j, env in enumerate(ENVIRONMENT_VARIABLES):
predictions_lst = []
for top_k, lst in lists.items():
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers, _type='supervised')
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers)
dataset.init_all_in_one()
algo = MethodPrediction(name='', dataset=dataset, classifier=clf)
algo = MethodPrediction(name='', dataset=dataset, classifier=clf, _type='supervised')
# logging.debug("size of X_train: %d", len(algo2.dataset.X_train.columns))
# logging.debug(algo2.dataset.X_train.columns)
algo.fit()
algo.predict(iris_code)
predictions_lst.append(algo.prediction)
predictions[env] = get_most_frequent(predictions_lst) # get the most frequent value, i.e. choose among the result of each list
print(predictions)
return predictions
def predict_k_means(data, iris_code):
def predict_k_means(data):
nb_clusters = {
"batiment": 5,
"usage": 3,
......@@ -448,32 +447,30 @@ def predict_k_means(data, iris_code):
"geo": 9,
"social": 5
}
lists = get_selected_indicators_lists()
for j, env in enumerate(ENVIRONMENT_VARIABLES):
for top_k, lst in lists.items():
if top_k == "10":
dataset = Dataset(data, env, selected_indicators=lst[env], _type='unsupervised')
dataset.init_all_in_one()
# cost = []
# for i in range(1, 11):
# kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=i, random_state=RANDOM_STATE))
# kmeans.fit()
# cost.append(kmeans.classifier.inertia_)
# plt.plot(range(1, 11), cost, color='g', linewidth='3')
# plt.xlabel("Value of K")
# plt.ylabel("Squared Error (Cost)")
# plt.show()
# print("top-k:", top_k, "--", env)
# dataset = Dataset(data, env, _type='unsupervised') # selected_indicators=lst[env],
# dataset.init_all_in_one()
# print("K-means with", nb_clusters[env], "clusters")
kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=nb_clusters[env], random_state=RANDOM_STATE))
kmeans.fit()
kmeans.predict(iris_code)
print(kmeans.classifier.labels_)
chart = Chart(dataset=dataset, name='')
chart.compute_trendline()
# for top_k, lst in lists.items():
# if top_k == "10":
# dataset = Dataset(data, env, selected_indicators=lst[env], _type='unsupervised')
# dataset.init_all_in_one()
# cost = []
# for i in range(1, 11):
# kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=i, random_state=RANDOM_STATE))
# kmeans.fit()
# cost.append(kmeans.classifier.inertia_)
# plt.plot(range(1, 11), cost, color='g', linewidth='3')
# plt.xlabel("Value of K")
# plt.ylabel("Squared Error (Cost)")
# plt.show()
# print("top-k:", top_k, "--", env)
dataset = Dataset(data, env, _type='unsupervised') # selected_indicators=lst[env],
dataset.init_all_in_one()
# print("K-means with", nb_clusters[env], "clusters")
# kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=nb_clusters[env], random_state=RANDOM_STATE))
# kmeans.fit()
# print(kmeans.classifier.labels_)
chart = Chart(dataset=dataset, name='')
chart.compute_trendline()
if __name__ == '__main__':
......@@ -497,7 +494,7 @@ if __name__ == '__main__':
#
data = Data(normalize="density", filter=True)
data.init_all_in_one()
predict_k_means(data, "692440102")
# predict_k_means(data)
#
# data = Data(normalize="pop", filter=True)
# data.init_all_in_one()
......@@ -507,4 +504,4 @@ if __name__ == '__main__':
# expe2(data)
# expe3(data)
# expe4(data, RandomForestClassifier(), 0.8, 0.2)
# expe5(data, RandomForestClassifier(), 0.8, 0.2)
expe5(data, RandomForestClassifier(), 0.8, 0.2)
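The block commented out inside predict_k_means() above is the classic elbow method for choosing k: fit K-means for k = 1..10 and plot the inertia. A self-contained version on synthetic data, assuming scikit-learn is available:

# Elbow method on random data standing in for the IRIS indicators.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
cost = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0, n_init=10).fit(X)
    cost.append(kmeans.inertia_)   # within-cluster squared error
plt.plot(range(1, 11), cost, color='g', linewidth=3)
plt.xlabel("Value of K")
plt.ylabel("Squared Error (Cost)")
plt.show()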
......@@ -74,13 +74,13 @@ def generate_lists():
data.init_all_in_one()
# # 2. Run heat map and get less correlated indicators
dataset = Dataset(data, "batiment", 'unsupervised') # WARNING: fill _type parameter
dataset = Dataset(data, "batiment") # WARNING: fill _type parameter
dataset.init_all_in_one()
heat_map = MethodSelection(name="heat map EV-agnostic", dataset=dataset, parameters=PARAMETERS)
heat_map.results()
# heat_map.draw_and_save()
fully_correlated_indicators = heat_map.best_indicators
logging.info("fully correlated indicators: %d %s", len(fully_correlated_indicators), fully_correlated_indicators)
logging.debug("fully correlated indicators: %s", fully_correlated_indicators)
hierarchy = pd.read_csv(FILE_HIERARCHY)
......@@ -96,7 +96,7 @@ def generate_lists():
# logging.debug("%s { threshold_HM: %f, min_col_HM: %f, top_k: %d}", env, PARAMETERS["threshold_HM"], PARAMETERS["min_col_HM"], PARAMETERS["top_k"])
# B. FEATURE IMPORTANCE on uncorrelated indicators (the ones that are not chosen by heat map) to select the most relevant ones
dataset = Dataset(data, env, indicators_to_remove=fully_correlated_indicators, _type="supervised") # WARNING: fill _type parameter
dataset = Dataset(data, env, indicators_to_remove=fully_correlated_indicators) # WARNING: fill _type parameter
dataset.init_all_in_one()
# a. get best indicators for ET
......@@ -117,8 +117,9 @@ def generate_lists():
primary_FI = best_indicators_FI_ET # [indicator[0] for indicator in best_indicators_FI_ET]
indic_ET = [best_indicators_FI_ET[i][0] for i in range(len(best_indicators_FI_ET))]
for i in range(len(best_indicators_FI_RF)):
# [['indic1', score1], ['indic2', score2], ...]
index_indicator_in_ET = indic_ET.index(best_indicators_FI_RF[i][0]) if best_indicators_FI_RF[i][0] in indic_ET else -1
# [['indic1', score1], ['indic2', score2], ...]
index_indicator_in_ET = indic_ET.index(best_indicators_FI_RF[i][0]) if best_indicators_FI_RF[i][
0] in indic_ET else -1
if index_indicator_in_ET >= 0:
primary_FI[index_indicator_in_ET][1] += best_indicators_FI_RF[i][1]
else:
......@@ -142,6 +143,7 @@ def generate_lists():
def selection_by_distribution(dataset):
# for indicator in self.dataset.selected_indicators:
# fig, axs = plt.subplots(int(self.number_of_iris / 2), self.iris_per_line, figsize=(15, 15)) # rows, columns
i, j, z = 0, 0, 1 # i and j are indices to plot sub-figures and z is the counter to place figures
all_relevant_indicators = []
n = 0
for index, row in dataset.data.iterrows(): # head(self.number_of_iris).
......@@ -173,9 +175,7 @@ def selection_by_distribution(dataset):
if __name__ == '__main__':
# generate_all_data()
# generate_lists()
data = Data(normalize="density", filter=True)
data.init_all_in_one()
dataset = Dataset(data, "batiment", "unsupervised")
......
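The merging step in generate_lists() sums the per-indicator scores produced by Extra Trees and Random Forest, and appends indicators that only one model ranked. A miniature with made-up scores; the indicator names come from the lists earlier in this diff:

# Merge two [name, score] rankings: shared names are summed, others appended.
best_indicators_FI_ET = [["P14_LOG", 0.12], ["DENSITY", 0.10]]   # Extra Trees
best_indicators_FI_RF = [["DENSITY", 0.09], ["AREA", 0.07]]      # Random Forest

primary_FI = [list(pair) for pair in best_indicators_FI_ET]
indic_ET = [pair[0] for pair in primary_FI]
for name, score in best_indicators_FI_RF:
    index_in_ET = indic_ET.index(name) if name in indic_ET else -1
    if index_in_ET >= 0:
        primary_FI[index_in_ET][1] += score   # DENSITY's two scores are summed
    else:
        primary_FI.append([name, score])      # AREA was ranked by RF only
print(primary_FI)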
......@@ -14,7 +14,6 @@ let baseLayers = null; // array of basic layers
let overlayLayers = null; // array of overlaying layers
let osmLayer = null; // openstreetmap basic layer
let irisLayer = null; // layer of displayed IRIS
let previously_selected_algorithm = null; // store the selected algorithm in the popup
/**
......@@ -43,14 +42,12 @@ function initialize() {
** Event for zoom changes: updates a label and, if zoom is enabled and above the minimum zoom level, displays the IRIS
*/
function zoomendEvent() {
let zoomLevel = map.getZoom();
zoomLevel = map.getZoom();
document.getElementById("spanZoomLevel").innerHTML = zoomLevel;
let isZoomDisabled = $("#inputZoomLevel").prop("disabled");
let iris;
let bounds;
if (!isZoomDisabled) { // display after zoomend is enabled
isZoomDisabled = $("#inputZoomLevel").prop("disabled");
if(!isZoomDisabled) { // display after zoomend is enabled
let minZoomLevel = $("#inputZoomLevel").val();
if (zoomLevel >= minZoomLevel) { // display iris on the selected zone
if(zoomLevel >= minZoomLevel) { // display iris on the selected zone
bounds = map.getBounds();
iris = getIrisForBounds(bounds);
}
......@@ -77,52 +74,36 @@ function resetHighlightAll() {
}
}
function displayPopup(e) {
let layer = e.target;
let code_iris = layer.feature.properties.CODE_IRIS;
let selected_algorithm = $("#selectAlgorithmTooltip option:selected").val();
let predictions = undefined;
if (selected_algorithm !== "undefined" && selected_algorithm !== undefined) {
predictions = predict(code_iris, selected_algorithm)
console.log(predictions)
}
let divInformation = $("<div>");
divInformation
.prop("id", "divInfos")
.append("CODE IRIS : " + layer.feature.properties.CODE_IRIS).append($("<br>"))
.append("IRIS : " + layer.feature.properties.NOM_IRIS).append($("<br>"))
.append("COMMUNE : " + layer.feature.properties.NOM_COM).append($("<br>"));
let moreInfosLink = $("<a>");
moreInfosLink
.prop("href", "details-iris.html?code_iris="+layer.feature.properties.CODE_IRIS)
.prop("target", "_blank")
.text("Plus de détails")
.append($("<br>"));
divInformation.append(moreInfosLink);
let selectAlgorithm = $("<select>")
selectAlgorithm
.prop("id", "selectAlgorithmTooltip")
.append($("<option>").prop("value", "undefined").text("---"))
for(let algorithm of classifiers) { selectAlgorithm.append($("<option>").prop("value", algorithm).text(algorithm)); }
previously_selected_algorithm = selected_algorithm;
let divPredictions = $("<div>").prop("id", "divPredictions");
if(predictions !== undefined) {
for(let key in predictions) { divPredictions.append(key+': ' + predictions[key]["most_frequent"] + " (" + predictions[key]["count_frequent"] + "/7)").append($('<br>')); }
function showPredictions(e) {
var layer = e.target;
var code_iris = layer.feature.properties.CODE_IRIS
var algorithm = $("#selectAlgorithmTooltip option:selected").text();
console.log(algorithm)
let predictions = predict(code_iris)
let messageTooltip = '<div>CODE IRIS : ' + layer.feature.properties.CODE_IRIS + '<br/>'
messageTooltip += 'IRIS : ' + layer.feature.properties.NOM_IRIS + '<br/>'
messageTooltip += 'COMMUNE : ' + layer.feature.properties.NOM_COM + '<br/><br/>'
messageTooltip += '<select id="selectAlgorithmTooltip">'
messageTooltip += "<option value='undefined'>" + "---" + "</option>"
for(let algorithm of classifiers) {
console.log(algorithm)
messageTooltip += "<option value=" + algorithm + ">" + algorithm + "</option>"
}
let messageTooltip = divInformation[0].outerHTML + selectAlgorithm[0].outerHTML + divPredictions[0].outerHTML;
console.log(messageTooltip)
messageTooltip += "</select>"
for(let key in predictions) { messageTooltip += key+': ' + predictions[key] + '<br/>' }
messageTooltip += '<a href="details-iris.html?code_iris='+layer.feature.properties.CODE_IRIS + '" target="_blank">Plus de détails</a></div>';
layer.bindPopup(messageTooltip)
layer.bringToFront();
layer.openPopup();
$("#selectAlgorithmTooltip").val(previously_selected_algorithm); // must be after binding the popup to be effective
$("#selectAlgorithmTooltip").on("click", function() { displayPopup(e)}) // update popup (env variables) when click on an algorithm
}
$("#selectAlgorithmTooltip").on("change", function() {
alert("changed")
eventsIRIS()
});
/**
* Add IRIS layer from GeoJSON data
* @param {geojson} geojson .
......@@ -139,7 +120,7 @@ function addLayerFromGeoJSON(geojson, events, style, typeMethod){
irisLayer = new L.geoJSON(geojson, {onEachFeature: events});
irisLayer.setStyle(style);
irisLayer.addTo(map);
if(typeMethod !== "searchBounds") // if searchBounds (zoom), fitBounds() will decrease the zoom, thus reloading searchBounds...
if(typeMethod != "searchBounds") // if searchBounds (zoom), fitBounds() will decrease the zoom, thus reloading searchBounds...
map.fitBounds(irisLayer.getBounds()); // zoom on the displayed iris
}
return irisLayer;
......@@ -155,7 +136,7 @@ function eventsIRIS(feature, layer) {
//mouseover: highlightFeature,
//mouseout: resetHighlight,
//click: clickProperties
click: displayPopup //showPredictions
click: showPredictions
});
}
......
function predict(iris_code, algorithm_name) {
function predict(iris_code) {
let predictions = null
$(document.body).css({'cursor' : 'wait'});
$.ajax({
type: "GET",
url: "/predict_iris",
data: {
'iris_code': iris_code,
'algorithm_name': algorithm_name
'iris_code': iris_code
},
"async": false,
contentType: 'application/json;charset=UTF-8',
......@@ -15,11 +12,9 @@ function predict(iris_code, algorithm_name) {
console.log(result)
console.log(result['predictions'])
predictions = result['predictions']
$(document.body).css({'cursor' : 'auto'});
},
error: function(result, textStatus, errorThrown) {
console.log(errorThrown);
$(document.body).css({'cursor' : 'auto'});
}
});
return predictions
......
import logging
# logging.basicConfig(level=logging.DEBUG)
log_format = "[%(levelname)s] \t %(name)s \t %(filename)s::%(lineno)d \t %(message)s"
logging.basicConfig(level='DEBUG', format=log_format)
logging.debug("This is a debug message")
logging.info("This is an informational message")
logging.warning("Careful! Something does not look right")
logging.error("You have encountered an error")
logging.critical("You are in trouble")
def hypotenuse(a, b):
"""Compute the hypotenuse"""
return (a ** 2 + b ** 2) ** 0.5
a = 3
b = 4
logging.debug("Hypotenuse of {a}, {b} is {c}".format(a=a, b=b, c=hypotenuse(a, b)))
# > [DEBUG] - <file>::<line> 	 Hypotenuse of 3, 4 is 5.0
......@@ -306,9 +306,7 @@ def get_most_frequent(lst):
"""Get the most frequent item in a list. If many elements are frequent, it returns the first one.
:param lst: the list to find the most frequent element.
"""
most_frequent_element = max(set(lst), key=lst.count)
dictionary = {"most_frequent": most_frequent_element, "count_frequent": lst.count(most_frequent_element)}
return dictionary
return max(set(lst), key=lst.count)
#################### plot functions ####################
......
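After the revert, get_most_frequent() returns just the winning element; the removed version also reported its count in a dict. A self-contained example of the reverted behaviour:

# max over the set of values, ranked by how often each occurs in the list.
def get_most_frequent(lst):
    """Return the most frequent element; ties go to the first found by max()."""
    return max(set(lst), key=lst.count)

print(get_most_frequent(["maisons", "immeubles", "maisons"]))  # maisons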