Commit af07d919 authored by Nelly Barret

[REVERT] revert to old version

parent 7b948e7d
Showing with 1703 additions and 1403 deletions
......@@ -20,7 +20,7 @@ def generate_charts():
data.generate_from_cities(cities)
lists = get_selected_indicators_lists(10)
for j, env in enumerate(["social"]):
dataset = Dataset(data, env, selected_indicators=lists["10"][env], train_size=0.8, test_size=0.2, _type="unsupervised")
dataset = Dataset(data, env, selected_indicators=lists["10"][env], train_size=0.8, test_size=0.2)
dataset.init_all_in_one()
algo = Chart(name='chart', dataset=dataset, number_of_iris=len(cities))
algo.compute_trendline()
......
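For context, a minimal sketch of the reverted call sequence in generate_charts() above. Data, Dataset, Chart and get_selected_indicators_lists are the project symbols visible in this diff, so this is illustrative rather than standalone:

# Hypothetical driver mirroring the reverted generate_charts();
# all names below are project classes/helpers shown elsewhere in this commit.
def generate_social_chart(cities):
    data = Data()
    data.generate_from_cities(cities)
    lists = get_selected_indicators_lists(10)
    # the revert drops the _type="unsupervised" keyword argument
    dataset = Dataset(data, "social", selected_indicators=lists["10"]["social"],
                      train_size=0.8, test_size=0.2)
    dataset.init_all_in_one()
    chart = Chart(name="chart", dataset=dataset, number_of_iris=len(cities))
    chart.compute_trendline()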
......@@ -80,45 +80,45 @@ def similarity(data1, data2, max_distance, nb_points):
class Chart(Method):
def __init__(self, name, dataset, number_of_iris=16):
def __init__(self, name, dataset, number_of_iris=12):
Method.__init__(self, name, dataset)
self.chart = None
self.dataset = dataset
# set all NaN values as 0
self.dataset.data = self.dataset.data.fillna(0)
self.number_of_iris = number_of_iris if number_of_iris % 2 == 0 else 12
# self.number_of_iris = number_of_iris if number_of_iris % 2 == 0 else 12
self.iris_per_line = 2
self.step = 0
self.max_value = 0
def compute_trendline(self):
# for indicator in self.dataset.selected_indicators:
fig, axs = plt.subplots(int(self.number_of_iris / 2), self.iris_per_line, figsize=(15, 15)) # rows, columns
# fig, axs = plt.subplots(int(self.number_of_iris / 2), self.iris_per_line, figsize=(15, 15)) # rows, columns
i, j, z = 0, 0, 1 # i and j are indices to plot sub-figures and z is the counter to place figures
all_relevant_indicators = []
n = 0
for index, row in self.dataset.data.head(self.number_of_iris).iterrows(): # head(self.number_of_iris).
for index, row in self.dataset.data.iterrows(): # head(self.number_of_iris).
data = []
list_indicators = self.dataset.selected_indicators if self.dataset.selected_indicators is not None else self.dataset.indicators
for indicator in list_indicators:
data.append(row[indicator])
max_value = self.dataset.data.head(self.number_of_iris)[self.dataset.selected_indicators].values.max() # .head(self.number_of_iris)
logging.debug("max value is %.4f", max_value)
x = np.arange(0, len(data))
y = data
# max_value = self.dataset.data[self.dataset.selected_indicators].values.max() # .head(self.number_of_iris)
# logging.debug("max value is %.4f", max_value)
# x = np.arange(0, len(data))
# y = data
mean_of_data = round(sum(data) / len(data), 3)
logging.debug("y = %s", y)
f = interp1d(x, y)
axs[i, j].axis(ymin=0, ymax=max_value)
axs[i, j].set_xticks(np.arange(0, len(data)))
self.max_value = max_value
self.step = max_value / 5
axs[i, j].set_yticks(np.arange(0, max_value, step=self.step))
axs[i, j].plot(x, data, 'o', x, f(x), '-')
for k, v in enumerate(data):
# label = # "{:.1E}".format(v)
axs[i, j].annotate(round(v, 2), (k, v), )
title = str(row['CODE']) + " - " + str(self.dataset.env) + " -" + str(mean_of_data)
# logging.debug("y = %s", y)
# f = interp1d(x, y)
# axs[i, j].axis(ymin=0, ymax=max_value)
# axs[i, j].set_xticks(np.arange(0, len(data)))
# self.max_value = max_value
# self.step = max_value / 5
# axs[i, j].set_yticks(np.arange(0, max_value, step=self.step))
# axs[i, j].plot(x, data, 'o', x, f(x), '-')
# for k, v in enumerate(data):
# # label = # "{:.1E}".format(v)
# axs[i, j].annotate(round(v, 2), (k, v), )
# title = str(row['CODE']) + " - " + str(self.dataset.env) + " -" + str(mean_of_data)
relevant_indicators = []
for ind in range(len(list_indicators)):
indicator = list_indicators[ind]
......@@ -132,15 +132,15 @@ class Chart(Method):
all_relevant_indicators = union(all_relevant_indicators, relevant_indicators)
n += 1
axs[i, j].set_title(title)
if z < self.iris_per_line:
z += 1
j += 1
else:
z = 1
i += 1
j = 0
fig.show()
# axs[i, j].set_title(title)
# if z < self.iris_per_line:
# z += 1
# j += 1
# else:
# z = 1
# i += 1
# j = 0
# fig.show()
print(all_relevant_indicators)
print(len(all_relevant_indicators))
# self.compute_similarity()
......
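The core of compute_trendline() is a per-IRIS linear interpolation over the selected indicator values. A self-contained sketch of that plotting idea, on synthetic data with only matplotlib and scipy:

# Standalone miniature of the trendline plot: raw indicator values as points,
# a piecewise-linear interpolation through them, each point labelled.
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d

data = [0.2, 0.5, 0.1, 0.9, 0.4]   # example indicator values for one IRIS
x = np.arange(len(data))
f = interp1d(x, data)              # linear interpolation, as in the hunk above
fig, ax = plt.subplots()
ax.axis(ymin=0, ymax=max(data))
ax.set_xticks(x)
ax.plot(x, data, 'o', x, f(x), '-')
for k, v in enumerate(data):
    ax.annotate(round(v, 2), (k, v))  # label each point with its value
plt.show()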
......@@ -10,7 +10,7 @@ import warnings
from area import area
from predihood import model
from predihood.cleaning import clean
from predihood.cleansing import clean
from predihood.config import FILE_CLEANED_DATA, FOLDER_DATASETS, ENVIRONMENT_VARIABLES, FILE_GROUPING
from predihood.utility_functions import address_to_code, append_indicator, append_target
......@@ -348,13 +348,13 @@ if __name__ == '__main__':
data = Data()
cities = {
# "tassin": ["maisons", "résidentiel", "espaces verts", "périurbain", "ouest lyon", "moyen-sup"],
# "107 rue jean voillot villeurbanne": ["grands ensembles", "autres activités", "urbanisé", "urbain", "est lyon", "popu"],
"tassin": ["maisons", "résidentiel", "espaces verts", "périurbain", "ouest lyon", "moyen-sup"],
"107 rue jean voillot villeurbanne": ["grands ensembles", "autres activités", "urbanisé", "urbain", "est lyon", "popu"],
"saint cyr au mont d'or": ["maisons", "résidentiel", "arboré", "périurbain", "ouest lyon", "sup"],
# "doua villeurbanne": ["immeubles", "autres activités", "urabnisé", "central", "est-lyon", "moyen-inf"],
"doua villeurbanne": ["immeubles", "autres activités", "urabnisé", "central", "est-lyon", "moyen-inf"],
"part dieu lyon": ["immeubles", "commerçant", "urbanisé", "centrtal", "centre lyon", "moyen"],
# "dompierre sur besbre": ["maisons", "résidentiel", "arboré", "rural", "est moulins", "sup"],
# "rue de la favorite lyon": ["mixte", "commercant", "urbanisé", "urbain", "nord-ouest lyon", "moyen"],
# "lezoux": ["maisons", "résidentiel", "arboré", "rural", "est clermont-ferrand", "moyen-sup"]
"dompierre sur besbre": ["maisons", "résidentiel", "arboré", "rural", "est moulins", "sup"],
"rue de la favorite lyon": ["mixte", "commercant", "urbanisé", "urbain", "nord-ouest lyon", "moyen"],
"lezoux": ["maisons", "résidentiel", "arboré", "rural", "est clermont-ferrand", "moyen-sup"]
}
data.generate_from_cities(cities)
import logging
import os
import warnings
log_format = "[%(levelname)s] - %(filename)s::%(lineno)d \t %(message)s"
logging.basicConfig(level='DEBUG', format=log_format)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from predihood.classes.Method import Method
from predihood.config import ENVIRONMENT_VARIABLES, FILE_CLEANED_DATA, FOLDER_DISTRIBUTION
from predihood.utility_functions import sim, auto_label
warnings.simplefilter(action='ignore', category=FutureWarning)
class MethodCleansing(Method):
def __init__(self, name, dataset):
"""
Constructor of the MethodCleansing class. Initialize attributes.
"""
Method.__init__(self, name, dataset)
self.values_by_env = {} # dict to store values for each environment variable, e.g. [Maisons, Maison]
self.columns_dep = [] # departure columns, e.g. Abatiment, Ausage,...
self.columns_arr = [] # arrival columns, e.g. Nbatiment, Nusage, ...
self.columns = {
"occup": {
"name_dep": "Aoccup",
"data_dep": self.dataset.Aoccup,
"name_arr": "Noccup",
"data_arr": self.dataset.Noccup
}
}
for env in ENVIRONMENT_VARIABLES:
temp = {
"name_dep": "A" + env,
"name_arr": "N" + env,
"data_dep": self.dataset["A" + str(env)],
"data_arr": self.dataset["N" + str(env)]
}
self.columns[env] = temp
for env in self.columns: self.columns_dep.append(self.columns[env]["data_dep"])
for env in self.columns: self.columns_arr.append(self.columns[env]["data_arr"])
# define outliers (to be removed)
self.outliers = ['Oui', 'Moyenne-sup', 'Location']
# plot variables
self.before = {} # departure values for each EV
self.after = {} # arrival values for each EV
self.labels = {} # labels for EV, e.g. maisons, immeubles, grand ensemble...
def clean(self):
"""
Clean data from bad naming conventions.
"""
logging.info(
"The data needs to be cleaned. For each list, write the correct word. For each environment variable, you will get its number of corrections and its error rate.")
# 1. getting wrong values in a dict ordered by env variable
self.values_by_env = {}
for col_dep, col_arr in zip(self.columns_dep, self.columns_arr):
col_name = col_dep.name[1:]
self.values_by_env[col_name] = []
for val in col_dep.unique(): # get possible values for the current column
index = sim(val, self.values_by_env[col_name])
# if the value is similar to an existing group, add it; otherwise start a new group with it
if index >= 0:
self.values_by_env[col_name][index].append(val)
elif index == -1:
self.values_by_env[col_name].append([val])
for val in col_arr.unique():
index = sim(val, self.values_by_env[col_name])
if index >= 0:
self.values_by_env[col_name][index].append(val)
elif index == -1:
self.values_by_env[col_name].append([val])
# 2. renaming these wrong values in data
for key, value in self.values_by_env.items():
col_nameA = "A" + key
col_nameN = "N" + key
nb_replacement_dep = 0
nb_replacement_arr = 0
for i in range(len(value)):
if len(value[i]) > 1:
arr_without_duplicates = list(dict.fromkeys(value[i]))
chosen_label = input(str(arr_without_duplicates) + ": ")
for label in value[i]:
if label != chosen_label: # if label == chosen_label: skip it because no replacement is needed
nb_replacement_dep += pd.Series(self.dataset[col_nameA] == label).sum()
nb_replacement_arr += pd.Series(self.dataset[col_nameN] == label).sum()
self.dataset.loc[self.dataset[col_nameA] == label, col_nameA] = chosen_label
self.dataset.loc[self.dataset[col_nameN] == label, col_nameN] = chosen_label
size = int(self.dataset.count()["A" + key]) + int(self.dataset.count()["N" + key])
mean_error = ((nb_replacement_dep + nb_replacement_arr) / size) * 100
logging.debug(
"%d IRIS have been corrected for the environment variable %s, corresponding to an error rate of %.0f %%",
(nb_replacement_dep + nb_replacement_arr), key, mean_error)
# 3. removing outliers from data
count = 0
for outlier in self.outliers:
self.dataset.drop(self.dataset[self.dataset.eq(outlier).any(axis=1)].index, inplace=True)
count += 1
logging.debug("%d outliers removed", count)
# 4. save data
self.dataset.to_csv(FILE_CLEANED_DATA, index=False, encoding='utf-8')
logging.info("Cleaned data is in %s", FILE_CLEANED_DATA)
def create_before_after_labels(self, name_dep, name_arr):
"""
Creates the arrays 'before', 'after' and 'labels' from data.
:param name_dep: the name of the departure column, e.g. Aoccup, Abatiment, Ausage...
:param name_arr: the name of the arrival column, e.g. Noccup, Nbatiment, Nusage...
"""
all_repartition = {}
self.before = {}
self.after = {}
for status, value in self.dataset[name_dep].value_counts().items():
if name_dep == "Ageo": # if geo, get only the geo position (South, East, ..) and not the city
status = status.split(" ")[0]
if status in self.before:
self.before[status] += value
else:
self.before[status] = value
else:
self.before[status] = value # self.dataset[values_before].value_counts()[status]
for status, value in self.dataset[name_arr].value_counts().items():
if name_arr == "Ngeo": # if geo, get only the geo position (South, East, ..) and not the city
status = status.split(" ")[0]
if status in self.after:
self.after[status] += value
else:
self.after[status] = value
else:
self.after[status] = value # self.dataset[values_after].value_counts()[status]
# 2. merge before and after data in the same dict
for status in self.before:
all_repartition[status] = [self.before[status], 0]
for status in self.after:
if status not in all_repartition:
all_repartition[status] = [0, self.after[status]]
else:
all_repartition[status][1] = self.after[status]
# 3. convert dict in 3 arrays
self.before = []
self.after = []
self.labels = []
for key in all_repartition:
if not isinstance(key, float): # to remove nan values
self.before.append(all_repartition[key][0])
self.after.append(all_repartition[key][1])
self.labels.append(key)
def create_bar_chart(self, name, title):
"""
Plot before/after charts.
:param name: the name of the target to plot, i.e. environment variable, e.g. usage, batiment, ...
:param title: the title of the plot.
"""
x = np.arange(len(self.labels)) # the label locations
width = 0.35
fig, ax = plt.subplots()
ax.bar(x - width / 2, [154 for _ in range(len(self.labels))], width=width, color="#DCDCDC") # grey bar
bef = ax.bar(x - width / 2, self.before, width=width, label='Avant') # before data
ax.bar(x + width / 2, [154 for _ in range(len(self.labels))], width=width, color="#DCDCDC") # grey bar
aft = ax.bar(x + width / 2, self.after, width=width, label='Après') # after data
ax.set_ylabel('Nombre de personnes')
plt.xticks(x, self.labels, rotation='vertical')
auto_label(bef, ax)
auto_label(aft, ax)
plt.tight_layout()
ax.legend()
filename = os.path.join(FOLDER_DISTRIBUTION, "distribution_" + name + ".png")
fig.savefig(filename)
ax.set_title(title)
plt.show()
def to_chart(self, env, name, title):
"""
Create before/after data and plot it.
:param env: the target to plot, i.e. the environment variable, e.g. usage, paysage...
:param name: the name to save the file.
:param title: the title of the plot.
"""
self.create_before_after_labels(self.columns[env]["name_dep"], self.columns[env]["name_arr"])
self.create_bar_chart(name, title)
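clean() leans on the project helper sim() to group near-duplicate labels before asking the user for the canonical spelling. sim()'s implementation is not part of this diff, so here is a plausible stand-in built on difflib; the 0.8 similarity threshold is an assumption:

# Hypothetical sim(): returns the index of the group `val` resembles, or -1.
from difflib import SequenceMatcher

def sim(val, groups, threshold=0.8):
    for i, group in enumerate(groups):
        for g in group:
            if SequenceMatcher(None, str(val).lower(), str(g).lower()).ratio() >= threshold:
                return i
    return -1

values_by_env = []
for val in ["Maison", "Maisons", "maison", "Immeubles"]:
    index = sim(val, values_by_env)
    if index >= 0:
        values_by_env[index].append(val)   # near-duplicate of an existing group
    elif index == -1:
        values_by_env.append([val])        # new group
print(values_by_env)  # [['Maison', 'Maisons', 'maison'], ['Immeubles']]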
......@@ -36,7 +36,7 @@ class MethodPrediction(Method):
def predict(self, iris_code=None):
"""
Predict environment variables for the given iris. The environment variable to predict is stored in the dataset as "env" variable
Predict environment variables for the given iris.
"""
iris_object = model.get_iris_from_code(iris_code)
iris_area = area(model.get_coords_from_code(iris_code)) / 1000000
......@@ -69,7 +69,6 @@ class MethodPrediction(Method):
df = pd.DataFrame([iris_indicators_values], columns=iris_indicators_names)
self.prediction = self.classifier.predict(df)[0]
print(self.prediction)
def plot(self):
max_depths = np.linspace(1, 32, 32, endpoint=True)
......
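MethodPrediction.predict() wraps one IRIS's indicator values in a single-row DataFrame before calling the fitted classifier. A minimal, self-contained illustration with toy data; the indicator names are taken from the lists later in this diff and the labels from the cities dict above:

# One-row prediction: same column names as the training frame.
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

X_train = pd.DataFrame({"DENSITY": [100, 5000, 200], "AREA": [12.0, 1.5, 9.0]})
y_train = ["rural", "urbain", "rural"]
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

iris_row = pd.DataFrame([[4500, 2.0]], columns=["DENSITY", "AREA"])  # one IRIS
print(clf.predict(iris_row)[0])  # urbain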
......@@ -62,11 +62,10 @@ class MethodSelection(Method):
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
self.best_indicators = []
print(upper)
for i in range(len(upper.columns)):
column = upper.columns[i]
for k, value in upper[column].items():
if value == 1 and column not in self.best_indicators: # and (column, k) not in self.best_indicators and (k, column) not in self.best_indicators:
if value == 1 and column not in self.best_indicators:
self.best_indicators.append(column)
if TITLES: plt.title("Matrice de corrélation : filtrage = " + (
......
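The MethodSelection hunk keeps the upper triangle of the correlation matrix and flags one column out of every perfectly correlated pair. A runnable miniature of that filter; it uses np.isclose where the hunk tests value == 1 exactly, which is fragile with floats:

# Column b is an exact multiple of a, so it is flagged; c is kept.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6], "c": [5, 1, 4]})
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
best_indicators = []
for column in upper.columns:
    for k, value in upper[column].items():
        if np.isclose(value, 1) and column not in best_indicators:
            best_indicators.append(column)
print(best_indicators)  # ['b']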
This diff is collapsed.
This diff is collapsed.
import logging
log_format = "[%(levelname)s] - %(filename)s::%(lineno)d \t %(message)s"
logging.basicConfig(level='DEBUG', format=log_format)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from predihood.classes.MethodCleansing import MethodCleansing
from predihood.config import FILE_DATA_HIL, FOLDER_FEATURE_SELECTION, TITLES
def clean():
data = pd.read_excel(FILE_DATA_HIL)
# rename columns because of bad naming convention
data.columns = ["id", "nom", "num_HiL", "sexe", "age", "nb_enfant",
"rev_mensuel", "loyer_charges_mensuel", "rev_fiscal",
"Aadresse", "Apays", "Aoccup", "Aloyer",
"Nadresse", "Npays", "Noccup",
"motif",
"Acontexte", "Atype", "Abatiment", "Ausage", "Apaysage", "Amorpho", "Ageo", "Asocial",
"Ncontexte", "Nbatiment", "Nusage", "Npaysage", "Nmorpho", "Ngeo", "Nsocial"]
data.head()
cleansing = MethodCleansing("cleansing", data)
cleansing.clean()
logging.info("Many plots have be generated in " + FOLDER_FEATURE_SELECTION)
# 1. distribution between women and men
women_men = data.iloc[:, 3].value_counts()  # male/female split
labels = 'Femme', 'Homme'
sizes = [women_men['Femme'], women_men['Homme']]
colors = ['salmon', 'lightblue']
plt.pie(sizes, labels=labels, colors=colors, autopct='%i%%', shadow=True, startangle=90)
plt.axis('equal')
if TITLES: plt.title("Distribution selon le genre")
plt.show()
# 2. distribution between ages
data_temp = data
data_temp = data_temp.dropna()
ages_plot = []
total_plot = []
min_age = int(min(data_temp.iloc[:, 4]))
max_age = int(max(data_temp.iloc[:, 4]))
for counter in range(min_age, max_age + 1):
total_plot.append(data_temp.loc[data_temp.age == float(counter), 'age'].count())
ages_plot.append(counter)
mean = np.average(ages_plot, weights=total_plot)
logging.debug("mean age: %.1f", mean)
plt.bar(ages_plot, total_plot)
plt.axvline(x=mean, color='red')  # mean line
plt.xlabel("Âge (en années)")
plt.ylabel("Nombre de personnes")
if TITLES: plt.title('Distribution selon l\'âge')
plt.show()
ages = data_temp.iloc[:, 4]
plt.hist(ages, facecolor='gray', align='mid')
plt.xlabel("Âge (en années)")
plt.ylabel("Nombre de personnes")
if TITLES: plt.title("Distribution selon l'âge")
plt.show()
# 3. distribution between incomes
incomes = data.iloc[:, 6]
plt.hist(incomes, facecolor='gray', align='mid')
if TITLES: plt.title("Distribution selon le revenu le mensuel")
plt.xlabel("Revenu mensuel (en euros)")
plt.ylabel("Nombre de personnes")
plt.show()
# 4. distribution between reasons of transfer
transfers = data.iloc[:, 16].value_counts()
labels = transfers.index.tolist()
sizes = [transfers[i] for i in range(len(transfers))]
plt.pie(sizes, labels=labels, autopct='%.2f', shadow=True, startangle=90)
plt.axis('equal')
if TITLES: plt.title("Distribution selon le motif de mutation")
plt.show()
# 5. distribution between geographic positions
geo = pd.concat([data.iloc[:, 23], data.iloc[:, 30]], ignore_index=True) # .values_count()
split_geo = [geo[i].split()[0] if not isinstance(geo[i], float) else "" for i in range(len(geo))]
set_geo = set(split_geo)
uniques = [split_geo.count(elem) for elem in set_geo]
labels = set_geo
plt.pie(uniques, labels=labels, autopct='%.2f', shadow=True, startangle=90)
if TITLES: plt.title("Distribution selon la position géographique")
plt.show()
# 6. evolution between before and after transfer for each environment variable
cleansing.to_chart('occup', 'status', 'Évolution des statuts avant et après la mutation')
cleansing.to_chart('batiment', 'building_type', 'Évolution des types de bâtiments avant et après la mutation')
cleansing.to_chart('usage', 'building_usage', 'Évolution des usages de bâtiments avant et après la mutation')
cleansing.to_chart('paysage', 'landscapes', 'Évolution des paysages avant et après la mutation')
cleansing.to_chart('morpho', 'morpho', 'Évolution des morphologies avant et après la mutation')
cleansing.to_chart('social', 'social', 'Évolution des classes sociales avant et après la mutation')
cleansing.to_chart('geo', 'geo', 'Évolution des positions géographiques avant et après la mutation')
if __name__ == '__main__':
clean()
,nelly,MacBook-Pro.local,11.06.2020 17:30,file:///Users/nelly/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
,nelly,MacBook-Pro.local,29.05.2020 15:26,file:///Users/nelly/Library/Application%20Support/LibreOffice/4;
\ No newline at end of file
P14_RP,DENSITY,C14_MEN,C14_ACTOCC1564,P14_MEN,P14_LOG,DEC_Q314,DEC_D214,P14_RP_ACHTOT,P14_NBPI_RP_ANEM0509,P14_ANEM_RP_PROP,P14_POP1564,P14_RP_SDB,DEC_D114,P14_NPER_RP_PROP,DEC_D314,C14_PMEN_MENFAM,P14_POP30P,P14_NBPI_RPMAISON,P14_NBPI_RP_ANEM0204,C14_ACT1564,AREA,P14_POP,DEC_D414,P14_NBPI_RPAPPART,P14_POP15P,P14_ACTOCC1564,P14_PMEN,P14_ANEM_RP_LOCHLMV,P14_RPAPPART_ACHTOT,P14_NSCOL15P,DEC_D714,P14_RPAPPART,DEC_D614,DEC_D814,DEC_MED14,P14_NBPI_RP_ANEM10P,P14_PMEN_ANEM10P,P14_NBPI_RP,P14_ANEM_RP,DEC_Q114,P14_ACTOCC15P,C14_PMEN_MENCOUPAENF,P14_NPER_RP,P14_POPMEN15P,P14_MAISON,P14_ACT1564,P14_ANEM_RP_LOC,C14_POP15P,P14_RSECOCC,P14_APPART,P14_POP_FR,DEC_D914,P14_POP2064,C14_ACTOCC15P,C14_PMEN
P14_LOG,DENSITY,P14_RP_SDB,P14_POP15P,DEC_D914,P14_ACTOCC15P,C14_ACTOCC1564,DEC_MED14,P14_MAISON,DEC_D814,DEC_D614,P14_POP30P,P14_POP_FR,P14_NBPI_RPMAISON,P14_NBPI_RP_ANEM10P,DEC_D414,DEC_D114,P14_ANEM_RP_LOCHLMV,P14_PMEN_ANEM10P,C14_PMEN_MENCOUPAENF,P14_RPAPPART_ACHTOT,P14_APPART,P14_NBPI_RP,P14_RPAPPART,C14_MEN,P14_ANEM_RP_PROP,DEC_Q314,DEC_D714,P14_PMEN,P14_POP2064,P14_NPER_RP_PROP,P14_MEN,P14_POP,C14_ACTOCC15P,P14_NPER_RP,DEC_D314,P14_POPMEN15P,P14_RP,P14_NSCOL15P,C14_PMEN_MENFAM,P14_POP1564,P14_NBPI_RP_ANEM0509,AREA,C14_POP15P,P14_ACTOCC1564,P14_ANEM_RP_LOC,C14_ACT1564,C14_PMEN,P14_NBPI_RPAPPART,P14_RP_ACHTOT,P14_ANEM_RP,DEC_Q114,P14_ACT1564,DEC_D214,P14_RSECOCC,P14_NBPI_RP_ANEM0204
......@@ -96,12 +96,10 @@ def run_algorithm():
@app.route('/predict_iris', methods=["GET"])
def predict_iris():
iris_code_to_predict = request.args['iris_code']
clf_name = request.args['algorithm_name']
clf = get_classifier(clf_name)
data = Data(normalize="density", filter=True)
data.init_all_in_one()
predictions = predict_one_iris(iris_code_to_predict, data, clf, 0.8, 0.2, False) # clf
predictions = predict_one_iris(iris_code_to_predict, data, KNeighborsClassifier(n_neighbors=30), 0.8, 0.2, False)
return {"predictions": predictions}
......
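After this revert, /predict_iris ignores any classifier choice and always uses KNeighborsClassifier(n_neighbors=30); the algorithm_name parameter is gone. A hedged client-side sketch, assuming the Flask app is served locally on port 5000 (the IRIS code is the one used in __main__ below):

# Calling the reverted endpoint; only iris_code is sent now.
import requests

resp = requests.get("http://localhost:5000/predict_iris",
                    params={"iris_code": "692440102"})
print(resp.json()["predictions"])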
......@@ -347,11 +347,11 @@ def expe4(data, clf, train_size, test_size, remove_outliers=False):
results[env] = OrderedDict()
logging.debug("--- %s ---", env)
dataset = Dataset(data_not_filtered, env, selected_indicators=data_not_filtered.indicators, train_size=train_size, test_size=test_size, outliers=remove_outliers, _type='supervised')
dataset = Dataset(data_not_filtered, env, selected_indicators=data_not_filtered.indicators, train_size=train_size, test_size=test_size, outliers=remove_outliers)
dataset.init_all_in_one()
mean_classifier = 0.0
algo = MethodPrediction(name="", dataset=dataset, classifier=clf)
algo = MethodPrediction(name="", dataset=dataset, classifier=clf, _type='supervised')
algo.fit()
algo.compute_performance()
results[env]["accuracy_none"] = algo.accuracy
......@@ -359,9 +359,9 @@ def expe4(data, clf, train_size, test_size, remove_outliers=False):
logging.debug("accuracy for %s without filtering: %f", env, algo.accuracy)
predictions[env] = []
for top_k, lst in lists.items():
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers, _type='supervised')
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers)
dataset.init_all_in_one()
algo2 = MethodPrediction(name='', dataset=dataset, classifier=clf)
algo2 = MethodPrediction(name='', dataset=dataset, classifier=clf, _type='supervised')
# logging.debug("size of X_train: %d", len(algo2.dataset.X_train.columns))
# logging.debug(algo2.dataset.X_train.columns)
algo2.fit()
......@@ -426,20 +426,19 @@ def predict_one_iris(iris_code, data, clf, train_size, test_size, remove_outlier
for j, env in enumerate(ENVIRONMENT_VARIABLES):
predictions_lst = []
for top_k, lst in lists.items():
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers, _type='supervised')
dataset = Dataset(data, env, selected_indicators=lst[env], train_size=train_size, test_size=test_size, outliers=remove_outliers)
dataset.init_all_in_one()
algo = MethodPrediction(name='', dataset=dataset, classifier=clf)
algo = MethodPrediction(name='', dataset=dataset, classifier=clf, _type='supervised')
# logging.debug("size of X_train: %d", len(algo2.dataset.X_train.columns))
# logging.debug(algo2.dataset.X_train.columns)
algo.fit()
algo.predict(iris_code)
predictions_lst.append(algo.prediction)
predictions[env] = get_most_frequent(predictions_lst) # get the most frequent value, i.e. choose among the result of each list
print(predictions)
return predictions
def predict_k_means(data, iris_code):
def predict_k_means(data):
nb_clusters = {
"batiment": 5,
"usage": 3,
......@@ -448,32 +447,30 @@ def predict_k_means(data, iris_code):
"geo": 9,
"social": 5
}
lists = get_selected_indicators_lists()
for j, env in enumerate(ENVIRONMENT_VARIABLES):
for top_k, lst in lists.items():
if top_k == "10":
dataset = Dataset(data, env, selected_indicators=lst[env], _type='unsupervised')
dataset.init_all_in_one()
# cost = []
# for i in range(1, 11):
# kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=i, random_state=RANDOM_STATE))
# kmeans.fit()
# cost.append(kmeans.classifier.inertia_)
# plt.plot(range(1, 11), cost, color='g', linewidth='3')
# plt.xlabel("Value of K")
# plt.ylabel("Squared Error (Cost)")
# plt.show()
# print("top-k:", top_k, "--", env)
# dataset = Dataset(data, env, _type='unsupervised') # selected_indicators=lst[env],
# dataset.init_all_in_one()
# print("K-means with", nb_clusters[env], "clusters")
kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=nb_clusters[env], random_state=RANDOM_STATE))
kmeans.fit()
kmeans.predict(iris_code)
print(kmeans.classifier.labels_)
chart = Chart(dataset=dataset, name='')
chart.compute_trendline()
# for top_k, lst in lists.items():
# if top_k == "10":
# dataset = Dataset(data, env, selected_indicators=lst[env], _type='unsupervised')
# dataset.init_all_in_one()
# cost = []
# for i in range(1, 11):
# kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=i, random_state=RANDOM_STATE))
# kmeans.fit()
# cost.append(kmeans.classifier.inertia_)
# plt.plot(range(1, 11), cost, color='g', linewidth='3')
# plt.xlabel("Value of K")
# plt.ylabel("Squared Error (Cost)")
# plt.show()
# print("top-k:", top_k, "--", env)
dataset = Dataset(data, env, _type='unsupervised') # selected_indicators=lst[env],
dataset.init_all_in_one()
# print("K-means with", nb_clusters[env], "clusters")
# kmeans = MethodPrediction(name='', dataset=dataset, classifier=KMeans(n_clusters=nb_clusters[env], random_state=RANDOM_STATE))
# kmeans.fit()
# print(kmeans.classifier.labels_)
chart = Chart(dataset=dataset, name='')
chart.compute_trendline()
if __name__ == '__main__':
......@@ -497,7 +494,7 @@ if __name__ == '__main__':
#
data = Data(normalize="density", filter=True)
data.init_all_in_one()
predict_k_means(data, "692440102")
# predict_k_means(data)
#
# data = Data(normalize="pop", filter=True)
# data.init_all_in_one()
......@@ -507,4 +504,4 @@ if __name__ == '__main__':
# expe2(data)
# expe3(data)
# expe4(data, RandomForestClassifier(), 0.8, 0.2)
# expe5(data, RandomForestClassifier(), 0.8, 0.2)
expe5(data, RandomForestClassifier(), 0.8, 0.2)
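The block commented out inside predict_k_means() above is the classic elbow method for choosing k: fit K-means for k = 1..10 and plot the inertia. A self-contained version on synthetic data, assuming scikit-learn is available:

# Elbow method on random data standing in for the IRIS indicators.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
cost = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=0, n_init=10).fit(X)
    cost.append(kmeans.inertia_)   # within-cluster squared error
plt.plot(range(1, 11), cost, color='g', linewidth=3)
plt.xlabel("Value of K")
plt.ylabel("Squared Error (Cost)")
plt.show()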
......@@ -74,13 +74,13 @@ def generate_lists():
data.init_all_in_one()
# # 2. Run heat map and get less correlated indicators
dataset = Dataset(data, "batiment", 'unsupervised') # WARNING: fill _type parameter
dataset = Dataset(data, "batiment") # WARNING: fill _type parameter
dataset.init_all_in_one()
heat_map = MethodSelection(name="heat map EV-agnostic", dataset=dataset, parameters=PARAMETERS)
heat_map.results()
# heat_map.draw_and_save()
fully_correlated_indicators = heat_map.best_indicators
logging.info("fully correlated indicators: %d %s", len(fully_correlated_indicators), fully_correlated_indicators)
logging.debug("fully correlated indicators: %s", fully_correlated_indicators)
hierarchy = pd.read_csv(FILE_HIERARCHY)
......@@ -96,7 +96,7 @@ def generate_lists():
# logging.debug("%s { threshold_HM: %f, min_col_HM: %f, top_k: %d}", env, PARAMETERS["threshold_HM"], PARAMETERS["min_col_HM"], PARAMETERS["top_k"])
# B. FEATURE IMPORTANCE on uncorrelated indicators (the ones that are not chosen by heat map) to select the most relevant ones
dataset = Dataset(data, env, indicators_to_remove=fully_correlated_indicators, _type="supervised") # WARNING: fill _type parameter
dataset = Dataset(data, env, indicators_to_remove=fully_correlated_indicators) # WARNING: fill _type parameter
dataset.init_all_in_one()
# a. get best indicators for ET
......@@ -117,8 +117,9 @@ def generate_lists():
primary_FI = best_indicators_FI_ET # [indicator[0] for indicator in best_indicators_FI_ET]
indic_ET = [best_indicators_FI_ET[i][0] for i in range(len(best_indicators_FI_ET))]
for i in range(len(best_indicators_FI_RF)):
# [['indic1', score1], ['indic2', score2], ...]
index_indicator_in_ET = indic_ET.index(best_indicators_FI_RF[i][0]) if best_indicators_FI_RF[i][0] in indic_ET else -1
# [['indic1', score1], ['indic2', score2], ...]
index_indicator_in_ET = indic_ET.index(best_indicators_FI_RF[i][0]) if best_indicators_FI_RF[i][
0] in indic_ET else -1
if index_indicator_in_ET >= 0:
primary_FI[index_indicator_in_ET][1] += best_indicators_FI_RF[i][1]
else:
......@@ -142,6 +143,7 @@ def generate_lists():
def selection_by_distribution(dataset):
# for indicator in self.dataset.selected_indicators:
# fig, axs = plt.subplots(int(self.number_of_iris / 2), self.iris_per_line, figsize=(15, 15)) # rows, columns
i, j, z = 0, 0, 1 # i and j are indices to plot sub-figures and z is the counter to place figures
all_relevant_indicators = []
n = 0
for index, row in dataset.data.iterrows(): # head(self.number_of_iris).
......@@ -173,9 +175,7 @@ def selection_by_distribution(dataset):
if __name__ == '__main__':
# generate_all_data()
# generate_lists()
data = Data(normalize="density", filter=True)
data.init_all_in_one()
dataset = Dataset(data, "batiment", "unsupervised")
......
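The merging step in generate_lists() sums the per-indicator scores produced by Extra Trees and Random Forest, and appends indicators that only one model ranked. A miniature with made-up scores; the indicator names come from the lists earlier in this diff:

# Merge two [name, score] rankings: shared names are summed, others appended.
best_indicators_FI_ET = [["P14_LOG", 0.12], ["DENSITY", 0.10]]   # Extra Trees
best_indicators_FI_RF = [["DENSITY", 0.09], ["AREA", 0.07]]      # Random Forest

primary_FI = [list(pair) for pair in best_indicators_FI_ET]
indic_ET = [pair[0] for pair in primary_FI]
for name, score in best_indicators_FI_RF:
    index_in_ET = indic_ET.index(name) if name in indic_ET else -1
    if index_in_ET >= 0:
        primary_FI[index_in_ET][1] += score   # DENSITY's two scores are summed
    else:
        primary_FI.append([name, score])      # AREA was ranked by RF only
print(primary_FI)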
......@@ -14,7 +14,6 @@ let baseLayers = null; // array of basic layers
let overlayLayers = null; // array of overlaying layers
let osmLayer = null; // openstreetmap basic layer
let irisLayer = null; // layer of displayed IRIS
let previously_selected_algorithm = null; // store the selected algorithm in the popup
/**
......@@ -43,14 +42,12 @@ function initialize() {
** Event for zoom changes: updates a label and, if zoom is enabled and above the minimum zoom level, displays the IRIS
*/
function zoomendEvent() {
let zoomLevel = map.getZoom();
zoomLevel = map.getZoom();
document.getElementById("spanZoomLevel").innerHTML = zoomLevel;
let isZoomDisabled = $("#inputZoomLevel").prop("disabled");
let iris;
let bounds;
if (!isZoomDisabled) { // display after zoomend is enabled
isZoomDisabled = $("#inputZoomLevel").prop("disabled");
if(!isZoomDisabled) { // display after zoomend is enabled
let minZoomLevel = $("#inputZoomLevel").val();
if (zoomLevel >= minZoomLevel) { // display iris on the selected zone
if(zoomLevel >= minZoomLevel) { // display iris on the selected zone
bounds = map.getBounds();
iris = getIrisForBounds(bounds);
}
......@@ -77,52 +74,36 @@ function resetHighlightAll() {
}
}
function displayPopup(e) {
let layer = e.target;
let code_iris = layer.feature.properties.CODE_IRIS;
let selected_algorithm = $("#selectAlgorithmTooltip option:selected").val();
let predictions = undefined;
if (selected_algorithm !== "undefined" && selected_algorithm !== undefined) {
predictions = predict(code_iris, selected_algorithm)
console.log(predictions)
}
let divInformation = $("<div>");
divInformation
.prop("id", "divInfos")
.append("CODE IRIS : " + layer.feature.properties.CODE_IRIS).append($("<br>"))
.append("IRIS : " + layer.feature.properties.NOM_IRIS).append($("<br>"))
.append("COMMUNE : " + layer.feature.properties.NOM_COM).append($("<br>"));
let moreInfosLink = $("<a>");
moreInfosLink
.prop("href", "details-iris.html?code_iris="+layer.feature.properties.CODE_IRIS)
.prop("target", "_blank")
.text("Plus de détails")
.append($("<br>"));
divInformation.append(moreInfosLink);
let selectAlgorithm = $("<select>")
selectAlgorithm
.prop("id", "selectAlgorithmTooltip")
.append($("<option>").prop("value", "undefined").text("---"))
for(let algorithm of classifiers) { selectAlgorithm.append($("<option>").prop("value", algorithm).text(algorithm)); }
previously_selected_algorithm = selected_algorithm;
let divPredictions = $("<div>").prop("id", "divPredictions");
if(predictions !== undefined) {
for(let key in predictions) { divPredictions.append(key+': ' + predictions[key]["most_frequent"] + " (" + predictions[key]["count_frequent"] + "/7)").append($('<br>')); }
function showPredictions(e) {
var layer = e.target;
var code_iris = layer.feature.properties.CODE_IRIS
var algorithm = $("#selectAlgorithmTooltip option:selected").text();
console.log(algorithm)
let predictions = predict(code_iris)
let messageTooltip = '<div>CODE IRIS : ' + layer.feature.properties.CODE_IRIS + '<br/>'
messageTooltip += 'IRIS : ' + layer.feature.properties.NOM_IRIS + '<br/>'
messageTooltip += 'COMMUNE : ' + layer.feature.properties.NOM_COM + '<br/><br/>'
messageTooltip += '<select id="selectAlgorithmTooltip">'
messageTooltip += "<option value='undefined'>" + "---" + "</option>"
for(let algorithm of classifiers) {
console.log(algorithm)
messageTooltip += "<option value=" + algorithm + ">" + algorithm + "</option>"
}
let messageTooltip = divInformation[0].outerHTML + selectAlgorithm[0].outerHTML + divPredictions[0].outerHTML;
console.log(messageTooltip)
messageTooltip += "</select>"
for(let key in predictions) { messageTooltip += key+': ' + predictions[key] + '<br/>' }
messageTooltip += '<a href="details-iris.html?code_iris='+layer.feature.properties.CODE_IRIS + '" target="_blank">Plus de détails</a></div>';
layer.bindPopup(messageTooltip)
layer.bringToFront();
layer.openPopup();
$("#selectAlgorithmTooltip").val(previously_selected_algorithm); // must be after binding the popup to be effective
$("#selectAlgorithmTooltip").on("click", function() { displayPopup(e)}) // update popup (env variables) when click on an algorithm
}
$("#selectAlgorithmTooltip").on("change", function() {
alert("changed")
eventsIRIS()
});
/**
* Add IRIS layer from GeoJSON data
* @param {geojson} geojson .
......@@ -139,7 +120,7 @@ function addLayerFromGeoJSON(geojson, events, style, typeMethod){
irisLayer = new L.geoJSON(geojson, {onEachFeature: events});
irisLayer.setStyle(style);
irisLayer.addTo(map);
if(typeMethod !== "searchBounds") // if searchBounds (zoom), fitBounds() will decrease the zoom, thus reloading searchBounds...
if(typeMethod != "searchBounds") // if searchBounds (zoom), fitBounds() will decrease the zoom, thus reloading searchBounds...
map.fitBounds(irisLayer.getBounds()); // zoom on the displayed iris
}
return irisLayer;
......@@ -155,7 +136,7 @@ function eventsIRIS(feature, layer) {
//mouseover: highlightFeature,
//mouseout: resetHighlight,
//click: clickProperties
click: displayPopup //showPredictions
click: showPredictions
});
}
......
function predict(iris_code, algorithm_name) {
function predict(iris_code) {
let predictions = null
$(document.body).css({'cursor' : 'wait'});
$.ajax({
type: "GET",
url: "/predict_iris",
data: {
'iris_code': iris_code,
'algorithm_name': algorithm_name
'iris_code': iris_code
},
"async": false,
contentType: 'application/json;charset=UTF-8',
......@@ -15,11 +12,9 @@ function predict(iris_code, algorithm_name) {
console.log(result)
console.log(result['predictions'])
predictions = result['predictions']
$(document.body).css({'cursor' : 'auto'});
},
error: function(result, textStatus, errorThrown) {
console.log(errorThrown);
$(document.body).css({'cursor' : 'auto'});
}
});
return predictions
......
import logging
# logging.basicConfig(level=logging.DEBUG)
log_format = "[%(levelname)s] \t %(name)s \t %(filename)s::%(lineno)d \t %(message)s"
logging.basicConfig(level='DEBUG', format=log_format)
logging.debug("This is a debug message")
logging.info("This is an informational message")
logging.warning("Careful! Something does not look right")
logging.error("You have encountered an error")
logging.critical("You are in trouble")
def hypotenuse(a, b):
"""Compute the hypotenuse"""
return (a ** 2 + b ** 2) ** 0.5
a = 3
b = 4
logging.debug("Hypotenuse of {a}, {b} is {c}".format(a=a, b=b, c=hypotenuse(a, b)))
# > [DEBUG] - <file>::<line> 	 Hypotenuse of 3, 4 is 5.0
......@@ -306,9 +306,7 @@ def get_most_frequent(lst):
"""Get the most frequent item in a list. If many elements are frequent, it returns the first one.
:param lst: the list to find the most frequent element.
"""
most_frequent_element = max(set(lst), key=lst.count)
dictionary = {"most_frequent": most_frequent_element, "count_frequent": lst.count(most_frequent_element)}
return dictionary
return max(set(lst), key=lst.count)
#################### plot functions ####################
......
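After the revert, get_most_frequent() returns just the winning element; the removed version also reported its count in a dict. A self-contained example of the reverted behaviour:

# max over the set of values, ranked by how often each occurs in the list.
def get_most_frequent(lst):
    """Return the most frequent element; ties go to the first found by max()."""
    return max(set(lst), key=lst.count)

print(get_most_frequent(["maisons", "immeubles", "maisons"]))  # maisons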