# -*- coding: utf-8 -*-
"""sentences_transformers_fin_23_06_fin.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15-DLVwOGMef8wpk6TLK9Qb5BiRyaoanj
#1. Install and import sentence_transformers
"""
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
import scipy.spatial
"""#2. Download the novels
## 2.1. The novels to create the train dataset (4 novels)
"""
f_assommoir = open("/content/assommoir.txt")
f_bonheur = open("/content/bonheur.txt")
f_nana = open("/content/nana.txt")
f_oeuvre = open("/content/oeuvre.txt")
content_french_assommoir = f_assommoir.read()
content_french_bonheur = f_bonheur.read()
content_french_nana = f_nana.read()
content_french_oeuvre = f_oeuvre.read()
"""##2.2. The novels to create test dataset (3 novels)"""
f_sanscravate = open("/content/sanscravate.txt")
f_jack = open("/content/jack.txt")
f_potbouille = open("/content/potbouille.txt")
content_french_sanscravate = f_sanscravate.read()
content_french_jack = f_jack.read()
content_french_potbouille = f_potbouille.read()
"""#3. Transform the novels into lists of sentences"""
import nltk
nltk.download('punkt')
# Function to split a text into sentences using the "re" library
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr|Il\s|Elle\s|It\s|Ils\s|Elle\s|Leur\s|Notre\s|Nous\s|On\s|Mais\s|Cependant\s|Ce\s|Cette\s|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
def split_into_sentences(text):
    text = " " + text + " "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    # Collapse "....." into "." (kept disabled; handled later by del_espace)
    # text = re.sub('[.]+', '.', text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
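# Quick illustration of the splitter on a made-up French example (not from the novels):
# split_into_sentences("Mme. Durand sortit. Il pleuvait!")
# -> ['Mme. Durand sortit.', 'Il pleuvait!']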
# Function to collapse a repeated dot ("...") into a single dot
def del_espace(x):
    newstr = x
    newstr = re.sub('[.]+', '.', newstr)
    return newstr
"""### The sentences of train dataset"""
splited_sentences_bonheur = split_into_sentences(content_french_bonheur)
splited_sentences_assommoir = split_into_sentences(content_french_assommoir)
splited_sentences_nana = split_into_sentences(content_french_nana)
splited_sentences_oeuvre = split_into_sentences(content_french_oeuvre)
print("nombre de phrases du romans assommoir ", len(splited_sentences_assommoir))
print("nombre de phrases du romans bonheur ", len(splited_sentences_bonheur))
print("nombre de phrases du romans nana ", len(splited_sentences_nana))
print("nombre de phrases du romans oeuvre ", len(splited_sentences_oeuvre))
corpus_train_pre = list(set(splited_sentences_bonheur + splited_sentences_assommoir + splited_sentences_nana + splited_sentences_oeuvre))
len(corpus_train_pre) # 10203 + 7083 = 17286, but here we get 16587 because the duplicated elements are not kept
corpus_train = []
for i in corpus_train_pre:
    corpus_train.append(del_espace(i))
"""### The sentences of test dataset"""
splited_sentences_jack = split_into_sentences(content_french_jack)
splited_sentences_potbouille = split_into_sentences(content_french_potbouille)
splited_sentences_sanscravate = split_into_sentences(content_french_sanscravate)
print("nombre de phrases du romans potbouille ", len(splited_sentences_potbouille))
print("nombre de phrases du romans jack ", len(splited_sentences_jack))
print("nombre de phrases du romans sanscravate ", len(splited_sentences_sanscravate))
corpus_test = list(set(splited_sentences_potbouille + splited_sentences_jack + splited_sentences_sanscravate))
len(corpus_test) # 10687 + 8812 = 19599, but here we get 18638 because the duplicated elements are not kept
"""#4. Retrieve sentences from XML files to build queries using BeautifulSoup library"""
import bs4
import lxml
# Import BeautifulSoup
from bs4 import BeautifulSoup as bs
content_assommoir = []
# Read the XML file
with open("/content/assommoir_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_assommoir = file.readlines()
    # Combine the lines in the list into a string
    content_assommoir = "".join(content_assommoir)
bs_content_assommoir = bs(content_assommoir, "lxml")
content_bonheur = []
with open("/content/bonheur_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_bonheur = file.readlines()
    # Combine the lines in the list into a string
    content_bonheur = "".join(content_bonheur)
bs_content_bonheur = bs(content_bonheur, "lxml")
content_nana = []
with open("/content/nana_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_nana = file.readlines()
    # Combine the lines in the list into a string
    content_nana = "".join(content_nana)
bs_content_nana = bs(content_nana, "lxml")
content_oeuvre = []
with open("/content/oeuvre_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_oeuvre = file.readlines()
    # Combine the lines in the list into a string
    content_oeuvre = "".join(content_oeuvre)
bs_content_oeuvre = bs(content_oeuvre, "lxml")
content_jack = []
with open("/content/jack_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_jack = file.readlines()
    # Combine the lines in the list into a string
    content_jack = "".join(content_jack)
bs_content_jack = bs(content_jack, "lxml")
content_sanscravate = []
with open("/content/sanscravate_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_sanscravate = file.readlines()
    # Combine the lines in the list into a string
    content_sanscravate = "".join(content_sanscravate)
bs_content_sanscravate = bs(content_sanscravate, "lxml")
content_potbouille = []
with open("/content/potbouille_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_potbouille = file.readlines()
    # Combine the lines in the list into a string
    content_potbouille = "".join(content_potbouille)
bs_content_potbouille = bs(content_potbouille, "lxml")
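# The seven blocks above all follow the same read-and-parse pattern. For reference, a small
# helper that does the same thing (a sketch only; the rest of the notebook keeps using the
# variables already defined above):
def read_tei(path):
    # Read a TEI XML file and parse it with BeautifulSoup's lxml parser
    with open(path, "r") as f:
        return bs(f.read(), "lxml")
# e.g. bs_content_assommoir = read_tei("/content/assommoir_TEI_perdido.xml")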
#For train dataset
result_assommoir = bs_content_assommoir.find_all("rs", {"type": "place"})
print(len(result_assommoir))
result_bonheur = bs_content_bonheur.find_all("rs", {"type": "place"})
print(len(result_bonheur))
result_nana = bs_content_nana.find_all("rs", {"type": "place"})
print(len(result_nana))
result_oeuvre = bs_content_oeuvre.find_all("rs", {"type": "place"})
print(len(result_oeuvre))
#For test dataset
result_potbouille = bs_content_potbouille.find_all("rs", {"type": "place"})
print(len(result_potbouille))
result_jack = bs_content_jack.find_all("rs", {"type": "place"})
print(len(result_jack))
result_sanscravate = bs_content_sanscravate.find_all("rs", {"type": "place"})
print(len(result_sanscravate))
# Functions to parse XML
result_s = []
def check_motion(l):
    # Return True if the element contains a <motion> descendant
    # (kept for reference; sentences_list() below performs the same check inline)
    if l.findChild("motion") is not None:
        return True
    else:
        return False
def del_espace(x):
    # Clean an extracted sentence (x is a one-element list): remove the spaces introduced
    # around apostrophes and punctuation, and collapse repeated dots.
    # Note: this redefines the del_espace() used on the raw novels above.
    newstr = x[0].replace("' ", "'")
    newstr1 = newstr.replace(" ,", ",")
    newstr2 = newstr1.replace(" .", ".")
    newstr3 = newstr2.replace(" ?", "?")
    newstr4 = newstr3.replace(" !", "!")
    newstr4 = re.sub('[.]+', '.', newstr4)
    return newstr4
def get_sentence(x):
    result1 = x.parent
    while result1.name != 's':
        result1 = result1.parent
    sentence = [' '.join(result1.text.split())]
    return del_espace(sentence)
def get_sentence_parent(x):
    result1 = x.parent
    print(type(result1))
    while result1.name != 's':
        result1 = result1.parent
    result_s.append(result1)
    return result_s
def sentences_list(y):
    sen_list = []
    for i in y:
        parent_s = i.findParent("s")
        mo = parent_s.findChild("motion")
        # sen_list.append(get_sentence(i))
        if mo:
            sentence = [' '.join(parent_s.text.split())]
            sen_list.append(del_espace(sentence))
    return sen_list
def sentences_list_parents(y):
    sen_list = []
    for i in y:
        sen_list.append(get_sentence_parent(i))
    return sen_list
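# In short: sentences_list() keeps only the sentences (<s> elements) that contain both a place
# mention (the <rs type="place"> nodes passed in as y) and a <motion> element, i.e. sentences
# describing movement in relation to a place; get_sentence() / get_sentence_parent() are helpers
# that climb from an <rs> node up to its enclosing <s> element.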
#The queries of the train dataset
queries_train = list(set(sentences_list(result_assommoir) + sentences_list(result_bonheur) + sentences_list(result_nana) + sentences_list(result_oeuvre)))
len(queries_train)
#The queries of the test dataset
queries_test = list(set(sentences_list(result_potbouille) + sentences_list(result_sanscravate) + sentences_list(result_jack)))
len(queries_test)
"""#5. Generate (create) an Embedding using SentenceTransformer"""
# Using the model "distiluse-base-multilingual-cased" to create the embeddings of the sentences of
# corpus and query for both train and test datasets
embedder = SentenceTransformer('distiluse-base-multilingual-cased')
# Corpus sentences
corpus_embeddings_train = embedder.encode(corpus_train)
corpus_embeddings_test = embedder.encode(corpus_test)
# Query sentences:
query_embeddings_train = embedder.encode(queries_train)
query_embeddings_test = embedder.encode(queries_test)
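# encode() maps each sentence to a dense vector (512 dimensions for
# "distiluse-base-multilingual-cased"), so every *_embeddings_* object above holds one vector
# per input sentence, in the same order as the corresponding sentence list.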
"""#6. Find the average score for each sentence with all other sentences in the query
**Semantic Search **: Semantic search is the task of finding similar sentences to a given sentence.
We apply this task on our embeddings (corpus, query)
"""
from statistics import mean
# Here we use cosine similarity to measure how similar the query sentences are to each other
closest_n = len(queries_train) # I had chosen 12 because "12" gave the best results (the limit for the novels I chose); here the full set of query sentences is used
#For train dataset
sum_score_train = []
i = 0
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_train, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nMost similar sentences among the query sentences:")
    sum_score_i = []
    for idx, distance in results[1:closest_n]:
        sum_score_i.append(1-distance)
        print(queries_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    sum_score_train.append((queries_train[i].strip(), mean(sum_score_i)))
    i = i + 1
#For test dataset
closest_n = len(queries_test)
sum_score_test = []
i= 0
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_test, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nMost similar sentences among the query sentences:")
    sum_score_i = []
    for idx, distance in results[1:closest_n]:
        sum_score_i.append(1-distance)
        print(queries_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    sum_score_test.append((queries_test[i].strip(), mean(sum_score_i)))
    i = i + 1
query_sentences_train = [sen_q[0] for sen_q in sum_score_train]
avg_score_sen_train = [score_q[1] for score_q in sum_score_train]
query_sentences_test = [sen_q[0] for sen_q in sum_score_test]
avg_score_sen_test = [score_q[1] for score_q in sum_score_test]
import pandas
df = pandas.DataFrame(data={"sentences": query_sentences_test, "avg": avg_score_sen_test})
df.to_csv("./query_score_test.csv", sep=',',index=False)
# Second step: group the query sentences
df_train = pandas.read_csv("/content/train_pos_sort_dup_plus_4.csv")
df_train.info()
print(df_train[df_train['avg']==df_train['avg'].min()]) # minimum average score
print(df_train[df_train['avg']==df_train['avg'].max()]) # maximum average score
print(df_train[df_train['avg']<0.1].get('sentences'))
"""#7. Find the positive labels"""
# We use scipy with cosine distance (similarity) function to find the 10 most-similar embeddings (sentences) for queries in the corpus:
# Here we create the positive labels for the next step (supervised "classification")
closest_n = 10
#For train dataset
# list of all the 10 closest sentences of the corpus with all sentences of query
sen_plus_sim_1_train = [] # (sentence, score, query sentence)
#list of the sentences that have score more than 0.6
sen_plus_06_train = [] # (sentence, score, Idx of query sentence)
i = 0 # Idx for the sentences of the query
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n======================\n")
    print("Query:", query)
    print("\nTop 10 most similar sentences in corpus:")
    for idx, distance in results[0:closest_n]:
        sen_plus_sim_1_train.append((corpus_train[idx].strip(), 1-distance, query))
        if (1-distance) >= 0.6:
            sen_plus_06_train.append((corpus_train[idx].strip(), 1-distance, i))
        print(corpus_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    i = i + 1
#For test dataset
sen_plus_sim_1_test = [] # list of all the 10 closest sentences of the corpus for all sentences of the query
sen_plus_06_test = [] # list of the sentences that have score more than 0.6
i = 0
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most similar sentences in corpus:")
    for idx, distance in results[0:closest_n]:
        sen_plus_sim_1_test.append((corpus_test[idx].strip(), 1-distance, query))
        if (1-distance) >= 0.6:
            sen_plus_06_test.append((corpus_test[idx].strip(), 1-distance, i))
        print(corpus_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    i = i + 1
# Function to count the number of occurrences of each element in a list
def getDuplicatesWithCount(listOfElems):
    ''' Get the frequency count of the elements in the given list '''
    dictOfElems = dict()
    # Iterate over each element in the list
    for elem in listOfElems:
        # If the element already exists in the dict, increment its value, else add it
        if elem in dictOfElems:
            dictOfElems[elem] += 1
        else:
            dictOfElems[elem] = 1
    # Keep every pair (value >= 1), i.e. the full frequency count, not only the duplicates
    dictOfElems = { key:value for key, value in dictOfElems.items() if value >= 1}
    # Return a dict mapping each element to its frequency count
    return dictOfElems
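# Note: the standard library offers the same frequency count directly; an equivalent
# alternative (not used below) would be:
from collections import Counter
# dict(Counter(listOfElems)) gives the same mapping as getDuplicatesWithCount(listOfElems)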
#For train dataset
print(len(sen_plus_sim_1_train))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_1_train))))
# just the sentences of pairs(sentence, score)
sen_jus_1_train = [x[0] for x in sen_plus_sim_1_train]
print(len(sen_jus_1_train))
# just the duplicated sentences
dupes_sen_plus_sim_rep_1_train = [x for n, x in enumerate(sen_jus_1_train) if x in sen_jus_1_train[:n]]
dupes_sen_plus_sim_1_train = list(set(dupes_sen_plus_sim_rep_1_train))
print(len(dupes_sen_plus_sim_1_train))
# just the sentences not duplicated
sen_jus_without_rep_1_train = list(set(sen_jus_1_train))
print(len(sen_jus_without_rep_1_train))
#For test dataset
print(len(sen_plus_sim_1_test))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_1_test))))
# just the sentences of pairs(sentence, score)
sen_jus_1_test = [x[0] for x in sen_plus_sim_1_test]
print(len(sen_jus_1_test))
# just the duplicated sentences
dupes_sen_plus_sim_rep_1_test = [x for n, x in enumerate(sen_jus_1_test) if x in sen_jus_1_test[:n]]
dupes_sen_plus_sim_1_test = list(set(dupes_sen_plus_sim_rep_1_test))
print(len(dupes_sen_plus_sim_1_test))
# just the sentences not duplicated
sen_jus_without_rep_1_test = list(set(sen_jus_1_test))
print(len(sen_jus_without_rep_1_test))
# Get a dictionary containing duplicate elements in list and their frequency count
dic_sens_train = []
dictOfElems_train = getDuplicatesWithCount(sen_jus_1_train)
for key, value in dictOfElems_train.items():
    if value == 1:
        print('****************************************************************')
        print(key, ' :: ', value)
        dic_sens_train.append((key, value))
len(dic_sens_train)
repetations_train = []
for sen in sen_jus_1_train:
    repetations_train.append(dictOfElems_train.get(sen, ""))
len(repetations_train)
len(dictOfElems_train)
len(list(set(dic_sens_train)-set(queries_train)))
# Get a dictionary containing duplicate elements in list sen_jus_1_test and their frequency count
dic_sens_test = []
dictOfElems_test = getDuplicatesWithCount(sen_jus_1_test)
for key, value in dictOfElems_test.items():
    if value == 1:
        print('****************************************************************')
        print(key, ' :: ', value)
        dic_sens_test.append((key, value))
len(dic_sens_test)
repetations_test = []
for sen in sen_jus_1_test:
    repetations_test.append(dictOfElems_test.get(sen, ""))
len(list(set(dic_sens_test)-set(queries_test)))
len(dictOfElems_test)
# Create a dataset just for the positive labels, with the columns (sentences, labels, nb_duplications, query_sentence, sen_score)
# Prepare the query sentences for train and test datasets
sentences_query_test = [sen_q[2] for sen_q in sen_plus_sim_1_test]
sentences_query_train = [sen_q[2] for sen_q in sen_plus_sim_1_train]
# Prepare the labels (here just 1)
leb_liste_train = []
for i in range(len(sen_jus_1_train)):
    leb_liste_train.append('1')
leb_liste_test = []
for i in range(len(sen_jus_1_test)):
    leb_liste_test.append('1')
score_sens = [x[1] for x in sen_plus_sim_1_test]
score_sens_train = [x[1] for x in sen_plus_sim_1_train]
import pandas
df_test = pandas.DataFrame(data={"sentences": sen_jus_1_test, "labels": leb_liste_test, "nb_duplications": repetations_test,
                                 "query_sentence": sentences_query_test, "sen_score": score_sens})
df_test.to_csv("./test_31_05.csv", sep=',', index=False)
df_train = pandas.DataFrame(data={"sentences": sen_jus_1_train, "labels": leb_liste_train, "nb_duplications": repetations_train,
                                  "query_sentence": sentences_query_train, "sen_score": score_sens_train})
df_train.to_csv("./train_31_05.csv", sep=',',index=False)
df_test = pandas.read_csv("/content/test_31_05.csv")
df_test.info()
df_train = pandas.read_csv("/content/train_31_05.csv")
df_train.info()
#Take just the sentences that are duplicated more than 4 times
df_test_dup_5 = df_test.loc[df_test['nb_duplications'] > 4]
df_train_dup_5 = df_train.loc[df_train['nb_duplications'] > 4]
len(df_test_dup_5)
len(df_train_dup_5)
# Remove the duplication of sentences
df_corpus_test = df_test_dup_5.drop_duplicates('sentences', keep='last')
df_corpus_train = df_train_dup_5.drop_duplicates('sentences', keep='last')
len(df_corpus_test)
len(df_corpus_train)
#Transform to list
sens_corpus_test = df_corpus_test['sentences'].tolist()
sens_corpus_train = df_corpus_train['sentences'].tolist()
len(list(set(sens_corpus_test + queries_test)))
len(list(set(sens_corpus_train + queries_train)))
pos_sens_test = list(set(sens_corpus_test + queries_test))
pos_sens_train = list(set(sens_corpus_train + queries_train))
sens_new = list(set(pos_sens_test) - set(queries_test))
sens_new_train = list(set(pos_sens_train) - set(queries_train))
len(sens_new)
len(sens_new_train)
from itertools import repeat
pos_labels_test = list(repeat(1, len(pos_sens_test)))
pos_labels_train = list(repeat(1, len(pos_sens_train)))
df_test_pos = pandas.DataFrame(data={"sentences": pos_sens_test, "labels": pos_labels_test})
df_test_pos.to_csv("./test_pos_sens_31_05.csv", sep=',',index=False)
df_train_pos = pandas.DataFrame(data={"sentences": pos_sens_train, "labels": pos_labels_train})
df_train_pos.to_csv("./train_pos_sens_31_05.csv", sep=',',index=False)
df_test_new = pandas.DataFrame(data={"sentences": sens_new})
df_test_new.to_csv("./test_pos_new_sens_31_05.csv", sep=',',index=False)
df_train_new = pandas.DataFrame(data={"sentences": sens_new_train})
df_train_new.to_csv("./train_pos_new_sens_31_05.csv", sep=',',index=False)
list_train = df_train_pos.values.tolist()
list_test = df_test_pos.values.tolist()
list_train_sens = [x[0] for x in list_train]
list_test_sens = [x[0] for x in list_test]
len(list_test_sens)
resulting_list_1_train = list_train_sens
resulting_list_1_test = list_test_sens
"""#8. Fine the negatives labels"""
# We use scipy with cosine distance (similarity) function to find the 10 most-different embeddings (sentences) for queries in the corpus:
# Here we create the negative labels for the next step (supervised "classification")
closest_n = 10
#For train dataset
# list of all the 10 farthest sentences of the corpus with all sentences of query
sen_plus_sim_0_train = [] # (sentence, score)
sen_moins_0_train = [] # list of sentences that have score less than 0 (sentence, score)
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most dissimilar sentences in corpus:")
    for idx, distance in results[-closest_n:]:
        sen_plus_sim_0_train.append((corpus_train[idx].strip(), 1-distance))
        if (1-distance) <= 0:
            sen_moins_0_train.append((corpus_train[idx].strip(), 1-distance))
        print(corpus_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
#For test dataset
# list of all the 10 farthest sentences of the corpus with all sentences of query
sen_plus_sim_0_test = [] # (sentence, score)
sen_moins_0_test = [] # list of sentences that have score less than 0 (sentence, score)
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most dissimilar sentences in corpus:")
    for idx, distance in results[-closest_n:]:
        sen_plus_sim_0_test.append((corpus_test[idx].strip(), 1-distance))
        if (1-distance) <= 0:
            sen_moins_0_test.append((corpus_test[idx].strip(), 1-distance))
        print(corpus_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
#For train dataset
print(len(sen_moins_0_train))
# Here we keep the (sentence, score) pairs that are not duplicated
sen_moins_0_without_dup_train = list(set(sen_moins_0_train))
print(len(sen_moins_0_without_dup_train))
# just the sentences of pairs (sentence, score)
sen_jus_0_train = [x[0] for x in sen_moins_0_without_dup_train]
# just the sentences without duplications
sen_jus_0_without_dup_train = list(set(sen_jus_0_train))
print(len(sen_jus_0_without_dup_train))
# For the test dataset (we do the same as for the train dataset)
print(len(sen_moins_0_test))
sen_moins_0_without_dup_test = list(set(sen_moins_0_test))
print(len(sen_moins_0_without_dup_test))
sen_jus_0_test = [x[0] for x in sen_moins_0_without_dup_test]
sen_jus_0_without_dup_test = list(set(sen_jus_0_test))
print(len(sen_jus_0_without_dup_test))
#For train dataset
print(len(sen_plus_sim_0_train))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_0_train))))
# just the sentences of the (sentence, score) pairs
sen_jus_0_train = [x[0] for x in sen_plus_sim_0_train]
print(len(sen_jus_0_train))
# just the duplicated sentences
dupes_sen_plus_sim_0_train = [x for n, x in enumerate(sen_jus_0_train) if x in sen_jus_0_train[:n]]
print(len(dupes_sen_plus_sim_0_train))
# just the sentences not duplicated
sen_jus_without_rep_0_train = list(set(sen_jus_0_train))
print(len(sen_jus_without_rep_0_train))
# For the test dataset (we do the same as for the train dataset)
print(len(sen_plus_sim_0_test))
print(len(list(set(sen_plus_sim_0_test))))
sen_jus_0_test = [x[0] for x in sen_plus_sim_0_test]
print(len(sen_jus_0_test))
dupes_sen_plus_sim_0_test = [x for n, x in enumerate(sen_jus_0_test) if x in sen_jus_0_test[:n]]
print(len(dupes_sen_plus_sim_0_test))
sen_jus_without_rep_0_test = list(set(sen_jus_0_test))
print(len(sen_jus_without_rep_0_test))
# Merge the two lists to get the best list of negative labels for the train dataset
resulting_list_0_train = list(set(sen_jus_without_rep_0_train + sen_jus_0_without_dup_train))
len(resulting_list_0_train)
# Merge the two lists to get the best list of negative labels for the test dataset
resulting_list_0_test = list(set(sen_jus_without_rep_0_test + sen_jus_0_without_dup_test))
len(resulting_list_0_test)
"""#9. Export the lists of sentences into CSV files"""
len(resulting_list_1_train)
# Sentence list for the train dataset (negative labels ≈ positive labels * 3)
num_neg = len(resulting_list_1_train) * 3
sens_list_train = resulting_list_1_train + resulting_list_0_train[:num_neg]
len(sens_list_train)
len(resulting_list_1_test)
# Sentence list for the test dataset (negative labels ≈ positive labels * 3)
num_neg_test = len(resulting_list_1_test) * 3
sens_list_test = resulting_list_1_test + resulting_list_0_test[:num_neg_test]
len(sens_list_test)
#Build the labels for the train dataset
leb_liste_train = []
for i in range(len(sens_list_train)):
    if i < len(resulting_list_1_train):
        leb_liste_train.append('1')
    else:
        leb_liste_train.append('0')
#For test dataset
leb_liste_test = []
for i in range(len(sens_list_test)):
    if i < len(resulting_list_1_test):
        leb_liste_test.append('1')
    else:
        leb_liste_test.append('0')
len(leb_liste_train)
import pandas
df_train = pandas.DataFrame(data={"sentences": sens_list_train, "labels": leb_liste_train})
df_train.to_csv("./train_final_31_05.csv", sep=',',index=False)
df_test = pandas.DataFrame(data={"sentences": sens_list_test, "labels": leb_liste_test})
df_test.to_csv("./test_final_31_05.csv", sep=',',index=False)
"""#10. Do samples of datasets : we do that to randomize the two datasets"""
import pandas
df_test_new = pandas.read_csv("/content/test_final_31_05.csv")
df_test_new.info()
df_train_egal = pandas.read_csv("/content/train_final_31_05.csv")
df_train_egal.info()
df_test_sam = df_test_new.sample(2412)
df_train_egal_sam = df_train_egal.sample(4284)
df_test_sam.to_csv("./test_sample_final_31_05.csv", sep=',',index=False)
df_train_egal_sam.to_csv("./train_sample_final_31_05.csv", sep=',',index=False)
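# Note: DataFrame.sample() draws a different random subset on every run; passing a fixed seed
# makes the sampled train/test files reproducible (optional tweak, not part of the original run):
# df_test_sam = df_test_new.sample(2412, random_state=42)
# df_train_egal_sam = df_train_egal.sample(4284, random_state=42)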