diff --git a/sentences_transformers_fin_23_06_fin.py b/sentences_transformers_fin_23_06_fin.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b4f8700d2f1e21bde1c04febec80c2805a7d83a
--- /dev/null
+++ b/sentences_transformers_fin_23_06_fin.py
@@ -0,0 +1,767 @@
+# -*- coding: utf-8 -*-
+"""sentences_transformers_fin_23_06_fin.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/15-DLVwOGMef8wpk6TLK9Qb5BiRyaoanj
+
+#1. Install and import sentence_transformers
+"""
+
+!pip install sentence_transformers
+
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+
+"""#2. Download the novels
+
+## 2.1. The novels to create the train dataset (4 novels)
+"""
+
+f_assommoir = open("/content/assommoir.txt")
+f_bonheur = open("/content/bonheur.txt")
+f_nana = open("/content/nana.txt")
+f_oeuvre = open("/content/oeuvre.txt")
+
+content_french_assommoir = f_assommoir.read()
+content_french_bonheur = f_bonheur.read()
+content_french_nana = f_nana.read()
+content_french_oeuvre = f_oeuvre.read()
+
+"""##2.2. The novels to create the test dataset (3 novels)"""
+
+f_sanscravate = open("/content/sanscravate.txt")
+f_jack = open("/content/jack.txt")
+f_potbouille = open("/content/potbouille.txt")
+
+content_french_sanscravate = f_sanscravate.read()
+content_french_jack = f_jack.read()
+content_french_potbouille = f_potbouille.read()
+
+"""#3. Transform the novels into lists of sentences"""
+
+import nltk
+nltk.download('punkt')
+
+# Function to split a text into sentences using the "re" library
+import re
+alphabets = "([A-Za-z])"
+prefixes = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr)[.]"
+suffixes = "(Inc|Ltd|Jr|Sr|Co)"
+starters = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr|Il\s|Elle\s|Ils\s|Leur\s|Notre\s|Nous\s|On\s|Mais\s|Cependant\s|Ce\s|Cette\s|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+websites = "[.](com|net|org|io|gov)"
+
+def split_into_sentences(text):
+    text = " " + text + " "
+    text = text.replace("\n"," ")
+    text = re.sub(prefixes,"\\1<prd>",text)
+    text = re.sub(websites,"<prd>\\1",text)
+    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
+    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
+    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
+    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
+    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
+    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
+    # ..... -> .
+    # text = re.sub('[.]+', '.', text)
+    if "”" in text: text = text.replace(".”","”.")
+    if "\"" in text: text = text.replace(".\"","\".")
+    if "!" in text: text = text.replace("!\"","\"!")
+    if "?" in text: text = text.replace("?\"","\"?")
+    text = text.replace(".",".<stop>")
+    text = text.replace("?","?<stop>")
+    text = text.replace("!","!<stop>")
+    text = text.replace("<prd>",".")
+    sentences = text.split("<stop>")
+    sentences = sentences[:-1]
+    sentences = [s.strip() for s in sentences]
+    return sentences
+
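+# Quick sanity check of the splitter on a short, hand-written snippet
+# (illustrative only; the real inputs are the novel texts loaded above,
+# and sample_text is not reused by the rest of the pipeline).
+sample_text = "Mme. Lerat arriva vers midi. Elle apportait des fleurs ! Gervaise sourit."
+print(split_into_sentences(sample_text))
+# -> three sentences, with the abbreviation "Mme." kept inside the first one
+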
+# Function to collapse a run of dots (...) into a single dot
+def del_espace(x):
+    newstr = x
+    newstr = re.sub('[.]+', '.', newstr)
+    return newstr
+
+"""### The sentences of the train dataset"""
+
+splited_sentences_bonheur = split_into_sentences(content_french_bonheur)
+splited_sentences_assommoir = split_into_sentences(content_french_assommoir)
+splited_sentences_nana = split_into_sentences(content_french_nana)
+splited_sentences_oeuvre = split_into_sentences(content_french_oeuvre)
+
+print("number of sentences in the novel assommoir ", len(splited_sentences_assommoir))
+print("number of sentences in the novel bonheur ", len(splited_sentences_bonheur))
+print("number of sentences in the novel nana ", len(splited_sentences_nana))
+print("number of sentences in the novel oeuvre ", len(splited_sentences_oeuvre))
+
+corpus_train_pre = list(set(splited_sentences_bonheur + splited_sentences_assommoir + splited_sentences_nana + splited_sentences_oeuvre))
+len(corpus_train_pre)  # 10203 + 7083 = 17286, but here we get 16587 because duplicated sentences are dropped
+
+corpus_train = []
+for i in corpus_train_pre:
+    corpus_train.append(del_espace(i))
+
+"""### The sentences of the test dataset"""
+
+splited_sentences_jack = split_into_sentences(content_french_jack)
+splited_sentences_potbouille = split_into_sentences(content_french_potbouille)
+splited_sentences_sanscravate = split_into_sentences(content_french_sanscravate)
+
+print("number of sentences in the novel potbouille ", len(splited_sentences_potbouille))
+print("number of sentences in the novel jack ", len(splited_sentences_jack))
+print("number of sentences in the novel sanscravate ", len(splited_sentences_sanscravate))
+
+corpus_test = list(set(splited_sentences_potbouille + splited_sentences_jack + splited_sentences_sanscravate))
+len(corpus_test)  # 10687 + 8812 = 19499, but here we get 18638 because duplicated sentences are dropped
+
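+# Note: the "punkt" model downloaded above is not used by the regex splitter. For comparison,
+# an equivalent split with NLTK's French sentence tokenizer would look like this (sketch only;
+# the variable below is not reused by the rest of the pipeline):
+from nltk.tokenize import sent_tokenize
+splited_sentences_nana_nltk = sent_tokenize(content_french_nana, language="french")
+print("nltk sentence count for nana ", len(splited_sentences_nana_nltk))
+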
+"""#4. Retrieve sentences from XML files to build queries, using the BeautifulSoup library"""
+
+import bs4
+import lxml
+
+# Import BeautifulSoup
+from bs4 import BeautifulSoup as bs
+
+# Helper: read a TEI XML file (Perdido output) and parse it with BeautifulSoup
+def read_tei(path):
+    with open(path, "r") as file:
+        # readlines() returns a list of lines; join them back into a single string
+        content = "".join(file.readlines())
+    return bs(content, "lxml")
+
+bs_content_assommoir = read_tei("/content/assommoir_TEI_perdido.xml")
+bs_content_bonheur = read_tei("/content/bonheur_TEI_perdido.xml")
+bs_content_nana = read_tei("/content/nana_TEI_perdido.xml")
+bs_content_oeuvre = read_tei("/content/oeuvre_TEI_perdido.xml")
+bs_content_jack = read_tei("/content/jack_TEI_perdido.xml")
+bs_content_sanscravate = read_tei("/content/sanscravate_TEI_perdido.xml")
+bs_content_potbouille = read_tei("/content/potbouille_TEI_perdido.xml")
+
+# For the train dataset: collect all place mentions (<rs type="place">)
+result_assommoir = bs_content_assommoir.find_all("rs", {"type": "place"})
+print(len(result_assommoir))
+result_bonheur = bs_content_bonheur.find_all("rs", {"type": "place"})
+print(len(result_bonheur))
+result_nana = bs_content_nana.find_all("rs", {"type": "place"})
+print(len(result_nana))
+result_oeuvre = bs_content_oeuvre.find_all("rs", {"type": "place"})
+print(len(result_oeuvre))
+
+# For the test dataset
+result_potbouille = bs_content_potbouille.find_all("rs", {"type": "place"})
+print(len(result_potbouille))
+result_jack = bs_content_jack.find_all("rs", {"type": "place"})
+print(len(result_jack))
+result_sanscravate = bs_content_sanscravate.find_all("rs", {"type": "place"})
+print(len(result_sanscravate))
+
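+# Quick look at what was matched (illustrative): the first place mention in
+# L'Assommoir and the <s> element (sentence) that contains it.
+if result_assommoir:
+    first_place = result_assommoir[0]
+    print("place mention:", first_place.text)
+    parent_s = first_place.findParent("s")
+    if parent_s is not None:
+        print("containing sentence:", " ".join(parent_s.text.split()))
+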
+# Functions to parse the TEI XML
+result_s = []
+
+def check_motion(l):
+    # True if the element has a <motion> child (a motion expression annotated by Perdido)
+    return l.findChild("motion") is not None
+
+# NOTE: this redefines del_espace above; this version takes a one-element list
+# ([sentence]) and also removes stray spaces left around punctuation.
+def del_espace(x):
+    newstr = x[0].replace("' ", "'")
+    newstr1 = newstr.replace(" ,", ",")
+    newstr2 = newstr1.replace(" .", ".")
+    newstr3 = newstr2.replace(" ?", "?")
+    newstr4 = newstr3.replace(" !", "!")
+    newstr4 = re.sub('[.]+', '.', newstr4)
+    return newstr4
+
+def get_sentence(x):
+    result1 = x.parent
+    while result1.name != 's':
+        result1 = result1.parent
+    sentence = [' '.join(result1.text.split())]
+    return del_espace(sentence)
+
+def get_sentence_parent(x):
+    result1 = x.parent
+    while result1.name != 's':
+        result1 = result1.parent
+    result_s.append(result1)
+    return result_s
+
+def sentences_list(y):
+    # Keep only the sentences that contain both a place mention and a <motion> tag
+    sen_list = []
+    for i in y:
+        parent_s = i.findParent("s")
+        mo = parent_s.findChild("motion")
+        # sen_list.append(get_sentence(i))
+        if mo:
+            sentence = [' '.join(parent_s.text.split())]
+            sen_list.append(del_espace(sentence))
+    return sen_list
+
+def sentences_list_parents(y):
+    sen_list = []
+    for i in y:
+        sen_list.append(get_sentence_parent(i))
+    return sen_list
+
+# The queries of the train dataset
+queries_train = list(set(sentences_list(result_assommoir) + sentences_list(result_bonheur) + sentences_list(result_nana) + sentences_list(result_oeuvre)))
+len(queries_train)
+
+# The queries of the test dataset
+queries_test = list(set(sentences_list(result_potbouille) + sentences_list(result_sanscravate) + sentences_list(result_jack)))
+len(queries_test)
+
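+# Illustrative check of the query-extraction logic on a tiny hand-made fragment
+# (hypothetical markup; the real inputs are the *_TEI_perdido.xml files above).
+toy_tei = ('<TEI><text>'
+           '<s>Elle <motion>entra</motion> dans la boutique de la <rs type="place">rue de la Goutte-d\'Or</rs> .</s>'
+           '<s>La <rs type="place">rue</rs> restait vide .</s>'
+           '</text></TEI>')
+toy_places = bs(toy_tei, "lxml").find_all("rs", {"type": "place"})
+print(sentences_list(toy_places))
+# Only the first sentence should be kept: it contains both a place mention and a <motion> tag.
+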
+"""#5. Create the sentence embeddings with SentenceTransformer"""
+
+# Use the model "distiluse-base-multilingual-cased" to embed the corpus and query
+# sentences of both the train and test datasets
+embedder = SentenceTransformer('distiluse-base-multilingual-cased')
+
+# Corpus sentences
+corpus_embeddings_train = embedder.encode(corpus_train)
+corpus_embeddings_test = embedder.encode(corpus_test)
+
+# Query sentences
+query_embeddings_train = embedder.encode(queries_train)
+query_embeddings_test = embedder.encode(queries_test)
+
+"""#6. Find the average score of each query sentence with all the other query sentences
+
+**Semantic search**: semantic search is the task of finding the sentences most similar to a given sentence.
+
+We apply this task to our embeddings (corpus and query).
+"""
+
+from statistics import mean
+# Cosine similarity is used to compare the query sentences with each other
+closest_n = len(queries_train)  # compare each query sentence against all the others
+# (an earlier run with closest_n = 12 gave the best results for the chosen novels)
+
+# For the train dataset
+sum_score_train = []
+i = 0
+for query, query_embedding in zip(queries_train, query_embeddings_train):
+    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_train, "cosine")[0]
+    results = zip(range(len(distances)), distances)
+    results = sorted(results, key=lambda x: x[1])
+
+    print("\n\n======================\n\n")
+    print("Query:", query)
+    print("\nMost similar sentences in the query set:")
+    sum_score_i = []
+    for idx, distance in results[1:closest_n]:
+        sum_score_i.append(1-distance)
+        print(queries_train[idx].strip(), '\n', "(Score: %.4f)" % (1-distance), '\n', idx)
+    sum_score_train.append((queries_train[i].strip(), mean(sum_score_i)))
+    i = i + 1
+
+# For the test dataset
+closest_n = len(queries_test)
+sum_score_test = []
+i = 0
+for query, query_embedding in zip(queries_test, query_embeddings_test):
+    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_test, "cosine")[0]
+    results = zip(range(len(distances)), distances)
+    results = sorted(results, key=lambda x: x[1])
+    print("\n\n======================\n\n")
+    print("Query:", query)
+    print("\nMost similar sentences in the query set:")
+    sum_score_i = []
+    for idx, distance in results[1:closest_n]:
+        sum_score_i.append(1-distance)
+        print(queries_test[idx].strip(), '\n', "(Score: %.4f)" % (1-distance), '\n', idx)
+    sum_score_test.append((queries_test[i].strip(), mean(sum_score_i)))
+    i = i + 1
+
+query_sentences_train = [sen_q[0] for sen_q in sum_score_train]
+avg_score_sen_train = [score_q[1] for score_q in sum_score_train]
+
+query_sentences_test = [sen_q[0] for sen_q in sum_score_test]
+avg_score_sen_test = [score_q[1] for score_q in sum_score_test]
+
+import pandas
+
+df = pandas.DataFrame(data={"sentences": query_sentences_test, "avg": avg_score_sen_test})
+df.to_csv("./query_score_test.csv", sep=',', index=False)
+
+# Second step: group the query sentences
+df_train = pandas.read_csv("/content/train_pos_sort_dup_plus_4.csv")
+df_train.info()
+
+print(df_train[df_train['avg']==df_train['avg'].min()])  # minimum average score
+print(df_train[df_train['avg']==df_train['avg'].max()])  # maximum average score
+
+print(df_train[df_train['avg']<0.1].get('sentences'))
+
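+# The per-sentence averages collected in sum_score_train above can also be obtained in one
+# vectorised call, which is a handy cross-check of the loop (sketch; the names below are new
+# and used only for this check): average each query's cosine similarity to every *other*
+# query, i.e. drop the self-similarity of ~1 from each row sum.
+sims_train = 1 - scipy.spatial.distance.cdist(query_embeddings_train, query_embeddings_train, "cosine")
+avg_check_train = (sims_train.sum(axis=1) - 1.0) / (len(queries_train) - 1)
+print(avg_check_train[:5])
+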
+"""#7. Find the positive labels"""
+
+# Use scipy's cosine distance (similarity) to find, for each query sentence, the 10 most
+# similar corpus sentences; these become the positive labels for the next step
+# (supervised "classification").
+closest_n = 10
+
+# For the train dataset
+# list of the 10 closest corpus sentences for every query sentence
+sen_plus_sim_1_train = []  # (sentence, score, query sentence)
+# list of the sentences whose score is at least 0.6 (i.e. cosine distance <= 0.4)
+sen_plus_06_train = []  # (sentence, score, index of the query sentence)
+i = 0  # index of the query sentence
+for query, query_embedding in zip(queries_train, query_embeddings_train):
+    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0]
+    results = zip(range(len(distances)), distances)
+    results = sorted(results, key=lambda x: x[1])
+    print("\n======================\n")
+    print("Query:", query)
+    print("\nTop 10 most similar sentences in corpus:")
+    for idx, distance in results[0:closest_n]:
+        sen_plus_sim_1_train.append((corpus_train[idx].strip(), 1-distance, query))
+        if distance <= 0.4:
+            sen_plus_06_train.append((corpus_train[idx].strip(), 1-distance, i))
+        print(corpus_train[idx].strip(), '\n', "(Score: %.4f)" % (1-distance), '\n', idx)
+
+    i = i + 1
+
+# For the test dataset
+sen_plus_sim_1_test = []  # list of the 10 closest corpus sentences for every query sentence
+sen_plus_06_test = []  # list of the sentences whose score is at least 0.6
+i = 0
+for query, query_embedding in zip(queries_test, query_embeddings_test):
+    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0]
+    results = zip(range(len(distances)), distances)
+    results = sorted(results, key=lambda x: x[1])
+    print("\n\n======================\n\n")
+    print("Query:", query)
+    print("\nTop 10 most similar sentences in corpus:")
+    for idx, distance in results[0:closest_n]:
+        sen_plus_sim_1_test.append((corpus_test[idx].strip(), 1-distance, query))
+        if (1-distance) >= 0.6:
+            sen_plus_06_test.append((corpus_test[idx].strip(), 1-distance, i))
+        print(corpus_test[idx].strip(), '\n', "(Score: %.4f)" % (1-distance), '\n', idx)
+    i = i + 1
+
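+# For reference, the same top-k lookup can be done with the library's own helper
+# (sketch; util.semantic_search returns, per query, a list of {"corpus_id", "score"} dicts
+# with cosine-similarity scores, so keeping hits with score >= 0.6 mirrors the
+# "distance <= 0.4" filter used above; hits_train is only used for this comparison):
+from sentence_transformers import util
+hits_train = util.semantic_search(query_embeddings_train, corpus_embeddings_train, top_k=closest_n)
+print(hits_train[0][:3])
+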
+# Function to count how many times each element appears in a list
+def getDuplicatesWithCount(listOfElems):
+    ''' Get the frequency count of each element in the given list '''
+    dictOfElems = dict()
+    # Iterate over each element in the list
+    for elem in listOfElems:
+        # If the element is already in the dict, increment its count, otherwise add it
+        if elem in dictOfElems:
+            dictOfElems[elem] += 1
+        else:
+            dictOfElems[elem] = 1
+
+    # Keep every element together with its count (value >= 1 keeps them all,
+    # including the non-duplicated ones, which are needed below)
+    dictOfElems = {key: value for key, value in dictOfElems.items() if value >= 1}
+    # Return a dict mapping each element to its frequency count
+    return dictOfElems
+
+# For the train dataset
+print(len(sen_plus_sim_1_train))
+
+# The (sentence, score) pairs without duplicates
+print(len(list(set(sen_plus_sim_1_train))))
+
+# Just the sentences of the (sentence, score) pairs
+sen_jus_1_train = [x[0] for x in sen_plus_sim_1_train]
+print(len(sen_jus_1_train))
+
+# Just the duplicated sentences
+dupes_sen_plus_sim_rep_1_train = [x for n, x in enumerate(sen_jus_1_train) if x in sen_jus_1_train[:n]]
+dupes_sen_plus_sim_1_train = list(set(dupes_sen_plus_sim_rep_1_train))
+print(len(dupes_sen_plus_sim_1_train))
+
+# Just the sentences without duplicates
+sen_jus_without_rep_1_train = list(set(sen_jus_1_train))
+print(len(sen_jus_without_rep_1_train))
+
+# For the test dataset
+print(len(sen_plus_sim_1_test))
+
+# The (sentence, score) pairs without duplicates
+print(len(list(set(sen_plus_sim_1_test))))
+
+# Just the sentences of the (sentence, score) pairs
+sen_jus_1_test = [x[0] for x in sen_plus_sim_1_test]
+print(len(sen_jus_1_test))
+
+# Just the duplicated sentences
+dupes_sen_plus_sim_rep_1_test = [x for n, x in enumerate(sen_jus_1_test) if x in sen_jus_1_test[:n]]
+dupes_sen_plus_sim_1_test = list(set(dupes_sen_plus_sim_rep_1_test))
+print(len(dupes_sen_plus_sim_1_test))
+
+# Just the sentences without duplicates
+sen_jus_without_rep_1_test = list(set(sen_jus_1_test))
+print(len(sen_jus_without_rep_1_test))
+
+# Get a dictionary mapping every sentence of sen_jus_1_train to its frequency count
+dic_sens_train = []
+dictOfElems_train = getDuplicatesWithCount(sen_jus_1_train)
+
+for key, value in dictOfElems_train.items():
+    if value == 1:
+        print('****************************************************************')
+        print(key, ' :: ', value)
+        dic_sens_train.append((key, value))
+
+len(dic_sens_train)
+
+repetations_train = []
+for sen in sen_jus_1_train:
+    repetations_train.append(dictOfElems_train.get(sen, ""))
+
+len(repetations_train)
+
+len(dictOfElems_train)
+
+len(list(set(dic_sens_train)-set(queries_train)))
+
+# Get a dictionary mapping every sentence of sen_jus_1_test to its frequency count
+dic_sens_test = []
+dictOfElems_test = getDuplicatesWithCount(sen_jus_1_test)
+
+for key, value in dictOfElems_test.items():
+    if value == 1:
+        print('****************************************************************')
+        print(key, ' :: ', value)
+        dic_sens_test.append((key, value))
+
+len(dic_sens_test)
+
+repetations_test = []
+for sen in sen_jus_1_test:
+    repetations_test.append(dictOfElems_test.get(sen, ""))
+
+len(list(set(dic_sens_test)-set(queries_test)))
+
+len(dictOfElems_test)
+
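+# The per-sentence counts above could also be obtained directly with collections.Counter
+# (sketch, used only for this comparison):
+from collections import Counter
+print(Counter(sen_jus_1_train) == dictOfElems_train)
+print(Counter(sen_jus_1_test) == dictOfElems_test)
+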
+# Create a dataset for the positive labels with the columns
+# (sentences, labels, nb_duplications, query_sentence, sen_score)
+
+# Prepare the query sentences for the train and test datasets
+sentences_query_test = [sen_q[2] for sen_q in sen_plus_sim_1_test]
+
+sentences_query_train = [sen_q[2] for sen_q in sen_plus_sim_1_train]
+
+# Prepare the labels (here always 1)
+leb_liste_train = []
+
+for i in range(len(sen_jus_1_train)):
+    leb_liste_train.append('1')
+
+leb_liste_test = []
+
+for i in range(len(sen_jus_1_test)):
+    leb_liste_test.append('1')
+
+score_sens = [x[1] for x in sen_plus_sim_1_test]
+
+score_sens_train = [x[1] for x in sen_plus_sim_1_train]
+
+import pandas
+df_test = pandas.DataFrame(data={"sentences": sen_jus_1_test, "labels": leb_liste_test, "nb_duplications": repetations_test,
+                                 "query_sentence": sentences_query_test, "sen_score": score_sens})
+df_test.to_csv("./test_31_05.csv", sep=',', index=False)
+
+df_train = pandas.DataFrame(data={"sentences": sen_jus_1_train, "labels": leb_liste_train, "nb_duplications": repetations_train,
+                                  "query_sentence": sentences_query_train, "sen_score": score_sens_train})
+df_train.to_csv("./train_31_05.csv", sep=',', index=False)
+
+df_test = pandas.read_csv("/content/test_31_05.csv")
+df_test.info()
+
+df_train = pandas.read_csv("/content/train_31_05.csv")
+df_train.info()
+
+# Keep just the sentences that are duplicated more than 4 times
+df_test_dup_5 = df_test.loc[df_test['nb_duplications'] > 4]
+df_train_dup_5 = df_train.loc[df_train['nb_duplications'] > 4]
+
+len(df_test_dup_5)
+
+len(df_train_dup_5)
+
+# Remove the duplicated sentences
+df_corpus_test = df_test_dup_5.drop_duplicates('sentences', keep='last')
+df_corpus_train = df_train_dup_5.drop_duplicates('sentences', keep='last')
+
+len(df_corpus_test)
+
+len(df_corpus_train)
+
+# Transform to lists
+sens_corpus_test = df_corpus_test['sentences'].tolist()
+sens_corpus_train = df_corpus_train['sentences'].tolist()
+
+len(list(set(sens_corpus_test + queries_test)))
+
+len(list(set(sens_corpus_train + queries_train)))
+
+pos_sens_test = list(set(sens_corpus_test + queries_test))
+pos_sens_train = list(set(sens_corpus_train + queries_train))
+
+sens_new = list(set(pos_sens_test) - set(queries_test))
+sens_new_train = list(set(pos_sens_train) - set(queries_train))
+
+len(sens_new)
+
+len(sens_new_train)
+
+from itertools import repeat
+pos_labels_test = list(repeat(1, len(pos_sens_test)))
+pos_labels_train = list(repeat(1, len(pos_sens_train)))
+
+df_test_pos = pandas.DataFrame(data={"sentences": pos_sens_test, "labels": pos_labels_test})
+df_test_pos.to_csv("./test_pos_sens_31_05.csv", sep=',', index=False)
+
+df_train_pos = pandas.DataFrame(data={"sentences": pos_sens_train, "labels": pos_labels_train})
+df_train_pos.to_csv("./train_pos_sens_31_05.csv", sep=',', index=False)
+
+df_test_new = pandas.DataFrame(data={"sentences": sens_new})
+df_test_new.to_csv("./test_pos_new_sens_31_05.csv", sep=',', index=False)
+
+df_train_new = pandas.DataFrame(data={"sentences": sens_new_train})
+df_train_new.to_csv("./train_pos_new_sens_31_05.csv", sep=',', index=False)
+
+list_train = df_train_pos.values.tolist()
+list_test = df_test_pos.values.tolist()
+
+list_train_sens = [x[0] for x in list_train]
+list_test_sens = [x[0] for x in list_test]
+
+len(list_test_sens)
+
+resulting_list_1_train = list_train_sens
+
+resulting_list_1_test = list_test_sens
+
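+# Sanity check (sketch): the train and test positive sets come from disjoint sets of
+# novels, so they should share no (or almost no) sentences.
+print(len(set(resulting_list_1_train) & set(resulting_list_1_test)))
+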
+"""#8. Find the negative labels"""
+
+# Use scipy's cosine distance (similarity) to find, for each query sentence, the 10 most
+# different (farthest) corpus sentences; these become the negative labels for the next step
+# (supervised "classification").
+closest_n = 10
+
+# For the train dataset
+# list of the 10 farthest corpus sentences for every query sentence
+sen_plus_sim_0_train = []  # (sentence, score)
+sen_moins_0_train = []  # list of the sentences whose score is below 0 (sentence, score)
+for query, query_embedding in zip(queries_train, query_embeddings_train):
+    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0]
+    results = zip(range(len(distances)), distances)
+    results = sorted(results, key=lambda x: x[1])
+    print("\n\n======================\n\n")
+    print("Query:", query)
+    print("\n10 most different sentences in corpus:")
+    for idx, distance in results[-closest_n:]:
+        sen_plus_sim_0_train.append((corpus_train[idx].strip(), 1-distance))
+        if (1-distance) <= 0:
+            sen_moins_0_train.append((corpus_train[idx].strip(), 1-distance))
+        print(corpus_train[idx].strip(), '\n', "(Score: %.4f)" % (1-distance), '\n', idx)
+
+# For the test dataset
+# list of the 10 farthest corpus sentences for every query sentence
+sen_plus_sim_0_test = []  # (sentence, score)
+sen_moins_0_test = []  # list of the sentences whose score is below 0 (sentence, score)
+for query, query_embedding in zip(queries_test, query_embeddings_test):
+    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0]
+    results = zip(range(len(distances)), distances)
+    results = sorted(results, key=lambda x: x[1])
+    print("\n\n======================\n\n")
+    print("Query:", query)
+    print("\n10 most different sentences in corpus:")
+    for idx, distance in results[-closest_n:]:
+        sen_plus_sim_0_test.append((corpus_test[idx].strip(), 1-distance))
+        if (1-distance) <= 0:
+            sen_moins_0_test.append((corpus_test[idx].strip(), 1-distance))
+        print(corpus_test[idx].strip(), '\n', "(Score: %.4f)" % (1-distance), '\n', idx)
+
+# For the train dataset
+print(len(sen_moins_0_train))
+
+# Keep the (sentence, score) pairs that are not duplicated
+sen_moins_0_without_dup_train = list(set(sen_moins_0_train))
+print(len(sen_moins_0_without_dup_train))
+
+# Just the sentences of the (sentence, score) pairs
+sen_jus_0_train = [x[0] for x in sen_moins_0_without_dup_train]
+
+# Just the sentences without duplicates
+sen_jus_0_without_dup_train = list(set(sen_jus_0_train))
+print(len(sen_jus_0_without_dup_train))
+
+# For the test dataset (same as for the train dataset)
+
+print(len(sen_moins_0_test))
+
+sen_moins_0_without_dup_test = list(set(sen_moins_0_test))
+print(len(sen_moins_0_without_dup_test))
+
+sen_jus_0_test = [x[0] for x in sen_moins_0_without_dup_test]
+
+sen_jus_0_without_dup_test = list(set(sen_jus_0_test))
+print(len(sen_jus_0_without_dup_test))
+
+# For the train dataset
+print(len(sen_plus_sim_0_train))
+
+# The (sentence, score) pairs without duplicates
+print(len(list(set(sen_plus_sim_0_train))))
+
+# Just the sentences of the (sentence, score) pairs
+sen_jus_0_train = [x[0] for x in sen_plus_sim_0_train]
+print(len(sen_jus_0_train))
+
+# Just the duplicated sentences
+dupes_sen_plus_sim_0_train = [x for n, x in enumerate(sen_jus_0_train) if x in sen_jus_0_train[:n]]
+print(len(dupes_sen_plus_sim_0_train))
+
+# Just the sentences without duplicates
+sen_jus_without_rep_0_train = list(set(sen_jus_0_train))
+print(len(sen_jus_without_rep_0_train))
+
+# For the test dataset (same as for the train dataset)
+
+print(len(sen_plus_sim_0_test))
+
+print(len(list(set(sen_plus_sim_0_test))))
+
+sen_jus_0_test = [x[0] for x in sen_plus_sim_0_test]
+print(len(sen_jus_0_test))
+
+dupes_sen_plus_sim_0_test = [x for n, x in enumerate(sen_jus_0_test) if x in sen_jus_0_test[:n]]
+print(len(dupes_sen_plus_sim_0_test))
+
+sen_jus_without_rep_0_test = list(set(sen_jus_0_test))
+print(len(sen_jus_without_rep_0_test))
+
+# Merge the two lists to get the final list of negative labels for the train dataset
+resulting_list_0_train = list(set(sen_jus_without_rep_0_train + sen_jus_0_without_dup_train))
+len(resulting_list_0_train)
+
+# Merge the two lists to get the final list of negative labels for the test dataset
+resulting_list_0_test = list(set(sen_jus_without_rep_0_test + sen_jus_0_without_dup_test))
+len(resulting_list_0_test)
+
+"""#9. Export the lists of sentences into CSV files"""
+
+len(resulting_list_1_train)
+
+# Sentence list for the train dataset (negative labels ~ positive labels * 3)
+num_neg = len(resulting_list_1_train) * 3
+sens_list_train = resulting_list_1_train + resulting_list_0_train[:num_neg]
+len(sens_list_train)
+
+len(resulting_list_1_test)
+
+# Sentence list for the test dataset (negative labels ~ positive labels * 3)
+num_neg_test = len(resulting_list_1_test) * 3
+sens_list_test = resulting_list_1_test + resulting_list_0_test[:num_neg_test]
+len(sens_list_test)
+
+# Build the labels of the train dataset
+leb_liste_train = []
+
+for i in range(len(sens_list_train)):
+    if i < len(resulting_list_1_train):
+        leb_liste_train.append('1')
+    else:
+        leb_liste_train.append('0')
+
+# For the test dataset
+
+leb_liste_test = []
+
+for i in range(len(sens_list_test)):
+    if i < len(resulting_list_1_test):
+        leb_liste_test.append('1')
+    else:
+        leb_liste_test.append('0')
+
+len(leb_liste_train)
+
+import pandas
+df_train = pandas.DataFrame(data={"sentences": sens_list_train, "labels": leb_liste_train})
+df_train.to_csv("./train_final_31_05.csv", sep=',', index=False)
+
+df_test = pandas.DataFrame(data={"sentences": sens_list_test, "labels": leb_liste_test})
+df_test.to_csv("./test_final_31_05.csv", sep=',', index=False)
+
+"""#10. Sample the datasets: we do this to randomize (shuffle) the two datasets"""
+
+import pandas
+df_test_new = pandas.read_csv("/content/test_final_31_05.csv")
+df_test_new.info()
+
+df_train_egal = pandas.read_csv("/content/train_final_31_05.csv")
+df_train_egal.info()
+
+df_test_sam = df_test_new.sample(2412)
+
+df_train_egal_sam = df_train_egal.sample(4284)
+
+df_test_sam.to_csv("./test_sample_final_31_05.csv", sep=',', index=False)
+df_train_egal_sam.to_csv("./train_sample_final_31_05.csv", sep=',', index=False)
\ No newline at end of file