# -*- coding: utf-8 -*-
"""sentences_transformers_fin_23_06_fin.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15-DLVwOGMef8wpk6TLK9Qb5BiRyaoanj
#1. Install and import sentence_transformers
"""
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
import scipy.spatial
"""#2. Download the novels
## 2.1. The novels to create the train dataset (4 novels)
"""
f_assommoir = open("/content/assommoir.txt")
f_bonheur = open("/content/bonheur.txt")
f_nana = open("/content/nana.txt")
f_oeuvre = open("/content/oeuvre.txt")
content_french_assommoir = f_assommoir.read()
content_french_bonheur = f_bonheur.read()
content_french_nana = f_nana.read()
content_french_oeuvre = f_oeuvre.read()
"""##2.2. The novels to create test dataset (3 novels)"""
f_sanscravate = open("/content/sanscravate.txt")
f_jack = open("/content/jack.txt")
f_potbouille = open("/content/potbouille.txt")
content_french_sanscravate = f_sanscravate.read()
content_french_jack = f_jack.read()
content_french_potbouille = f_potbouille.read()
"""#3. Transform the novels into lists of sentences"""
import nltk
nltk.download('punkt')
# Function to split a text into sentences using the "re" library
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr|Il\s|Elle\s|It\s|Ils\s|Elle\s|Leur\s|Notre\s|Nous\s|On\s|Mais\s|Cependant\s|Ce\s|Cette\s|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
def split_into_sentences(text):
    text = " " + text + " "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    # Collapse "....." into "." (kept disabled; handled later by del_espace)
    # text = re.sub('[.]+', '.', text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
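# Quick illustration of the splitter on a made-up French example (not from the novels):
# split_into_sentences("Mme. Durand sortit. Il pleuvait!")
# -> ['Mme. Durand sortit.', 'Il pleuvait!']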
# Function to collapse a repeated dot ("...") into a single dot
def del_espace(x):
    newstr = x
    newstr = re.sub('[.]+', '.', newstr)
    return newstr
"""### The sentences of train dataset"""
splited_sentences_bonheur = split_into_sentences(content_french_bonheur)
splited_sentences_assommoir = split_into_sentences(content_french_assommoir)
splited_sentences_nana = split_into_sentences(content_french_nana)
splited_sentences_oeuvre = split_into_sentences(content_french_oeuvre)
print("nombre de phrases du romans assommoir ", len(splited_sentences_assommoir))
print("nombre de phrases du romans bonheur ", len(splited_sentences_bonheur))
print("nombre de phrases du romans nana ", len(splited_sentences_nana))
print("nombre de phrases du romans oeuvre ", len(splited_sentences_oeuvre))
corpus_train_pre = list(set(splited_sentences_bonheur + splited_sentences_assommoir + splited_sentences_nana + splited_sentences_oeuvre))
len(corpus_train_pre) # 10203 + 7083 = 17286, but here we get 16587 because the duplicated elements are not kept
corpus_train = []
for i in corpus_train_pre:
    corpus_train.append(del_espace(i))
"""### The sentences of test dataset"""
splited_sentences_jack = split_into_sentences(content_french_jack)
splited_sentences_potbouille = split_into_sentences(content_french_potbouille)
splited_sentences_sanscravate = split_into_sentences(content_french_sanscravate)
print("nombre de phrases du romans potbouille ", len(splited_sentences_potbouille))
print("nombre de phrases du romans jack ", len(splited_sentences_jack))
print("nombre de phrases du romans sanscravate ", len(splited_sentences_sanscravate))
corpus_test = list(set(splited_sentences_potbouille + splited_sentences_jack + splited_sentences_sanscravate))
len(corpus_test) # 10687 + 8812 = 19599, but here we get 18638 because the duplicated elements are not kept
"""#4. Retrieve sentences from XML files to build queries using BeautifulSoup library"""
import bs4
import lxml
# Import BeautifulSoup
from bs4 import BeautifulSoup as bs
content_assommoir = []
# Read the XML file
with open("/content/assommoir_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_assommoir = file.readlines()
    # Combine the lines in the list into a string
    content_assommoir = "".join(content_assommoir)
bs_content_assommoir = bs(content_assommoir, "lxml")
content_bonheur = []
with open("/content/bonheur_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_bonheur = file.readlines()
    # Combine the lines in the list into a string
    content_bonheur = "".join(content_bonheur)
bs_content_bonheur = bs(content_bonheur, "lxml")
content_nana = []
with open("/content/nana_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_nana = file.readlines()
    # Combine the lines in the list into a string
    content_nana = "".join(content_nana)
bs_content_nana = bs(content_nana, "lxml")
content_oeuvre = []
with open("/content/oeuvre_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_oeuvre = file.readlines()
    # Combine the lines in the list into a string
    content_oeuvre = "".join(content_oeuvre)
bs_content_oeuvre = bs(content_oeuvre, "lxml")
content_jack = []
with open("/content/jack_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_jack = file.readlines()
    # Combine the lines in the list into a string
    content_jack = "".join(content_jack)
bs_content_jack = bs(content_jack, "lxml")
content_sanscravate = []
with open("/content/sanscravate_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_sanscravate = file.readlines()
    # Combine the lines in the list into a string
    content_sanscravate = "".join(content_sanscravate)
bs_content_sanscravate = bs(content_sanscravate, "lxml")
content_potbouille = []
with open("/content/potbouille_TEI_perdido.xml", "r") as file:
    # Read each line in the file, readlines() returns a list of lines
    content_potbouille = file.readlines()
    # Combine the lines in the list into a string
    content_potbouille = "".join(content_potbouille)
bs_content_potbouille = bs(content_potbouille, "lxml")
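# The seven blocks above all follow the same read-and-parse pattern. For reference, a small
# helper that does the same thing (a sketch only; the rest of the notebook keeps using the
# variables already defined above):
def read_tei(path):
    # Read a TEI XML file and parse it with BeautifulSoup's lxml parser
    with open(path, "r") as f:
        return bs(f.read(), "lxml")
# e.g. bs_content_assommoir = read_tei("/content/assommoir_TEI_perdido.xml")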
#For train dataset
result_assommoir = bs_content_assommoir.find_all("rs", {"type": "place"})
print(len(result_assommoir))
result_bonheur = bs_content_bonheur.find_all("rs", {"type": "place"})
print(len(result_bonheur))
result_nana = bs_content_nana.find_all("rs", {"type": "place"})
print(len(result_nana))
result_oeuvre = bs_content_oeuvre.find_all("rs", {"type": "place"})
print(len(result_oeuvre))
#For test dataset
result_potbouille = bs_content_potbouille.find_all("rs", {"type": "place"})
print(len(result_potbouille))
result_jack = bs_content_jack.find_all("rs", {"type": "place"})
print(len(result_jack))
result_sanscravate = bs_content_sanscravate.find_all("rs", {"type": "place"})
print(len(result_sanscravate))
# Functions to parse XML
result_s = []
def check_motion(l):
    # Return True if the element contains a <motion> descendant
    # (kept for reference; sentences_list() below performs the same check inline)
    if l.findChild("motion") is not None:
        return True
    else:
        return False
def del_espace(x):
    # Clean an extracted sentence (x is a one-element list): remove the spaces introduced
    # around apostrophes and punctuation, and collapse repeated dots.
    # Note: this redefines the del_espace() used on the raw novels above.
    newstr = x[0].replace("' ", "'")
    newstr1 = newstr.replace(" ,", ",")
    newstr2 = newstr1.replace(" .", ".")
    newstr3 = newstr2.replace(" ?", "?")
    newstr4 = newstr3.replace(" !", "!")
    newstr4 = re.sub('[.]+', '.', newstr4)
    return newstr4
def get_sentence(x):
    result1 = x.parent
    while result1.name != 's':
        result1 = result1.parent
    sentence = [' '.join(result1.text.split())]
    return del_espace(sentence)
def get_sentence_parent(x):
    result1 = x.parent
    print(type(result1))
    while result1.name != 's':
        result1 = result1.parent
    result_s.append(result1)
    return result_s
def sentences_list(y):
    sen_list = []
    for i in y:
        parent_s = i.findParent("s")
        mo = parent_s.findChild("motion")
        # sen_list.append(get_sentence(i))
        if mo:
            sentence = [' '.join(parent_s.text.split())]
            sen_list.append(del_espace(sentence))
    return sen_list
def sentences_list_parents(y):
    sen_list = []
    for i in y:
        sen_list.append(get_sentence_parent(i))
    return sen_list
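# In short: sentences_list() keeps only the sentences (<s> elements) that contain both a place
# mention (the <rs type="place"> nodes passed in as y) and a <motion> element, i.e. sentences
# describing movement in relation to a place; get_sentence() / get_sentence_parent() are helpers
# that climb from an <rs> node up to its enclosing <s> element.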
#The queries of the train dataset
queries_train = list(set(sentences_list(result_assommoir) + sentences_list(result_bonheur) + sentences_list(result_nana) + sentences_list(result_oeuvre)))
len(queries_train)
#The queries of the test dataset
queries_test = list(set(sentences_list(result_potbouille) + sentences_list(result_sanscravate) + sentences_list(result_jack)))
len(queries_test)
"""#5. Generate (create) an Embedding using SentenceTransformer"""
# Using the model "distiluse-base-multilingual-cased" to create the embeddings of the sentences of
# corpus and query for both train and test datasets
embedder = SentenceTransformer('distiluse-base-multilingual-cased')
# Corpus sentences
corpus_embeddings_train = embedder.encode(corpus_train)
corpus_embeddings_test = embedder.encode(corpus_test)
# Query sentences:
query_embeddings_train = embedder.encode(queries_train)
query_embeddings_test = embedder.encode(queries_test)
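# encode() maps each sentence to a dense vector (512 dimensions for
# "distiluse-base-multilingual-cased"), so every *_embeddings_* object above holds one vector
# per input sentence, in the same order as the corresponding sentence list.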
"""#6. Find the average score for each sentence with all other sentences in the query
**Semantic Search **: Semantic search is the task of finding similar sentences to a given sentence.
We apply this task on our embeddings (corpus, query)
"""
from statistics import mean
# Here we use cosine similarity to measure how similar the query sentences are to each other
closest_n = len(queries_train) # I had chosen 12 because "12" gave the best results (the limit for the novels I chose); here the full set of query sentences is used
#For train dataset
sum_score_train = []
i = 0
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_train, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nMost similar sentences among the query sentences:")
    sum_score_i = []
    for idx, distance in results[1:closest_n]:
        sum_score_i.append(1-distance)
        print(queries_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    sum_score_train.append((queries_train[i].strip(), mean(sum_score_i)))
    i = i + 1
#For test dataset
closest_n = len(queries_test)
sum_score_test = []
i= 0
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_test, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nMost similar sentences among the query sentences:")
    sum_score_i = []
    for idx, distance in results[1:closest_n]:
        sum_score_i.append(1-distance)
        print(queries_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    sum_score_test.append((queries_test[i].strip(), mean(sum_score_i)))
    i = i + 1
query_sentences_train = [sen_q[0] for sen_q in sum_score_train]
avg_score_sen_train = [score_q[1] for score_q in sum_score_train]
query_sentences_test = [sen_q[0] for sen_q in sum_score_test]
avg_score_sen_test = [score_q[1] for score_q in sum_score_test]
import pandas
df = pandas.DataFrame(data={"sentences": query_sentences_test, "avg": avg_score_sen_test})
df.to_csv("./query_score_test.csv", sep=',',index=False)
# Second step: group the query sentences
df_train = pandas.read_csv("/content/train_pos_sort_dup_plus_4.csv")
df_train.info()
print(df_train[df_train['avg']==df_train['avg'].min()]) # minimum average score
print(df_train[df_train['avg']==df_train['avg'].max()]) # maximum average score
print(df_train[df_train['avg']<0.1].get('sentences'))
"""#7. Find the positive labels"""
# We use scipy with cosine distance (similarity) function to find the 10 most-similar embeddings (sentences) for queries in the corpus:
# Here we create the positive labels for the next step (supervised "classification")
closest_n = 10
#For train dataset
# list of all the 10 closest sentences of the corpus with all sentences of query
sen_plus_sim_1_train = [] # (sentence, score, query sentence)
#list of the sentences that have score more than 0.6
sen_plus_06_train = [] # (sentence, score, Idx of query sentence)
i = 0 # Idx for the sentences of the query
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n======================\n")
    print("Query:", query)
    print("\nTop 10 most similar sentences in corpus:")
    for idx, distance in results[0:closest_n]:
        sen_plus_sim_1_train.append((corpus_train[idx].strip(), 1-distance, query))
        if (1-distance) >= 0.6:
            sen_plus_06_train.append((corpus_train[idx].strip(), 1-distance, i))
        print(corpus_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    i = i + 1
#For test dataset
sen_plus_sim_1_test = [] # list of all the 10 closest sentences of the corpus for all sentences of the query
sen_plus_06_test = [] # list of the sentences that have score more than 0.6
i = 0
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most similar sentences in corpus:")
    for idx, distance in results[0:closest_n]:
        sen_plus_sim_1_test.append((corpus_test[idx].strip(), 1-distance, query))
        if (1-distance) >= 0.6:
            sen_plus_06_test.append((corpus_test[idx].strip(), 1-distance, i))
        print(corpus_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
    i = i + 1
# Function to count the number of occurrences of each element in a list
def getDuplicatesWithCount(listOfElems):
    ''' Get the frequency count of the elements in the given list '''
    dictOfElems = dict()
    # Iterate over each element in the list
    for elem in listOfElems:
        # If the element already exists in the dict, increment its value, else add it
        if elem in dictOfElems:
            dictOfElems[elem] += 1
        else:
            dictOfElems[elem] = 1
    # Keep every pair (value >= 1), i.e. the full frequency count, not only the duplicates
    dictOfElems = { key:value for key, value in dictOfElems.items() if value >= 1}
    # Return a dict mapping each element to its frequency count
    return dictOfElems
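# Note: the standard library offers the same frequency count directly; an equivalent
# alternative (not used below) would be:
from collections import Counter
# dict(Counter(listOfElems)) gives the same mapping as getDuplicatesWithCount(listOfElems)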
#For train dataset
print(len(sen_plus_sim_1_train))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_1_train))))
# just the sentences of pairs(sentence, score)
sen_jus_1_train = [x[0] for x in sen_plus_sim_1_train]
print(len(sen_jus_1_train))
# just the duplicated sentences
dupes_sen_plus_sim_rep_1_train = [x for n, x in enumerate(sen_jus_1_train) if x in sen_jus_1_train[:n]]
dupes_sen_plus_sim_1_train = list(set(dupes_sen_plus_sim_rep_1_train))
print(len(dupes_sen_plus_sim_1_train))
# just the sentences not duplicated
sen_jus_without_rep_1_train = list(set(sen_jus_1_train))
print(len(sen_jus_without_rep_1_train))
#For test dataset
print(len(sen_plus_sim_1_test))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_1_test))))
# just the sentences of pairs(sentence, score)
sen_jus_1_test = [x[0] for x in sen_plus_sim_1_test]
print(len(sen_jus_1_test))
# just the duplicated sentences
dupes_sen_plus_sim_rep_1_test = [x for n, x in enumerate(sen_jus_1_test) if x in sen_jus_1_test[:n]]
dupes_sen_plus_sim_1_test = list(set(dupes_sen_plus_sim_rep_1_test))
print(len(dupes_sen_plus_sim_1_test))
# just the sentences not duplicated
sen_jus_without_rep_1_test = list(set(sen_jus_1_test))
print(len(sen_jus_without_rep_1_test))
# Get a dictionary containing duplicate elements in list and their frequency count
dic_sens_train = []
dictOfElems_train = getDuplicatesWithCount(sen_jus_1_train)
for key, value in dictOfElems_train.items():
    if value == 1:
        print('****************************************************************')
        print(key, ' :: ', value)
        dic_sens_train.append((key, value))
len(dic_sens_train)
repetations_train = []
for sen in sen_jus_1_train:
    repetations_train.append(dictOfElems_train.get(sen, ""))
len(repetations_train)
len(dictOfElems_train)
len(list(set(dic_sens_train)-set(queries_train)))
# Get a dictionary containing duplicate elements in list sen_jus_1_test and their frequency count
dic_sens_test = []
dictOfElems_test = getDuplicatesWithCount(sen_jus_1_test)
for key, value in dictOfElems_test.items():
    if value == 1:
        print('****************************************************************')
        print(key, ' :: ', value)
        dic_sens_test.append((key, value))
len(dic_sens_test)
repetations_test = []
for sen in sen_jus_1_test:
    repetations_test.append(dictOfElems_test.get(sen, ""))
len(list(set(dic_sens_test)-set(queries_test)))
len(dictOfElems_test)
# Create a dataset just for the positive labels, with the columns (sentences, labels, nb_duplications, query_sentence, sen_score)
# Prepare the query sentences for train and test datasets
sentences_query_test = [sen_q[2] for sen_q in sen_plus_sim_1_test]
sentences_query_train = [sen_q[2] for sen_q in sen_plus_sim_1_train]
# Prepare the labels (here just 1)
leb_liste_train = []
for i in range(len(sen_jus_1_train)):
    leb_liste_train.append('1')
leb_liste_test = []
for i in range(len(sen_jus_1_test)):
    leb_liste_test.append('1')
score_sens = [x[1] for x in sen_plus_sim_1_test]
score_sens_train = [x[1] for x in sen_plus_sim_1_train]
import pandas
df_test = pandas.DataFrame(data={"sentences": sen_jus_1_test, "labels": leb_liste_test, "nb_duplications": repetations_test,
                                 "query_sentence": sentences_query_test, "sen_score": score_sens})
df_test.to_csv("./test_31_05.csv", sep=',', index=False)
df_train = pandas.DataFrame(data={"sentences": sen_jus_1_train, "labels": leb_liste_train, "nb_duplications": repetations_train,
                                  "query_sentence": sentences_query_train, "sen_score": score_sens_train})
df_train.to_csv("./train_31_05.csv", sep=',',index=False)
df_test = pandas.read_csv("/content/test_31_05.csv")
df_test.info()
df_train = pandas.read_csv("/content/train_31_05.csv")
df_train.info()
#Take just the sentences that are duplicated more than 4 times
df_test_dup_5 = df_test.loc[df_test['nb_duplications'] > 4]
df_train_dup_5 = df_train.loc[df_train['nb_duplications'] > 4]
len(df_test_dup_5)
len(df_train_dup_5)
# Remove the duplication of sentences
df_corpus_test = df_test_dup_5.drop_duplicates('sentences', keep='last')
df_corpus_train = df_train_dup_5.drop_duplicates('sentences', keep='last')
len(df_corpus_test)
len(df_corpus_train)
#Transform to list
sens_corpus_test = df_corpus_test['sentences'].tolist()
sens_corpus_train = df_corpus_train['sentences'].tolist()
len(list(set(sens_corpus_test + queries_test)))
len(list(set(sens_corpus_train + queries_train)))
pos_sens_test = list(set(sens_corpus_test + queries_test))
pos_sens_train = list(set(sens_corpus_train + queries_train))
sens_new = list(set(pos_sens_test) - set(queries_test))
sens_new_train = list(set(pos_sens_train) - set(queries_train))
len(sens_new)
len(sens_new_train)
from itertools import repeat
pos_labels_test = list(repeat(1, len(pos_sens_test)))
pos_labels_train = list(repeat(1, len(pos_sens_train)))
df_test_pos = pandas.DataFrame(data={"sentences": pos_sens_test, "labels": pos_labels_test})
df_test_pos.to_csv("./test_pos_sens_31_05.csv", sep=',',index=False)
df_train_pos = pandas.DataFrame(data={"sentences": pos_sens_train, "labels": pos_labels_train})
df_train_pos.to_csv("./train_pos_sens_31_05.csv", sep=',',index=False)
df_test_new = pandas.DataFrame(data={"sentences": sens_new})
df_test_new.to_csv("./test_pos_new_sens_31_05.csv", sep=',',index=False)
df_train_new = pandas.DataFrame(data={"sentences": sens_new_train})
df_train_new.to_csv("./train_pos_new_sens_31_05.csv", sep=',',index=False)
list_train = df_train_pos.values.tolist()
list_test = df_test_pos.values.tolist()
list_train_sens = [x[0] for x in list_train]
list_test_sens = [x[0] for x in list_test]
len(list_test_sens)
resulting_list_1_train = list_train_sens
resulting_list_1_test = list_test_sens
"""#8. Fine the negatives labels"""
# We use scipy with cosine distance (similarity) function to find the 10 most-different embeddings (sentences) for queries in the corpus:
# Here we create the negative labels for the next step (supervised "classification")
closest_n = 10
#For train dataset
# list of all the 10 farthest sentences of the corpus with all sentences of query
sen_plus_sim_0_train = [] # (sentence, score)
sen_moins_0_train = [] # list of sentences that have score less than 0 (sentence, score)
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most dissimilar sentences in corpus:")
    for idx, distance in results[-closest_n:]:
        sen_plus_sim_0_train.append((corpus_train[idx].strip(), 1-distance))
        if (1-distance) <= 0:
            sen_moins_0_train.append((corpus_train[idx].strip(), 1-distance))
        print(corpus_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
#For test dataset
# list of all the 10 farthest sentences of the corpus with all sentences of query
sen_plus_sim_0_test = [] # (sentence, score)
sen_moins_0_test = [] # list of sentences that have score less than 0 (sentence, score)
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 10 most dissimilar sentences in corpus:")
    for idx, distance in results[-closest_n:]:
        sen_plus_sim_0_test.append((corpus_test[idx].strip(), 1-distance))
        if (1-distance) <= 0:
            sen_moins_0_test.append((corpus_test[idx].strip(), 1-distance))
        print(corpus_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx)
#For train dataset
print(len(sen_moins_0_train))
# Here we keep the (sentence, score) pairs that are not duplicated
sen_moins_0_without_dup_train = list(set(sen_moins_0_train))
print(len(sen_moins_0_without_dup_train))
# just the sentences of pairs (sentence, score)
sen_jus_0_train = [x[0] for x in sen_moins_0_without_dup_train]
# just the sentences without duplications
sen_jus_0_without_dup_train = list(set(sen_jus_0_train))
print(len(sen_jus_0_without_dup_train))
# For the test dataset (we do the same as for the train dataset)
print(len(sen_moins_0_test))
sen_moins_0_without_dup_test = list(set(sen_moins_0_test))
print(len(sen_moins_0_without_dup_test))
sen_jus_0_test = [x[0] for x in sen_moins_0_without_dup_test]
sen_jus_0_without_dup_test = list(set(sen_jus_0_test))
print(len(sen_jus_0_without_dup_test))
#For train dataset
print(len(sen_plus_sim_0_train))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_0_train))))
# just the sentences of the (sentence, score) pairs
sen_jus_0_train = [x[0] for x in sen_plus_sim_0_train]
print(len(sen_jus_0_train))
# just the duplicated sentences
dupes_sen_plus_sim_0_train = [x for n, x in enumerate(sen_jus_0_train) if x in sen_jus_0_train[:n]]
print(len(dupes_sen_plus_sim_0_train))
# just the sentences not duplicated
sen_jus_without_rep_0_train = list(set(sen_jus_0_train))
print(len(sen_jus_without_rep_0_train))
# For the test dataset (we do the same as for the train dataset)
print(len(sen_plus_sim_0_test))
print(len(list(set(sen_plus_sim_0_test))))
sen_jus_0_test = [x[0] for x in sen_plus_sim_0_test]
print(len(sen_jus_0_test))
dupes_sen_plus_sim_0_test = [x for n, x in enumerate(sen_jus_0_test) if x in sen_jus_0_test[:n]]
print(len(dupes_sen_plus_sim_0_test))
sen_jus_without_rep_0_test = list(set(sen_jus_0_test))
print(len(sen_jus_without_rep_0_test))
# Merge the two lists to get the best list of negative labels for the train dataset
resulting_list_0_train = list(set(sen_jus_without_rep_0_train + sen_jus_0_without_dup_train))
len(resulting_list_0_train)
# Merge the two lists to get the best list of negative labels for the test dataset
resulting_list_0_test = list(set(sen_jus_without_rep_0_test + sen_jus_0_without_dup_test))
len(resulting_list_0_test)
"""#9. Export the lists of sentences into CSV files"""
len(resulting_list_1_train)
# Sentence list for the train dataset (negative labels ≈ positive labels * 3)
num_neg = len(resulting_list_1_train) * 3
sens_list_train = resulting_list_1_train + resulting_list_0_train[:num_neg]
len(sens_list_train)
len(resulting_list_1_test)
# Sentence list for the test dataset (negative labels ≈ positive labels * 3)
num_neg_test = len(resulting_list_1_test) * 3
sens_list_test = resulting_list_1_test + resulting_list_0_test[:num_neg_test]
len(sens_list_test)
#Build the labels for the train dataset
leb_liste_train = []
for i in range(len(sens_list_train)):
    if i < len(resulting_list_1_train):
        leb_liste_train.append('1')
    else:
        leb_liste_train.append('0')
#For test dataset
leb_liste_test = []
for i in range(len(sens_list_test)):
    if i < len(resulting_list_1_test):
        leb_liste_test.append('1')
    else:
        leb_liste_test.append('0')
len(leb_liste_train)
import pandas
df_train = pandas.DataFrame(data={"sentences": sens_list_train, "labels": leb_liste_train})
df_train.to_csv("./train_final_31_05.csv", sep=',',index=False)
df_test = pandas.DataFrame(data={"sentences": sens_list_test, "labels": leb_liste_test})
df_test.to_csv("./test_final_31_05.csv", sep=',',index=False)
"""#10. Do samples of datasets : we do that to randomize the two datasets"""
import pandas
df_test_new = pandas.read_csv("/content/test_final_31_05.csv")
df_test_new.info()
df_train_egal = pandas.read_csv("/content/train_final_31_05.csv")
df_train_egal.info()
df_test_sam = df_test_new.sample(2412)
df_train_egal_sam = df_train_egal.sample(4284)
df_test_sam.to_csv("./test_sample_final_31_05.csv", sep=',',index=False)
df_train_egal_sam.to_csv("./train_sample_final_31_05.csv", sep=',',index=False)
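# Note: DataFrame.sample() draws a different random subset on every run; passing a fixed seed
# makes the sampled train/test files reproducible (optional tweak, not part of the original run):
# df_test_sam = df_test_new.sample(2412, random_state=42)
# df_train_egal_sam = df_train_egal.sample(4284, random_state=42)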