# -*- coding: utf-8 -*-
"""sentences_transformers_fin_23_06_fin.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/15-DLVwOGMef8wpk6TLK9Qb5BiRyaoanj

# 1. Install and import sentence_transformers
"""

!pip install sentence_transformers

from sentence_transformers import SentenceTransformer
import scipy.spatial

"""# 2. Download the novels

## 2.1. The novels used to build the train dataset (4 novels)
"""

f_assommoir = open("/content/assommoir.txt")
f_bonheur = open("/content/bonheur.txt")
f_nana = open("/content/nana.txt")
f_oeuvre = open("/content/oeuvre.txt")

content_french_assommoir = f_assommoir.read()
content_french_bonheur = f_bonheur.read()
content_french_nana = f_nana.read()
content_french_oeuvre = f_oeuvre.read()

"""## 2.2. The novels used to build the test dataset (3 novels)"""

f_sanscravate = open("/content/sanscravate.txt")
f_jack = open("/content/jack.txt")
f_potbouille = open("/content/potbouille.txt")

content_french_sanscravate = f_sanscravate.read()
content_french_jack = f_jack.read()
content_french_potbouille = f_potbouille.read()

"""# 3. Transform the novels into lists of sentences"""

import nltk
nltk.download('punkt')

# Function to split a text into sentences using the "re" library
import re

alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|St|Mme|Mlle|Mrs|Ms|Dr|Il\s|Elle\s|It\s|Ils\s|Elle\s|Leur\s|Notre\s|Nous\s|On\s|Mais\s|Cependant\s|Ce\s|Cette\s|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"

def split_into_sentences(text):
    text = " " + text + " "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    # Collapse "....." into "." (disabled here, done later by del_espace):
    # text = re.sub('[.]+', '.', text)
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
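# Quick illustrative check of the splitter (a sketch only: the string below is made up,
# not taken from the corpus). It should yield three sentences, with the "Mme."
# abbreviation kept intact instead of being treated as a sentence boundary.
print(split_into_sentences("Mme. Lerat arriva vers midi. Elle apporta des fleurs ! Était-ce trop ?"))
# should print: ['Mme. Lerat arriva vers midi.', 'Elle apporta des fleurs !', 'Était-ce trop ?']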
# Function to collapse repeated dots ("...") into a single dot
def del_espace(x):
    newstr = x
    newstr = re.sub('[.]+', '.', newstr)
    return newstr

"""### The sentences of the train dataset"""

splited_sentences_bonheur = split_into_sentences(content_french_bonheur)
splited_sentences_assommoir = split_into_sentences(content_french_assommoir)
splited_sentences_nana = split_into_sentences(content_french_nana)
splited_sentences_oeuvre = split_into_sentences(content_french_oeuvre)

print("Number of sentences in the novel assommoir:", len(splited_sentences_assommoir))
print("Number of sentences in the novel bonheur:", len(splited_sentences_bonheur))
print("Number of sentences in the novel nana:", len(splited_sentences_nana))
print("Number of sentences in the novel oeuvre:", len(splited_sentences_oeuvre))

corpus_train_pre = list(set(splited_sentences_bonheur + splited_sentences_assommoir + splited_sentences_nana + splited_sentences_oeuvre))
len(corpus_train_pre)  # 10203 + 7083 = 17286, but here we get 16587 because duplicated sentences are removed

corpus_train = []
for i in corpus_train_pre:
    corpus_train.append(del_espace(i))

"""### The sentences of the test dataset"""

splited_sentences_jack = split_into_sentences(content_french_jack)
splited_sentences_potbouille = split_into_sentences(content_french_potbouille)
splited_sentences_sanscravate = split_into_sentences(content_french_sanscravate)

print("Number of sentences in the novel potbouille:", len(splited_sentences_potbouille))
print("Number of sentences in the novel jack:", len(splited_sentences_jack))
print("Number of sentences in the novel sanscravate:", len(splited_sentences_sanscravate))

corpus_test = list(set(splited_sentences_potbouille + splited_sentences_jack + splited_sentences_sanscravate))
len(corpus_test)  # 10687 + 8812 = 19599, but here we get 18638 because duplicated sentences are removed

"""# 4. Retrieve sentences from the XML files to build the queries, using the BeautifulSoup library"""

import bs4
import lxml
# Import BeautifulSoup
from bs4 import BeautifulSoup as bs

# Read each TEI XML file (Perdido output) as a single string and parse it with the lxml parser
with open("/content/assommoir_TEI_perdido.xml", "r") as file:
    content_assommoir = file.read()
bs_content_assommoir = bs(content_assommoir, "lxml")

with open("/content/bonheur_TEI_perdido.xml", "r") as file:
    content_bonheur = file.read()
bs_content_bonheur = bs(content_bonheur, "lxml")

with open("/content/nana_TEI_perdido.xml", "r") as file:
    content_nana = file.read()
bs_content_nana = bs(content_nana, "lxml")

with open("/content/oeuvre_TEI_perdido.xml", "r") as file:
    content_oeuvre = file.read()
bs_content_oeuvre = bs(content_oeuvre, "lxml")

with open("/content/jack_TEI_perdido.xml", "r") as file:
    content_jack = file.read()
bs_content_jack = bs(content_jack, "lxml")
with open("/content/sanscravate_TEI_perdido.xml", "r") as file: # Read each line in the file, readlines() returns a list of lines content_sanscravate = file.readlines() # Combine the lines in the list into a string content_sanscravate = "".join(content_sanscravate) bs_content_sanscravate = bs(content_sanscravate, "lxml") content_potbouille = [] with open("/content/potbouille_TEI_perdido.xml", "r") as file: # Read each line in the file, readlines() returns a list of lines content_potbouille = file.readlines() # Combine the lines in the list into a string content_potbouille = "".join(content_potbouille) bs_content_potbouille = bs(content_potbouille, "lxml") #For train dataset result_assommoir = bs_content_assommoir.find_all("rs", {"type": "place"}) print(len(result_assommoir)) result_bonheur = bs_content_bonheur.find_all("rs", {"type": "place"}) print(len(result_bonheur)) result_nana = bs_content_nana.find_all("rs", {"type": "place"}) print(len(result_nana)) result_oeuvre = bs_content_oeuvre.find_all("rs", {"type": "place"}) print(len(result_oeuvre)) #For test dataset result_potbouille = bs_content_potbouille.find_all("rs", {"type": "place"}) print(len(result_potbouille)) result_jack = bs_content_jack.find_all("rs", {"type": "place"}) print(len(result_jack)) result_sanscravate = bs_content_sanscravate.find_all("rs", {"type": "place"}) print(len(result_sanscravate)) # Functions to parse XML result_s = [] def check_motion(l): if len(l.findchild("motion"))> 0 : return True else : return False def del_espace(x): newstr = x[0].replace("' ", "'") newstr1 = newstr.replace(" ,", ",") newstr2 = newstr1.replace(" .", ".") newstr3 = newstr2.replace(" ?", "?") newstr4 = newstr3.replace(" !", "!") newstr4 = re.sub('[.]+', '.', newstr4) return newstr4 def get_sentence(x): result1 = x.parent while result1.name != 's': result1=result1.parent sentence = [' '.join(result1.text.split())] return del_espace(sentence) def get_sentence_parent(x): result1 = x.parent print(type(result1)) while result1.name != 's': result1=result1.parent result_s.append(result1) return result_s def sentences_list(y): sen_list = [] for i in y: parent_s = i.findParent("s") mo = parent_s.findChild("motion") # sen_list.append(get_sentence(i)) if mo: sentence = [' '.join(parent_s.text.split())] sen_list.append(del_espace(sentence)) return sen_list def sentences_list_parents(y): sen_list = [] for i in y: sen_list.append(get_sentence_parent(i)) return sen_list #The query of train dataset queries_train = list(set(sentences_list(result_assommoir) + sentences_list(result_bonheur) + sentences_list(result_nana) + sentences_list(result_oeuvre))) len(queries_train) #The query of test dataset queries_test = list(set(sentences_list(result_potbouille) + sentences_list(result_sanscravate) + sentences_list(result_jack))) len(queries_test) """#5. Generate (create) an Embedding using SentenceTransformer""" # Using the model "distiluse-base-multilingual-cased" to create the embeddings of the sentences of # corpus and query for both train and test datasets embedder = SentenceTransformer('distiluse-base-multilingual-cased') # Corpus sentences corpus_embeddings_train = embedder.encode(corpus_train) corpus_embeddings_test = embedder.encode(corpus_test) # Query sentences: query_embeddings_train = embedder.encode(queries_train) query_embeddings_test = embedder.encode(queries_test) """#6. 
"""# 5. Generate (create) the embeddings using SentenceTransformer"""

# Use the model "distiluse-base-multilingual-cased" to create the embeddings of the
# corpus and query sentences, for both the train and the test datasets
embedder = SentenceTransformer('distiluse-base-multilingual-cased')

# Corpus sentences
corpus_embeddings_train = embedder.encode(corpus_train)
corpus_embeddings_test = embedder.encode(corpus_test)

# Query sentences
query_embeddings_train = embedder.encode(queries_train)
query_embeddings_test = embedder.encode(queries_test)

"""# 6. Find the average similarity score of each query sentence with all the other query sentences

**Semantic search** is the task of finding the sentences most similar to a given
sentence. We apply it here to our embeddings (corpus and query), using the cosine similarity.
"""

from statistics import mean

# Compare each query sentence with all the other query sentences
closest_n = len(queries_train)

# For the train dataset
sum_score_train = []
i = 0
for query, query_embedding in zip(queries_train, query_embeddings_train):
    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_train, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nMost similar sentences in the query set:")
    sum_score_i = []
    for idx, distance in results[1:closest_n]:  # skip index 0, which is the query itself
        sum_score_i.append(1 - distance)
        print(queries_train[idx].strip(), '\n', "(Score: %.4f)" % (1 - distance), '\n', idx)
    sum_score_train.append((queries_train[i].strip(), mean(sum_score_i)))
    i = i + 1

# For the test dataset
closest_n = len(queries_test)
sum_score_test = []
i = 0
for query, query_embedding in zip(queries_test, query_embeddings_test):
    distances = scipy.spatial.distance.cdist([query_embedding], query_embeddings_test, "cosine")[0]

    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nMost similar sentences in the query set:")
    sum_score_i = []
    for idx, distance in results[1:closest_n]:  # skip index 0, which is the query itself
        sum_score_i.append(1 - distance)
        print(queries_test[idx].strip(), '\n', "(Score: %.4f)" % (1 - distance), '\n', idx)
    sum_score_test.append((queries_test[i].strip(), mean(sum_score_i)))
    i = i + 1

query_sentences_train = [sen_q[0] for sen_q in sum_score_train]
avg_score_sen_train = [score_q[1] for score_q in sum_score_train]

query_sentences_test = [sen_q[0] for sen_q in sum_score_test]
avg_score_sen_test = [score_q[1] for score_q in sum_score_test]

import pandas

df = pandas.DataFrame(data={"sentences": query_sentences_test, "avg": avg_score_sen_test})
df.to_csv("./query_score_test.csv", sep=',', index=False)

# Second step: group the query sentences
df_train = pandas.read_csv("/content/train_pos_sort_dup_plus_4.csv")
df_train.info()

print(df_train[df_train['avg'] == df_train['avg'].min()])  # minimum average score
print(df_train[df_train['avg'] == df_train['avg'].max()])  # maximum average score

print(df_train[df_train['avg'] < 0.1].get('sentences'))
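# Optional vectorized sketch (an assumption, not part of the original pipeline): the
# per-query average similarity computed in section 6 above can also be obtained with a
# single cdist call instead of a Python loop; average_query_similarity is a hypothetical
# helper name introduced here for illustration.
import numpy as np

def average_query_similarity(embeddings):
    # Pairwise cosine similarities between all query sentences
    sims = 1 - scipy.spatial.distance.cdist(embeddings, embeddings, "cosine")
    n = sims.shape[0]
    # Average over the other sentences, excluding each sentence's similarity with itself
    return (sims.sum(axis=1) - np.diag(sims)) / (n - 1)

# e.g. avg_train = average_query_similarity(query_embeddings_train)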
Find the positive labels""" # We use scipy with cosine distance (similarity) function to find the 10 most-similar embeddings (sentences) for queries in the corpus: # Here we create the positive labels for the next step (supervised "classification") closest_n = 10 #For train dataset # list of all the 10 closest sentences of the corpus with all sentences of query sen_plus_sim_1_train = [] # (sentence, score, query sentence) #list of the sentences that have score more than 0.6 sen_plus_06_train = [] # (sentence, score, Idx of query sentence) i = 0 # Idx for the sentences of the query for query, query_embedding in zip(queries_train, query_embeddings_train): distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0] results = zip(range(len(distances)), distances) results = sorted(results, key=lambda x: x[1]) print("\n======================\n") print("Query:", query) print("\nTop 10 most similar sentences in corpus:") for idx, distance in results[0:closest_n]: sen_plus_sim_1_train.append((corpus_train[idx].strip(),1-distance, query)) if (distance)<= 0.4: sen_plus_06_train.append((corpus_train[idx].strip(),1-distance, i)) print(corpus_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx) i = i + 1 #For test dataset sen_plus_sim_1_test = [] # list of all the 10 closest sentencesof the corpus with all sentences of query sen_plus_06_test = [] # list of the sentences that have score more than 0.6 i = 0 for query, query_embedding in zip(queries_test, query_embeddings_test): distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0] results = zip(range(len(distances)), distances) results = sorted(results, key=lambda x: x[1]) print("\n\n======================\n\n") print("Query:", query) print("\nTop 10 most similar sentences in corpus:") for idx, distance in results[0:closest_n]: sen_plus_sim_1_test.append((corpus_test[idx].strip(),1-distance, query)) if (1-distance)>= 0.6: sen_plus_06_test.append((corpus_test[idx].strip(),1-distance, i)) print(corpus_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx) i = i + 1 # Function to count repetition of elements in a list def getDuplicatesWithCount(listOfElems): ''' Get frequency count of duplicate elements in the given list ''' dictOfElems = dict() # Iterate over each element in list for elem in listOfElems: # If element exists in dict then increment its value else add it in dict if elem in dictOfElems: dictOfElems[elem] += 1 else: dictOfElems[elem] = 1 # Filter key-value pairs in dictionary. Keep pairs whose value is greater than 1 i.e. only duplicate elements from list. 
# Function to count the number of occurrences of each element in a list
def getDuplicatesWithCount(listOfElems):
    ''' Get the frequency count of the elements in the given list '''
    dictOfElems = dict()
    # Iterate over each element in the list
    for elem in listOfElems:
        # If the element already exists in the dict, increment its count, else add it
        if elem in dictOfElems:
            dictOfElems[elem] += 1
        else:
            dictOfElems[elem] = 1
    # Keep every (element, count) pair, including elements that appear only once,
    # so that a count can be looked up for any sentence
    dictOfElems = {key: value for key, value in dictOfElems.items() if value >= 1}
    # Return a dict mapping each element to its frequency count
    return dictOfElems

# For the train dataset
print(len(sen_plus_sim_1_train))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_1_train))))
# Just the sentences of the (sentence, score) pairs
sen_jus_1_train = [x[0] for x in sen_plus_sim_1_train]
print(len(sen_jus_1_train))
# Just the duplicated sentences
dupes_sen_plus_sim_rep_1_train = [x for n, x in enumerate(sen_jus_1_train) if x in sen_jus_1_train[:n]]
dupes_sen_plus_sim_1_train = list(set(dupes_sen_plus_sim_rep_1_train))
print(len(dupes_sen_plus_sim_1_train))
# Just the sentences without duplicates
sen_jus_without_rep_1_train = list(set(sen_jus_1_train))
print(len(sen_jus_without_rep_1_train))

# For the test dataset
print(len(sen_plus_sim_1_test))
# The (sentence, score) pairs without duplicates
print(len(list(set(sen_plus_sim_1_test))))
# Just the sentences of the (sentence, score) pairs
sen_jus_1_test = [x[0] for x in sen_plus_sim_1_test]
print(len(sen_jus_1_test))
# Just the duplicated sentences
dupes_sen_plus_sim_rep_1_test = [x for n, x in enumerate(sen_jus_1_test) if x in sen_jus_1_test[:n]]
dupes_sen_plus_sim_1_test = list(set(dupes_sen_plus_sim_rep_1_test))
print(len(dupes_sen_plus_sim_1_test))
# Just the sentences without duplicates
sen_jus_without_rep_1_test = list(set(sen_jus_1_test))
print(len(sen_jus_without_rep_1_test))

# Get a dictionary containing the elements of the train list and their frequency count
dic_sens_train = []
dictOfElems_train = getDuplicatesWithCount(sen_jus_1_train)
for key, value in dictOfElems_train.items():
    if value == 1:
        print('****************************************************************')
        print(key, ' :: ', value)
        dic_sens_train.append((key, value))
len(dic_sens_train)

repetations_train = []
for sen in sen_jus_1_train:
    repetations_train.append(dictOfElems_train.get(sen, ""))
len(repetations_train)

len(dictOfElems_train)
len(list(set(dic_sens_train) - set(queries_train)))

# Get a dictionary containing the elements of the list sen_jus_1_test and their frequency count
dic_sens_test = []
dictOfElems_test = getDuplicatesWithCount(sen_jus_1_test)
for key, value in dictOfElems_test.items():
    if value == 1:
        print('****************************************************************')
        print(key, ' :: ', value)
        dic_sens_test.append((key, value))
len(dic_sens_test)

repetations_test = []
for sen in sen_jus_1_test:
    repetations_test.append(dictOfElems_test.get(sen, ""))

len(list(set(dic_sens_test) - set(queries_test)))
len(dictOfElems_test)
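# Side note (a sketch, not part of the original pipeline): the standard library's
# collections.Counter builds the same sentence -> frequency mapping as
# getDuplicatesWithCount, so the counts above could equivalently be obtained with:
from collections import Counter
# dictOfElems_train = dict(Counter(sen_jus_1_train))
# dictOfElems_test = dict(Counter(sen_jus_1_test))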
leb_liste_train, "nb_duplications": repetations_train, "query_sentence": sentences_query_train,"sen_score": score_sens_train}) df_train.to_csv("./train_31_05.csv", sep=',',index=False) df_test = pandas.read_csv("/content/test_31_05.csv") df_test.info() df_train = pandas.read_csv("/content/train_31_05.csv") df_train.info() #Take just the sentences that are duplicated more that 4 times df_test_dup_5 = df_test.loc[df_test['nb_duplications'] > 4] df_train_dup_5 = df_train.loc[df_train['nb_duplications'] > 4] len(df_test_dup_5) len(df_train_dup_5) # Remove the duplication of sentences df_corpus_test = df_test_dup_5.drop_duplicates('sentences', keep='last') df_corpus_train = df_train_dup_5.drop_duplicates('sentences', keep='last') len(df_corpus_test) len(df_corpus_train) #Transform to list sens_corpus_test = df_corpus_test['sentences'].tolist() sens_corpus_train = df_corpus_train['sentences'].tolist() len(list(set(sens_corpus_test + queries_test))) len(list(set(sens_corpus_train + queries_train))) pos_sens_test = list(set(sens_corpus_test + queries_test)) pos_sens_train = list(set(sens_corpus_train + queries_train)) sens_new = list(set(pos_sens_test) - set(queries_test)) sens_new_train = list(set(pos_sens_train) - set(queries_train)) len(sens_new) len(sens_new_train) from itertools import repeat pos_labels_test = list(repeat(1, len(pos_sens_test))) pos_labels_train = list(repeat(1, len(pos_sens_train))) df_test_pos = pandas.DataFrame(data={"sentences": pos_sens_test, "labels": pos_labels_test}) df_test_pos.to_csv("./test_pos_sens_31_05.csv", sep=',',index=False) df_train_pos = pandas.DataFrame(data={"sentences": pos_sens_train, "labels": pos_labels_train}) df_train_pos.to_csv("./train_pos_sens_31_05.csv", sep=',',index=False) df_test_new = pandas.DataFrame(data={"sentences": sens_new}) df_test_new.to_csv("./test_pos_new_sens_31_05.csv", sep=',',index=False) df_train_new = pandas.DataFrame(data={"sentences": sens_new_train}) df_train_new.to_csv("./train_pos_new_sens_31_05.csv", sep=',',index=False) list_train = df_train_pos.values.tolist() list_test = df_test_pos.values.tolist() list_train_sens = [x[0] for x in list_train] list_test_sens = [x[0] for x in list_test] len(list_test_sens) resulting_list_1_train = list_train_sens resulting_list_1_test = list_test_sens """#8. 
Fine the negatives labels""" # We use scipy with cosine distance (similarity) function to find the 10 most-different embeddings (sentences) for queries in the corpus: # Here we create the negative labels for the next step (supervised "classification") closest_n = 10 #For train dataset # list of all the 10 farthest sentences of the corpus with all sentences of query sen_plus_sim_0_train = [] # (sentence, score) sen_moins_0_train = [] # list of sentences that have score less than 0 (sentence, score) for query, query_embedding in zip(queries_train, query_embeddings_train): distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_train, "cosine")[0] results = zip(range(len(distances)), distances) results = sorted(results, key=lambda x: x[1]) print("\n\n======================\n\n") print("Query:", query) print("\nTop 10 most similar sentences in corpus:") for idx, distance in results[-closest_n:]: sen_plus_sim_0_train.append((corpus_train[idx].strip(),1-distance)) if (1-distance)<= 0: sen_moins_0_train.append((corpus_train[idx].strip(),1-distance)) print(corpus_train[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx) #For test dataset # list of all the 10 farthest sentences of the corpus with all sentences of query sen_plus_sim_0_test = [] # (sentence, score) sen_moins_0_test = [] # list of sentences that have score less than 0 (sentence, score) for query, query_embedding in zip(queries_test, query_embeddings_test): distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings_test, "cosine")[0] results = zip(range(len(distances)), distances) results = sorted(results, key=lambda x: x[1]) print("\n\n======================\n\n") print("Query:", query) print("\nTop 10 most similar sentences in corpus:") for idx, distance in results[-closest_n:]: sen_plus_sim_0_test.append((corpus_test[idx].strip(),1-distance)) if (1-distance)<= 0: sen_moins_0_test.append((corpus_test[idx].strip(),1-distance)) print(corpus_test[idx].strip(),'\n', "(Score: %.4f)" % (1-distance),'\n',idx) #For train dataset print(len(sen_moins_0_train)) # Here I take the pairs (sentence, score) that they are not duplicated sen_moins_0_without_dup_train = list(set(sen_moins_0_train)) print(len(sen_moins_0_without_dup_train)) # just the sentences of pairs (sentence, score) sen_jus_0_train = [x[0] for x in sen_moins_0_without_dup_train] # just the sentences without duplications sen_jus_0_without_dup_train = list(set(sen_jus_0_train)) print(len(sen_jus_0_without_dup_train)) # For test dataset (we do like with train dataset) print(len(sen_moins_0_test)) sen_moins_0_without_dup_test = list(set(sen_moins_0_test)) print(len(sen_moins_0_without_dup_test)) sen_jus_0_test = [x[0] for x in sen_moins_0_without_dup_test] sen_jus_0_without_dup_test = list(set(sen_jus_0_test)) print(len(sen_jus_0_without_dup_test)) #For train dataset print(len(sen_plus_sim_0_train)) # The pairs(sentence, score) not duplicated print(len(list(set(sen_plus_sim_0_train)))) # just the sentences of pairs(phrase, score) sen_jus_0_train = [x[0] for x in sen_plus_sim_0_train] print(len(sen_jus_0_train)) # just the duplicated sentences dupes_sen_plus_sim_0_train = [x for n, x in enumerate(sen_jus_0_train) if x in sen_jus_0_train[:n]] print(len(dupes_sen_plus_sim_0_train)) # just the sentences not duplicated sen_jus_without_rep_0_train = list(set(sen_jus_0_train)) print(len(sen_jus_without_rep_0_train)) # For test dataset (we do the same of train dataset) print(len(sen_plus_sim_0_test)) print(len(list(set(sen_plus_sim_0_test)))) 
sen_jus_0_test = [x[0] for x in sen_plus_sim_0_test]
print(len(sen_jus_0_test))
dupes_sen_plus_sim_0_test = [x for n, x in enumerate(sen_jus_0_test) if x in sen_jus_0_test[:n]]
print(len(dupes_sen_plus_sim_0_test))
sen_jus_without_rep_0_test = list(set(sen_jus_0_test))
print(len(sen_jus_without_rep_0_test))

# Merge the two lists to get the final list of negative labels for the train dataset
resulting_list_0_train = list(set(sen_jus_without_rep_0_train + sen_jus_0_without_dup_train))
len(resulting_list_0_train)

# Merge the two lists to get the final list of negative labels for the test dataset
resulting_list_0_test = list(set(sen_jus_without_rep_0_test + sen_jus_0_without_dup_test))
len(resulting_list_0_test)

"""# 9. Export the lists of sentences into CSV files"""

len(resulting_list_1_train)

# Sentence list for the train dataset (roughly 3 negative labels for every positive label)
num_neg = len(resulting_list_1_train) * 3
sens_list_train = resulting_list_1_train + resulting_list_0_train[:num_neg]
len(sens_list_train)

len(resulting_list_1_test)

# Sentence list for the test dataset (roughly 3 negative labels for every positive label)
num_neg_test = len(resulting_list_1_test) * 3
sens_list_test = resulting_list_1_test + resulting_list_0_test[:num_neg_test]
len(sens_list_test)

# Build the labels of the train dataset
leb_liste_train = []
for i in range(len(sens_list_train)):
    if i < len(resulting_list_1_train):
        leb_liste_train.append('1')
    else:
        leb_liste_train.append('0')

# For the test dataset
leb_liste_test = []
for i in range(len(sens_list_test)):
    if i < len(resulting_list_1_test):
        leb_liste_test.append('1')
    else:
        leb_liste_test.append('0')

len(leb_liste_train)

import pandas

df_train = pandas.DataFrame(data={"sentences": sens_list_train, "labels": leb_liste_train})
df_train.to_csv("./train_final_31_05.csv", sep=',', index=False)

df_test = pandas.DataFrame(data={"sentences": sens_list_test, "labels": leb_liste_test})
df_test.to_csv("./test_final_31_05.csv", sep=',', index=False)

"""# 10. Sample the datasets: this is done to shuffle (randomize) the two datasets"""

import pandas

df_test_new = pandas.read_csv("/content/test_final_31_05.csv")
df_test_new.info()

df_train_egal = pandas.read_csv("/content/train_final_31_05.csv")
df_train_egal.info()

df_test_sam = df_test_new.sample(2412)
df_train_egal_sam = df_train_egal.sample(4284)

df_test_sam.to_csv("./test_sample_final_31_05.csv", sep=',', index=False)
df_train_egal_sam.to_csv("./train_sample_final_31_05.csv", sep=',', index=False)
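# Optional sketch (an assumption, not in the original notebook): to shuffle the whole
# dataset reproducibly instead of drawing a fixed-size sample, pandas can sample every
# row with a fixed random seed:
# df_train_shuffled = df_train_egal.sample(frac=1, random_state=42).reset_index(drop=True)
# df_test_shuffled = df_test_new.sample(frac=1, random_state=42).reset_index(drop=True)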