Commit b509a0a4 authored by Arthur Batel

Merge branch 'main' of https://gitlab.liris.cnrs.fr/abatel/cd-bpr

# Conflicts:
#	code/binary_bpr/main.py
parents 28606210 0aaf8c75
@@ -26,8 +26,8 @@ from datetime import datetime
dtype = torch.float32
print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
#print(f"Is CUDA supported by this system? {torch.cuda.is_available()}")
#print(f"CUDA version: {torch.version.cuda}")
if torch.cuda.is_available():
dev = "cuda:0"
else:
@@ -49,21 +49,21 @@ def read_file(dataTrain, dataTest):
kc = flattern_arrays(kc.values, kcT.values)
num_kc = len(kc)
dico_kc = { k:v for (k,v) in zip(kc, range(len(kc)))}
print("NB KC", num_kc)
#print("NB KC", num_kc)
# dico users
users = df['user_id']
usersT = dfTest['user_id']
users = flattern_arrays(users.values, usersT.values)
num_users = len(users)
dico_users = { k:v for (k,v) in zip(users, range(num_users))}
print("NB Users", num_users)
#print("NB Users", num_users)
# dico items and their associated kc
itemsDT = df['item_id']
itemsT = dfTest['item_id']
items = flattern_arrays(itemsDT.values, itemsT.values)
num_items = len(items)
dico_items = { k:v for (k,v) in zip(items, range(num_items))}
print("NB Items", num_items, len(dico_items))
#print("NB Items", num_items, len(dico_items))
return dico_kc, dico_users, dico_items
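Note: read_file builds one shared index per entity type (KC, user, item) from the union of the train and test files, so IDs that only appear in the test split still map to a row of the embedding tables sized from these dictionaries. A minimal sketch of that indexing (the helper name is illustrative; the column names mirror the ones parse_dataframe uses below):
import pandas as pd

def build_index(train_csv, test_csv, column):
    # map every distinct value of `column` across train+test to a dense integer index
    cols = ['user_id', 'item_id', 'correct', 'knowledge']
    values = pd.concat([pd.read_csv(train_csv, names=cols)[column],
                        pd.read_csv(test_csv, names=cols)[column]]).unique()
    return {v: i for i, v in enumerate(values)}

# e.g. dico_users = build_index(dataTrain, dataTest, 'user_id')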
def save_embeddings(xpName: str, modelName: str, embeddings,userEmbDir : str,itemEmbDir : str):
"""
@@ -81,7 +81,7 @@ def save_embeddings(xpName: str, modelName: str, embeddings,userEmbDir : str,ite
# save embeddings
results_path = os.path.join(userEmbDir, results_name_file+".csv")
print(results_path)
np.savetxt(results_path, u_emb, delimiter=',')
# save embeddings
results_path = os.path.join(itemEmbDir, results_name_file+".csv")
@@ -91,8 +91,6 @@ def parse_dataframe(data, dico_kc, dico_users, dico_item, is_train = True):
df = pd.read_csv(data, names=['user_id', 'item_id','correct','knowledge'])
# Compute table of positive and negative items by KC and Users
# and the dictionary that associate the KC to a question/answer
#num_kc = np.max(np.array(list(dico_kc.keys()))) + 1
#print("Parse DF", num_kc)
num_kc = len(dico_kc)
num_users = len(dico_users)
# Find positive items for each kc/user
@@ -113,7 +111,6 @@ def parse_dataframe(data, dico_kc, dico_users, dico_item, is_train = True):
col = row['item_id']
if col not in dico_items:
dico_items[col] = len(dico_items)
# Warning, all user's answers are positives!
q,r = parse_it(col)
col_neg = q+'_'+str(1-int(r))
if col_neg not in dico_items:
@@ -156,7 +153,6 @@ def generate_quad(dico_items, triplets, t_trainy, item_users, alpha):
uu = item_users[t[2]][u]
t_quadriplets.append([t[0], t[1], t[2], uu])
t_y.append(t_trainy[k][i])
#break
else:
t_quadriplets.append([t[0], t[1], t[2], t[0]])
t_y.append(t_trainy[k][i])
@@ -279,7 +275,7 @@ if __name__ == '__main__':
parser.add_argument("-bSize", "--batchSize", help="batch size")
parser.add_argument("-lr", "--learningRate", help="learning rate")
parser.add_argument("-mode", "--mode", help="CV mode = 1, GS mode = 0")
#parser.add_argument("-a", "--alpha", help="float")
args = parser.parse_args()
dataTrain = args.dataTrain
dataTest = args.dataTest
@@ -308,7 +304,6 @@ if __name__ == '__main__':
FileNameTest_temp = testFileName[:-1] + str(i_fold)
dataTrain = FileNameTrain_temp+".csv"
dataTest = FileNameTest_temp+".csv"
# alpha = int(args.alpha)
print("dataTrain:", dataTrain)
print("dataTest:", dataTest)
print("dataPath:", dataPath)
@@ -319,7 +314,6 @@ if __name__ == '__main__':
dico_kc, dico_users, dico_items = read_file(dataTrain, dataTest)
embedding_size = len(dico_kc)
dico_items, t_train, ty_train, item_users = parse_dataframe(dataTrain, dico_kc, dico_users, dico_items, True)
# print("alpha", alpha)
train, y_train = generate_quad(dico_items, t_train, ty_train, item_users, alpha)
dico_items, test, y_test = parse_dataframe(dataTest, dico_kc, dico_users, dico_items, False)
num_users = len(dico_users)
@@ -340,7 +334,7 @@ if __name__ == '__main__':
write_file_doa(FileNameTrain_temp, emb[0], train, dico_kc, dico_users, dico_items)
doa = compute_doa(FileNameTrain_temp)
# '''
# Test
correctness, acc, users, auc, rmse = bpr_model.evaluate_model(test, len(dico_kc), y_test)
acc_list.append(acc)
@@ -349,18 +343,13 @@ if __name__ == '__main__':
doa_train.append(doa)
print("Doa on Train dataset:", doa)
print("AUC and RMSE on test dataset:", auc, rmse)
# '''
new_embedding_value = bpr_model.user_embeddings.weight.clone().detach().cpu().numpy()
write_file_doa_test(FileNameTest_temp, new_embedding_value, test, y_test, dico_kc, dico_users, dico_items)
doa = compute_doa(FileNameTest_temp)
doa_test.append(doa)
print("Accuracy and Doa on test dataset:", acc, doa)
# '''
## test oppose
# acc, precision = bpr_model.evaluate_model(test1, len(dico_kc), y_test1)
# print(f'Accuracy: {acc}')
print(acc_list)
print(auc_list)
print(rmse_list)
@@ -373,7 +362,6 @@ if __name__ == '__main__':
print("doa_test :", np.mean(doa_test), "+-", np.std(doa_test))
print("reo :",1- np.mean(doa_test)/np.mean(doa_train))
else :
#alpha = int(args.alpha)
print("dataTrain:",dataTrain)
print("epochs:",epochs)
print("batch_size:",batch_size)
@@ -381,7 +369,6 @@ if __name__ == '__main__':
dico_kc, dico_users, dico_items = read_file(dataTrain, dataTest)
embedding_size = len(dico_kc)
dico_items, t_train, ty_train, item_users = parse_dataframe(dataTrain, dico_kc, dico_users, dico_items, True)
#print("alpha", alpha)
train, y_train = generate_quad(dico_items, t_train, ty_train, item_users, alpha)
dico_items, test, y_test = parse_dataframe(dataTest, dico_kc, dico_users, dico_items, False)
num_users = len(dico_users)
@@ -402,16 +389,12 @@ if __name__ == '__main__':
write_file_doa(trainFileName, emb[0], train, dico_kc, dico_users, dico_items)
doa = compute_doa(trainFileName)
print("Doa on train dataset:", doa)
#'''
# Test
correctness, acc, users, auc, rmse = bpr_model.evaluate_model(test, len(dico_kc), y_test)
print(f'Accuracy: {acc}')
#'''
new_embedding_value = bpr_model.user_embeddings.weight.clone().detach().cpu().numpy()
write_file_doa_test(testFileName, new_embedding_value, test, y_test, dico_kc, dico_users, dico_items)
doa = compute_doa(testFileName)
print("Accuracy and Doa on test dataset:", acc, doa)
#'''
## test oppose
#acc, precision = bpr_model.evaluate_model(test1, len(dico_kc), y_test1)
#print(f'Accuracy: {acc}')
import os
dPath = "../../data/cdbpr_format/"
dPath = "../../data/"
embDirPath = "../../results/table_2/"
datasets = ['assist0910_tkde', 'assist17_tkde', 'algebra','math_1', 'math_2']
epochs = [1,75, 95, 5, 90, 90]
@@ -137,6 +137,7 @@ class BPRModel(nn.Module):
all_labels = None
all_decisions = None
first = True
all_preferences = None
for k in range(num_kc):
trips = test_triplets[k]
y = y_test[k]
@@ -162,6 +163,11 @@ class BPRModel(nn.Module):
comp = negative_scores < positive_scores
comp = comp.cpu()
all_decisions = comp
positive_scores = positive_scores.cpu()
negative_scores = negative_scores.cpu()
preferences = np.concatenate((users_batch.cpu().reshape(-1,1),items_batch.cpu().reshape(-1,1), positive_scores.detach().numpy().reshape(-1,1)), axis = 1)
pref_neg = np.concatenate((users_batch.cpu().reshape(-1,1),negatives_batch.cpu().reshape(-1,1), negative_scores.detach().numpy().reshape(-1,1)), axis = 1)
all_preferences = np.concatenate((preferences,pref_neg), axis = 0)
first = False
else:
all_labels = np.concatenate((all_labels, y_true), axis=0)
@@ -169,9 +175,14 @@ class BPRModel(nn.Module):
comp = negative_scores < positive_scores
comp = comp.cpu()
all_decisions = np.concatenate((all_decisions, comp), axis=0)
preferences = np.concatenate((users_batch.cpu().reshape(-1,1),items_batch.cpu().reshape(-1,1), positive_scores.detach().numpy().reshape(-1,1)), axis = 1)
pref_neg = np.concatenate((users_batch.cpu().reshape(-1,1),negatives_batch.cpu().reshape(-1,1), negative_scores.detach().numpy().reshape(-1,1)), axis = 1)
preferences = np.concatenate((preferences,pref_neg), axis = 0)
#print(preferences.shape)
all_preferences = np.concatenate((all_preferences,preferences), axis=0)
correct_ranking = sum(negative_scores < positive_scores)# for score in negative_scores)
mse1 = mean_squared_error(all_labels, all_predictions)
auc = roc_auc_score(all_labels, all_predictions)
return accuracy_score(all_labels, all_decisions), precision_score(all_labels, all_decisions), recall_score(all_labels, all_decisions), all_decisions
return accuracy_score(all_labels, all_decisions), precision_score(all_labels, all_decisions), recall_score(all_labels, all_decisions), all_decisions, all_preferences
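Note: the all_preferences bookkeeping added above stacks one (user, item, score) row for each positive item and one for its sampled negative counterpart, batch after batch, so the caller can later rank the two answer modalities per user and question. A minimal self-contained sketch of that accumulation pattern (the helper name and the toy batch are illustrative, not taken from the model code):
import numpy as np

def stack_preferences(users, pos_items, neg_items, pos_scores, neg_scores):
    # one (user, item, score) row per positive item, then one per negative item
    pos_rows = np.column_stack((users, pos_items, pos_scores))
    neg_rows = np.column_stack((users, neg_items, neg_scores))
    return np.concatenate((pos_rows, neg_rows), axis=0)

# toy batch standing in for one per-KC evaluation batch
batches = [(np.array([0, 1]), np.array([10, 11]), np.array([12, 13]),
            np.array([0.9, 0.4]), np.array([0.2, 0.7]))]
all_preferences = None
for users, pos_items, neg_items, pos_scores, neg_scores in batches:
    rows = stack_preferences(users, pos_items, neg_items, pos_scores, neg_scores)
    all_preferences = rows if all_preferences is None else np.concatenate((all_preferences, rows), axis=0)
print(all_preferences.shape)  # (4, 3): two positive rows and two negative rows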
@@ -127,41 +127,49 @@ def parse_dataframe(data, dico_kc, dico_users, dico_items, d_quest_val, nb_item_
else:
return dico_items, triplets, y_true
def compute_pred(all_predictions, test, y_test, dico_users, dico_items):
r = 0
all_test = test[0]
for i in range(1,len(test)):
all_test = np.concatenate((test[i],all_test),axis=0)
def compute_accuracy_multi_mod(all_preferences, dico_users, dico_items, dataTest):
# Remove duplicate
new_array = [tuple(row) for row in all_preferences]
all_preferences = np.unique(new_array, axis=0)
# Revert dictionaries
l = list(dico_items)
lu = list(dico_users)
# Extract the questions
list_quest = []
for i in range(len(all_test)):
q,r = parse_it(l[all_test[i,1]])
for i in range(len(all_preferences)):
q,r = parse_it(l[int(all_preferences[i,1])])
list_quest.append(int(q))
all_test = np.concatenate((all_test,np.array(list_quest).reshape(-1,1)), axis=1)
all_preferences = np.concatenate((all_preferences, np.array(list_quest).reshape(-1,1)), axis=1)
# Compute the predicted value
responses = []
list_user = flattern_array(all_test[:,0])
list_user = flattern_array(all_preferences[:,0])
for u in list_user:
my_users = all_test[:,0] == u
list_quest = flattern_array(all_test[my_users,4])
my_users = all_preferences[:,0] == u
list_quest = flattern_array(all_preferences[my_users,3])
for quest in list_quest:
rows = all_test[:,4] == quest
rows = all_preferences[:,3] == quest
my_rows = np.logical_and(rows,my_users)
# we go the rows with all the modalities
modality = -1
for t in range(len(all_test[my_rows])):
ione = all_test[my_rows][t][1]
itwo = all_test[my_rows][t][2]
q1,r1 = parse_it(l[ione])
q2,r2 = parse_it(l[itwo])
if(all_predictions[my_rows][t] == 1):
modality = r1
else:
modality = r2
responses.append([dico_users[u], quest, modality])
return responses
# we got the rows corresponding to a user and a question
# and take the row with the maximum predicted values
m = np.argmax(all_preferences[my_rows,2])
# Get the modality
item = int(all_preferences[my_rows][m][1])
q1,modality = parse_it(l[item])
responses.append([lu[int(u)], int(quest), int(modality)])
# Sort responses
dfPred = pd.DataFrame(responses, columns=['user_id', 'question','modality'])
dfPred = dfPred.sort_values(by=['user_id', 'question'])
pred = dfPred['modality'].values
# True data
dfTrue = pd.read_csv(dataTest, names=['user_id', 'item_id','correct','knowledge',"question"])
for row_index, row in dfTrue.iterrows():
col = row['item_id']
q,r = parse_it(col)
dfTrue.at[row_index,'question'] = int(q)
dfTrue = dfTrue.drop_duplicates(subset=['user_id', 'question'])
dfTrue = dfTrue.sort_values(by=['user_id', 'question'])
print("Accuracy multi-modal", accuracy_score(dfTrue['correct'].values, pred), "RMSE", np.sqrt(mean_squared_error(dfTrue['correct'].values, pred)))
@@ -201,19 +209,12 @@ if __name__ == '__main__':
new_embedding_value = bpr_model.user_embeddings.weight.clone().detach().cpu().numpy()
write_file_doa(filename, new_embedding_value, train, dico_kc, dico_users, dico_items)
doa, rdoa = compute_doa(filename)
# write embed items
#new_embedding_items = bpr_model.item_embeddings.weight.clone().detach().cpu().numpy()
#write_file(filename+"embedding_items.csv", new_embedding_items[0:nb_item_train])
#write_file_std(FileName1+"kc_emb.txt", dico_items)
#print("Accuracy and Doa on train dataset:", acc, doa)
# Test
acc, precision, rappel, all_predictions = bpr_model.evaluate_model(test, len(dico_kc), y_test)
acc, precision, rappel, all_predictions, all_preferences = bpr_model.evaluate_model(test, len(dico_kc), y_test)
s = str(acc) +","+ str( precision)+ ","+str(rappel)+ ","+str(doa)
#for i in range(len(rdoa)):
# s = s + ','+ str(rdoa[i])
print(s)
compute_accuracy_multi_mod(all_preferences, dico_users, dico_items, dataTest)
# coding users and kc from user_label.csv
'''
os.getcwd()
@@ -232,6 +233,6 @@ if __name__ == '__main__':
questEval[ind, kc] = val
write_file(filename+"_user_quest_label.csv", questEval)
'''
the_predictions = compute_pred(all_predictions, test, y_test, dico_users, dico_items)
file = "_test_predictions_"+str(alpha)+".csv"
write_file(filename + file, np.array(the_predictions))
\ No newline at end of file
#the_predictions = compute_pred(all_predictions, test, y_test, dico_users, dico_items)
#file = "_test_predictions_"+str(alpha)+".csv"
#write_file(filename + file, np.array(the_predictions))
\ No newline at end of file