Commit 0452e14a authored by Arthur Batel

code

parent b499c7b9
Showing 13520 additions and 2 deletions
@@ -3,7 +3,9 @@
---
This repository contains all the code and data necessary to reproduce the paper's results.
Nonetheless, cleaning of the code is still in progress to make it easier for reviewers to reproduce the results.
## Installation
Set the environment with `make`:
@@ -12,4 +14,4 @@ Set the environment with `make`:
R. Céline, A. Batel, M. Plantevit, I. Benouaret
>LIRIS laboratory, UMR5205, Univ Lyon, CNRS, UBL
## Reference
\ No newline at end of file
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="cdbpr-env" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="SonarLintModuleSettings">
<option name="uniqueId" value="6084c8ba-5729-4f67-aa53-1f1cacc70561" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="cdbpr-env" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="cdbpr-env" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/code.iml" filepath="$PROJECT_DIR$/.idea/code.iml" />
</modules>
</component>
</project>
\ No newline at end of file
(source diff too large to display; view the blob in the repository)
(source diff too large to display; view the blob in the repository)
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import roc_auc_score, mean_squared_error, accuracy_score, precision_score
dtype = torch.float32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class BPRModel(nn.Module):
    def __init__(self, batch_size, num_users, num_items, embedding_size, device, cc_exp_row, cc_exp_col, ablation):
        super(BPRModel, self).__init__()
        self.device = device
        self.batch_size = batch_size
        # Model parameters
        self.user_embeddings = nn.Embedding(num_users, embedding_size).to(self.device)
        self.item_embeddings = nn.Embedding(num_items, embedding_size).to(self.device)
        # Frequency-based initialization of the embeddings
        # (skipped for ablations 2 and 3, which remove the init)
        if (ablation != 2) and (ablation != 3):
            self.user_embeddings.weight.data.copy_(torch.from_numpy(cc_exp_row))
            self.item_embeddings.weight.data.copy_(torch.from_numpy(cc_exp_col))
        # Optimizer
        self.optimizer = optim.Adam(self.parameters(), lr=0.01)
def forward(self, user, item, k):
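        # k == -1: score the (user, item) pair as the negative L2 distance
        # between their normalized embeddings (larger score = preferred).
        # k >= 0: return the batch mean of user-embedding dimension k, which
        # feeds the KC-level ranking loss (loss2) in train().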
user_embedding = self.user_embeddings(user)
item_embedding = self.item_embeddings(item)
if (k == -1):
user_embedding = F.normalize(user_embedding, p=2, dim=1)
item_embedding = F.normalize(item_embedding, p=2, dim=1)
return - torch.norm(user_embedding - item_embedding, p=2, dim=1)
else:
user_embedding = F.normalize(user_embedding, p=2, dim=1)
return torch.mean(user_embedding[:,k])
def bpr_loss(self,positive_scores, negative_scores):
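        # BPR objective: -mean(log sigmoid(s_pos - s_neg)); minimizing it
        # pushes the scores of observed (positive) pairs above sampled negatives.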
return -torch.mean(torch.log(torch.sigmoid(positive_scores - negative_scores)))
def train(self, triplets, num_kc, epochs, batch_size, y, ablation):
for epoch in range(epochs):
all_labels = None
all_decisions = None
first = True
for k in range(num_kc):
trips = triplets[k]
y_true = y[k]
for i in range(0, len(trips), self.batch_size):
users_batch = trips[i:i + self.batch_size, 0]
items_batch = trips[i:i + self.batch_size, 1]
negatives_batch = trips[i:i + self.batch_size, 2]
neg_users_batch = trips[i:i + self.batch_size, 3]
y_train = y_true[i:i + self.batch_size]
# Convert the numpy.ndarray to tensor
users_batch = torch.from_numpy(users_batch).to(self.device)
items_batch = torch.from_numpy(items_batch).to(self.device)
negatives_batch = torch.from_numpy(negatives_batch).to(self.device)
neg_users_batch = torch.from_numpy(neg_users_batch).to(self.device)
# call forward
positive_scores = self(users_batch, items_batch, -1)
negative_scores = self(users_batch, negatives_batch, -1)
loss1 = self.bpr_loss(positive_scores, negative_scores)
positive_scores_bis = self(users_batch, negatives_batch, k)
negative_scores_bis = self(neg_users_batch, negatives_batch, k)
loss2 = self.bpr_loss(positive_scores_bis, negative_scores_bis)
if(ablation != 1) and (ablation != 3):
loss = loss1 + loss2
else:
loss = loss1
                    comp = (negative_scores < positive_scores).cpu()
                    if first:
                        all_labels = y_train
                        all_decisions = comp
                        first = False
                    else:
                        all_labels = np.concatenate((all_labels, y_train), axis=0)
                        all_decisions = np.concatenate((all_decisions, comp), axis=0)
# Backward
loss.backward(retain_graph=True)
# Optimizer step
self.optimizer.step()
# Clear the gradients for the next iteration
self.optimizer.zero_grad()
if(epoch % 10 == 0):
print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
acc = accuracy_score(all_labels, all_decisions)
return acc
# Evaluate the model
def evaluate_model(self, test_triplets, num_kc, y_test):
# Initialize counters for metrics
all_predictions = None
all_labels = None
all_decisions = None
first = True
for k in range(num_kc):
trips = test_triplets[k]
y = y_test[k]
for i in range(0, len(trips), self.batch_size):
users_batch = trips[i:i + self.batch_size, 0]
items_batch = trips[i:i + self.batch_size, 1]
negatives_batch = trips[i:i + self.batch_size, 2]
y_true = y[i:i + self.batch_size]
# Convert the ndarray to tensor
users_batch = torch.from_numpy(users_batch).to(self.device)
items_batch = torch.from_numpy(items_batch).to(self.device)
negatives_batch = torch.from_numpy(negatives_batch).to(self.device)
# Compute distances
positive_scores = self(users_batch, items_batch,-1).cpu()
negative_scores = self(users_batch, negatives_batch,-1).cpu()
# compute probabilities
proba = torch.sigmoid(positive_scores - negative_scores )
                comp = (negative_scores < positive_scores).cpu()
                if first:
                    all_labels = y_true
                    all_predictions = proba.detach().cpu().numpy()
                    all_decisions = comp
                    first = False
                else:
                    all_labels = np.concatenate((all_labels, y_true), axis=0)
                    all_predictions = np.concatenate((all_predictions, proba.detach().cpu().numpy()), axis=0)
                    all_decisions = np.concatenate((all_decisions, comp), axis=0)
        mse = mean_squared_error(all_labels, all_predictions)
        print("RMSE:", np.sqrt(mse))
auc = roc_auc_score(all_labels, all_predictions)
print("AUC:", auc)
acc = accuracy_score(all_labels, all_decisions)
print("ACC:", acc)
precision = precision_score(all_labels, all_decisions)
print(f'Prec: {precision}')
return acc, precision
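As a quick, self-contained illustration of the scoring rule and BPR loss implemented above (a minimal sketch on random tensors; shapes and values are illustrative, not the repository's data):

import torch
import torch.nn.functional as F

# score(u, i) = -||u - i||_2 on L2-normalized embeddings, as in forward()
u = F.normalize(torch.randn(4, 8), p=2, dim=1)    # 4 users, 8 dimensions
pos = F.normalize(torch.randn(4, 8), p=2, dim=1)  # positive items
neg = F.normalize(torch.randn(4, 8), p=2, dim=1)  # sampled negatives

s_pos = -torch.norm(u - pos, p=2, dim=1)
s_neg = -torch.norm(u - neg, p=2, dim=1)

# BPR loss, as in bpr_loss(); roughly 0.69 or higher for random embeddings
loss = -torch.mean(torch.log(torch.sigmoid(s_pos - s_neg)))
print(loss.item())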
File added
File added
File added
File added
NB KC 123
NB Users 2493
NB (question-answer) in the data 17506
Epoch 1, Loss: 1.3690141439437866
Epoch 11, Loss: 1.0071520805358887
NB KC 123
NB Users 2493
import argparse
from utils import *
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-dt", "--data", help="data file")
args = parser.parse_args()
data = args.data
doa = compute_doa(data)
print("DOA:", doa)
\ No newline at end of file
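compute_doa itself is defined in utils.py, whose diff is too large to display above. For orientation only, here is a hypothetical sketch of the standard Degree of Agreement (DOA) statistic from the cognitive-diagnosis literature: for each KC, over pairs of users where one has a higher embedding value on that KC's dimension, count how often the stronger user answers the KC's items correctly when the two users disagree. The function below is an assumption for illustration, not the repository's utils.compute_doa.

def doa_one_kc(theta, responses):
    # theta: proficiency of each user on one KC (list of floats)
    # responses: responses[u][item] = 0/1 correctness of user u on that item
    # Hypothetical illustration; not the actual utils.compute_doa.
    agree, total = 0, 0
    n = len(theta)
    for a in range(n):
        for b in range(n):
            if theta[a] <= theta[b]:
                continue  # keep only pairs where user a is rated stronger
            for item in set(responses[a]) & set(responses[b]):
                if responses[a][item] != responses[b][item]:
                    total += 1
                    agree += responses[a][item] > responses[b][item]
    return agree / total if total else float('nan')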
import argparse
import csv
import random
import numpy as np
import pandas as pd
import torch
from utils import *
from BPR_model import *

dtype = torch.float32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def read_file(dataTrain, dataTest):
# Compute dictionaries
df = pd.read_csv(dataTrain, names=['user_id', 'item_id','correct','knowledge'])
dfTest = pd.read_csv(dataTest, names=['user_id', 'item_id','correct','knowledge'])
    # KC dictionary: map each knowledge concept to an index
kc = df['knowledge']
kcT = dfTest['knowledge']
kc = flattern_arrays(kc.values, kcT.values)
num_kc = len(kc)
dico_kc = { k:v for (k,v) in zip(kc, range(len(kc)))}
print("NB KC", num_kc)
    # user dictionary: map each user id to an index
users = df['user_id']
usersT = dfTest['user_id']
users = flattern_arrays(users.values, usersT.values)
num_users = len(users)
dico_users = { k:v for (k,v) in zip(users, range(num_users))}
print("NB Users", num_users)
    # item dictionary: map each item (question/answer) to an index
itemsDT = df['item_id']
itemsT = dfTest['item_id']
items = flattern_arrays(itemsDT.values, itemsT.values)
num_items = len(items)
dico_items = { k:v for (k,v) in zip(items, range(num_items))}
#print("NB Items", num_items)
return dico_kc, dico_users, dico_items
def parse_dataframe(data, dico_kc, dico_users, dico_items, is_train=True):
df = pd.read_csv(data, names=['user_id', 'item_id','correct','knowledge'])
    # Build the tables of positive and negative items per KC and user,
    # and the dictionary that associates each question/answer with its KC
num_kc = len(dico_kc)
num_users = len(dico_users)
# Find positive items for each kc/user
triplets = []
y_true = []
for k in range(num_kc):
triplets.append([])
y_true.append([])
if(is_train):
item_users = {}
grouped_kc = df.groupby('knowledge')
for kc_name, df_kc in grouped_kc:
# Find positive items for each user
grouped = df_kc.groupby('user_id')
for group_name, df_group in grouped:
for row_index, row in df_group.iterrows():
col = row['item_id']
if col not in dico_items:
dico_items[col] = len(dico_items)
                # Warning: every recorded answer is treated as a positive item!
q,r = parse_it(col)
col_neg = q+'_'+str(1-int(r))
if col_neg not in dico_items:
dico_items[col_neg] = len(dico_items)
if(is_train):
triplets[dico_kc[int(kc_name)]].append([dico_users[group_name], dico_items[col], dico_items[col_neg]])
if(dico_items[col] not in item_users):
item_users[dico_items[col]] = []
item_users[dico_items[col]].append(dico_users[group_name])
if(r == 1):
y_true[dico_kc[int(kc_name)]].append(r)
else:
y_true[dico_kc[int(kc_name)]].append(0)
else:
                    # order the (q_1, q_0) pair so the expected answer y is the positive item
if(r==1):
triplets[dico_kc[int(kc_name)]].append([dico_users[group_name], dico_items[col], dico_items[col_neg]])
else:
triplets[dico_kc[int(kc_name)]].append([dico_users[group_name], dico_items[col_neg], dico_items[col]])
y_true[dico_kc[int(kc_name)]].append(r)
for k in range(num_kc):
triplets[k] = np.array(triplets[k])
y_true[k] = np.array(y_true[k])
if(is_train):
return dico_items, triplets, y_true, item_users
else:
return dico_items, triplets, y_true
def generate_quad(dico_items, triplets, t_trainy, item_users):
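    # Extend each (user, pos_item, neg_item) triplet into a quadruplet by
    # appending a user sampled among those who actually gave the negative
    # answer, falling back to the same user when no such user exists.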
l = list(dico_items)
quadriplets = []
y = []
for k in range(len(triplets)):
t_quadriplets = []
t_y = []
for i in range(len(triplets[k])):
t = triplets[k][i]
q,r = parse_it(l[t[1]])
if(t[2] in item_users) and (r == 1):
u = random.randint(0,len(item_users[t[2]])-1)
uu = item_users[t[2]][u]
t_quadriplets.append([t[0], t[1], t[2], uu])
t_y.append(t_trainy[k][i])
else:
t_quadriplets.append([t[0], t[1], t[2], t[0]])
t_y.append(t_trainy[k][i])
quadriplets.append(np.array(t_quadriplets))
y.append(np.array(t_y))
return quadriplets, y
def init_frequencielle(train, n, p, dim, dico_items):
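    # Frequency-based initialization: start from small random values, add +1
    # on dimension k for every correct answer involving KC k and -1 for every
    # incorrect one, then normalize each row to sum to 1.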
it = list(dico_items)
cc_exp_row = np.random.rand(n,dim)
cc_exp_col = np.random.rand(p,dim)
for k in range(len(train)):
for i in range(len(train[k])):
user = train[k][i][0]
item = train[k][i][1]
q,r = parse_it(it[item])
if(r == 1):
cc_exp_row[user][k] += 1
cc_exp_col[item][k] += 1
if(r == 0):
cc_exp_row[user][k] -= 1
cc_exp_col[item][k] -= 1
for i in range(n):
cc_exp_row[i] = cc_exp_row[i] / np.sum(cc_exp_row[i])
for i in range(p):
cc_exp_col[i] = cc_exp_col[i] / np.sum(cc_exp_col[i])
return cc_exp_row, cc_exp_col
def write_file_doa(FileName, embed, train, dico_kc, dico_users, dico_items):
# write embeddings
it = list(dico_items)
ut = list(dico_users)
nom = FileName+"_embed.csv"
f = open(nom, 'w')
writer = csv.writer(f)
for i in range(embed.shape[0]):
row = embed[i]
writer.writerow(row)
f.close()
    # Write R, the responses file: each row holds the KC index and user id,
    # followed by the user's positive item_resp entries for that KC
nom = FileName+"_responses.csv"
f = open(nom, 'w')
writer = csv.writer(f)
previous_user = -1
for k in range(len(dico_kc)):
for i in range(len(train[k])):
if(previous_user != -1) and (train[k][i][0] != previous_user):
positive_items = list(set(positive_items))
row = row + positive_items
writer.writerow(row)
if(train[k][i][0] != previous_user):
row = [k]+[train[k][i][0]]
positive_items = []
previous_user = train[k][i][0]
positive_items.append(it[train[k][i][1]])
# Remove duplicate positive_items
positive_items = list(set(positive_items))
row = row + positive_items
writer.writerow(row)
f.close()
#############################
#############################
# Hyperparameters
epochs = 20
batch_size = 1024
ablation = 0
# 0 no ablation, 1 ablation L2, 2 ablation init, 3 both
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-dtrain", "--dataTrain", help="data file")
parser.add_argument("-dtest", "--dataTest", help="data file")
parser.add_argument("-ab", "--ablation", help="int")
args = parser.parse_args()
dataTrain = args.dataTrain
dataTest = args.dataTest
if args.ablation:
ablation = int(args.ablation)
file = dataTrain[:-4]
dico_kc, dico_users, dico_items = read_file(dataTrain, dataTest)
embedding_size = len(dico_kc)
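    # One embedding dimension per knowledge concept, so dimension k of a user
    # embedding can be read as that user's level on KC k (this is what the
    # DOA metric below measures).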
dico_items, t_train, ty_train, item_users = parse_dataframe(dataTrain, dico_kc, dico_users, dico_items, True)
train, y_train = generate_quad(dico_items, t_train, ty_train, item_users)
dico_items, test, y_test = parse_dataframe(dataTest, dico_kc, dico_users, dico_items, False)
print("NB (question-answer) in the data", int(len(dico_items)/2))
num_users = len(dico_users)
num_items = len(dico_items)
cc_exp_row, cc_exp_col = init_frequencielle(train, num_users, num_items, embedding_size, dico_items)
bpr_model = BPRModel(batch_size, num_users, num_items, embedding_size, device, cc_exp_row, cc_exp_col, ablation)
bpr_model = bpr_model.to(device)
# Training loop
acc = bpr_model.train(train, len(dico_kc), epochs, batch_size, y_train, ablation)
# DOA
new_embedding_value = bpr_model.user_embeddings.weight.clone().detach().cpu().numpy()
write_file_doa(file, new_embedding_value, train, dico_kc, dico_users, dico_items)
doa = compute_doa(file)
print("DOA:", doa)
# Test
acc, precision = bpr_model.evaluate_model(test, len(dico_kc), y_test)
import os

# Launch every ablation setting (0-3) on each dataset
name = ["assist09", "assist17", "algebra", "math1", "math2"]
for i in range(4):
    print("Ablation (0 no ablation, 1 ablation L2, 2 ablation init, 3 both)", i)
    for a in range(len(name)):
        print(name[a])
        cmd = "python main.py --dataTrain ../data/" + name[a] + "/train.csv --dataTest ../data/" + name[a] + "/test.csv --ablation " + str(i)
        os.system(cmd)