diff --git a/Experiments.ipynb b/Experiments.ipynb index 841f82ca1ebe10cb41b12499cdb28b776bb571f2..20bad0e0e28e46e6e5bcb92a18cee7f141ddd5fa 100644 --- a/Experiments.ipynb +++ b/Experiments.ipynb @@ -1,17 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Experiment notebook\n", - "\n", - "The following jupyter notebook contains all the commands used reproduce the experiments of the paper" - ], - "metadata": { - "collapsed": false - }, - "id": "523fb0ddcbd44033" - }, { "cell_type": "code", "outputs": [], @@ -20,145 +8,51 @@ "datasets = ['assist09_tkde', 'assist17_tkde', 'algebra','math_1', 'math_2']" ], "metadata": { - "ExecuteTime": { - "end_time": "2024-02-14T09:17:51.136290823Z", - "start_time": "2024-02-14T09:17:51.130011719Z" - } + "collapsed": false }, - "id": "9d54cd9a", - "execution_count": 3 + "id": "b0ab105c19fde634" }, { "cell_type": "markdown", - "id": "1da92e4f", - "metadata": {}, "source": [ - "## Table 2: compute ACC, AUC and RMSE\n", + "## Table 2 and 3: compute ACC, AUC and RMSE and DOA\n", "\n", "Warning : can be long to compute\n", "Prerequisite : Running the makefile\n", "Output : stored in `../../results/table_2`" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "841ae3856df97ef0" }, { "cell_type": "code", - "execution_count": 7, - "id": "61c53cb1", - "metadata": { - "ExecuteTime": { - "end_time": "2024-02-14T09:39:50.059022516Z", - "start_time": "2024-02-14T09:39:12.027137257Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/arthurb/Programmation/cd-bpr\n", - "assist0910_tkde\r\n", - "Is CUDA supported by this system? 
False\r\n", - "CUDA version: None\r\n", - "==========> fold 0\r\n", - "dataTrain: ../../data/assist0910_tkde/train_valid_0.csv\r\n", - "dataTest: ../../data/assist0910_tkde/test_0.csv\r\n", - "embPath: ../../results/table_2/\r\n", - "epochs: 1\r\n", - "batch_size: 512\r\n", - "[Epoch 0] loss: 1.370641\r\n", - "Doa: 0.7126622039356039\r\n", - "AUC and RMSE: 0.7249253060269787 0.4473763215794524\r\n", - "==========> fold 1\r\n", - "dataTrain: ../../data/assist0910_tkde/train_valid_1.csv\r\n", - "dataTest: ../../data/assist0910_tkde/test_1.csv\r\n", - "embPath: ../../results/table_2/\r\n", - "epochs: 1\r\n", - "batch_size: 512\r\n", - "^C\r\n", - "Traceback (most recent call last):\r\n", - " File \"/home/arthurb/Programmation/cd-bpr/code/binary_bpr/main.py\", line 312, in <module>\r\n", - " dico_items, test, y_test = parse_dataframe(dataTest, dico_kc, dico_users, dico_items, False)\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/home/arthurb/Programmation/cd-bpr/code/binary_bpr/main.py\", line 105, in parse_dataframe\r\n", - " for row_index, row in df_group.iterrows():\r\n", - " File \"/home/arthurb/anaconda3/envs/cdbpr-env/lib/python3.11/site-packages/pandas/core/frame.py\", line 1449, in iterrows\r\n", - " for k, v in zip(self.index, self.values):\r\n", - " ^^^^^^^^^^^\r\n", - " File \"/home/arthurb/anaconda3/envs/cdbpr-env/lib/python3.11/site-packages/pandas/core/frame.py\", line 12281, in values\r\n", - " return self._mgr.as_array()\r\n", - " ^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/home/arthurb/anaconda3/envs/cdbpr-env/lib/python3.11/site-packages/pandas/core/internals/managers.py\", line 1656, in as_array\r\n", - " arr = self._interleave(dtype=dtype, na_value=na_value)\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/home/arthurb/anaconda3/envs/cdbpr-env/lib/python3.11/site-packages/pandas/core/internals/managers.py\", line 1682, in _interleave\r\n", - " dtype = interleaved_dtype( # type: 
ignore[assignment]\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/home/arthurb/anaconda3/envs/cdbpr-env/lib/python3.11/site-packages/pandas/core/internals/base.py\", line 363, in interleaved_dtype\r\n", - " return find_common_type(dtypes)\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/home/arthurb/anaconda3/envs/cdbpr-env/lib/python3.11/site-packages/pandas/core/dtypes/cast.py\", line 1428, in find_common_type\r\n", - " types = list(dict.fromkeys(types).keys())\r\n", - " ^^^^^^^^^^^^^^^^^^^^\r\n", - "KeyboardInterrupt\r\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "\n", "cmd = 'cd code/binary_bpr && python ./script.py '\n", "!{cmd}" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "f361888d67da2988" }, { "cell_type": "markdown", - "id": "6b1c6b4d", - "metadata": {}, "source": [ - "## Table 3: compute DOA\n", - "\n", - "The trained embeddings of CD-BPR and its competitors are given in the `result` directory so that the doa can be computed directly.\n", - "Nonetheless, for the sake of reproducibility, they will be overwritten by the embeddings computed with the previous cell if you run it.\n", - "\n", - "After selecting the dataset with the variable i, the doa will be computed for each competitor and each data splitting of the cross validation.\n", - "\n", - "Warning : Can be long to compute\n", - "Prerequisite : Running the makefile\n", - "Output : displayed in the notebook" - ] + "## Table 3: compute DOA" + ], + "metadata": { + "collapsed": false + }, + "id": "d8116396a802bc7d" }, { "cell_type": "code", - "execution_count": 17, - "id": "790a43dd", - "metadata": { - "ExecuteTime": { - "end_time": "2024-02-14T17:29:30.489316935Z", - "start_time": "2024-02-14T17:15:40.682755850Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "_create_directory() - handled error when creating a directory at ../experiments_logs/3: [Errno 17] File exists: '../experiments_logs/3'\r\n", - 
"dataset math_1\r\n", - "fold 0\r\n", - "model DINA\r\n", - "num_kc 11\r\n", - "model MCD\r\n", - "num_kc 11\r\n", - "^C\r\n", - "Traceback (most recent call last):\r\n", - " File \"/home/arthurb/Programmation/cd-bpr/code/./DOA.py\", line 188, in <module>\r\n", - " r_test = dao_creuse(F, kc_user, kc_user_val, dico_u)\r\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\r\n", - " File \"/home/arthurb/Programmation/cd-bpr/code/./DOA.py\", line -1, in dao_creuse\r\n", - "KeyboardInterrupt\r\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "embDirPath = \"../../results/table_2/users/\"\n", @@ -170,7 +64,11 @@ "\n", "#doa = compute_doa(path+datasets[i]+'/train_embed.csv')\n", "#print(\"DOA:\", doa)" - ] + ], + "metadata": { + "collapsed": false + }, + "id": "68a2dc1287b5cac" }, { "cell_type": "markdown", @@ -239,11 +137,9 @@ } ], "source": [ - "# 0 no ablation, 1 ablation L2, 2 ablation init, 3 both\n", - "for abla in range(4):\n", - " for i in range(5):\n", - " cmd = 'python ./binary_model/main.py --dataTrain '+ path+'data/cdbpr_format/'+datasets[i]+'/train.csv --dataTest '+path+'data/'+datasets[i]+'/test.csv --ablation '+str(abla)\n", - " os.system(cmd)" + "print(os.getcwd())\n", + "cmd = 'cd code/binary_bpr_ablation && python script_ablation.py '\n", + "!{cmd}" ] }, { @@ -282,9 +178,9 @@ ], "source": [ "# pour all\n", - "cmd = 'python ./nary_model/main_nary_cv.py --data '+ path+'data/covid/initsurvey.csv' \n", + "cmd = 'python code/nary_model/main_nary_cv.py --data '+ path+'data/covid/initsurvey.csv'\n", "print(os.system(cmd))\n", - "cmd = 'python ./nary_model/main_nary_cv.py --data '+path+'data/covid/psysurvey.csv' \n", + "cmd = 'python code/nary_model/main_nary_cv.py --data '+path+'data/covid/psysurvey.csv'\n", "print(os.system(cmd))" ] }, @@ -490,9 +386,9 @@ } ], "source": [ - "cmd = 'python ./other/decision_tree.py --lower 0'\n", + "cmd = 'python code/unsupervised_DT/decision_tree.py --lower 0'\n", "print(os.system(cmd))\n", - "cmd = 'python 
./other/decision_tree.py --lower 1'\n", + "cmd = 'python code/unsupervised_DT/decision_tree.py --lower 1'\n", "print(os.system(cmd))" ] }, @@ -552,8 +448,8 @@ "metadata": {}, "outputs": [], "source": [ - "H_users = fromDFtoArray(path+ \"other/dt/train_embed.csv\",False,'f') \n", - "classPsy = fromDFtoArray(path+ \"other/dt/train_user_quest_label.csv\",False,'f') " + "H_users = fromDFtoArray(path+ \"code/unsupervised_DT/files_for_dt/train_embed.csv\",False,'f')\n", + "classPsy = fromDFtoArray(path+ \"code/unsupervised_DT/files_for_dt/train_user_quest_label.csv\",False,'f')" ] }, { diff --git a/code/binary_bpr/main.py b/code/binary_bpr/main.py index 88c9d7d2cc4cc6071350c0c5f5e58f4bdb6b99a1..c9179a8adb1ab1e8c105fcb85dde2d41e63a3a36 100644 --- a/code/binary_bpr/main.py +++ b/code/binary_bpr/main.py @@ -62,7 +62,7 @@ def read_file(dataTrain, dataTest): dico_items = { k:v for (k,v) in zip(items, range(num_items))} return dico_kc, dico_users, dico_items - def save_embeddings(xpName: str, modelName: str, embeddings,userEmbDir : str,itemEmbDir : str): +def save_embeddings(xpName: str, modelName: str, embeddings,userEmbDir : str,itemEmbDir : str): """ Saves all the metrics measured after the training process. 
diff --git a/code/binary_bpr_ablation/script_abalation.py b/code/binary_bpr_ablation/script_abalation.py deleted file mode 100644 index 5e56acab0d89733e56e7bde3eede2869f04dd323..0000000000000000000000000000000000000000 --- a/code/binary_bpr_ablation/script_abalation.py +++ /dev/null @@ -1,9 +0,0 @@ -import os - -name = ["assist09","assist17","algebra", "math1","math2"] -for i in range(4): - print("Ablation (0 no ablation, 1 ablation L2, 2 ablation init, 3 both) ",i) - for a in range(5): - print(name[i]) - cmd = "python main.py --dataTrain ../data/"+name[i]+"/train.csv --dataTest ../data/"+name[i]+"/test.csv --ablation "+str(i) - os.system(cmd) diff --git a/code/binary_bpr_ablation/script_ablation.py b/code/binary_bpr_ablation/script_ablation.py new file mode 100644 index 0000000000000000000000000000000000000000..9c957a2630526f18c8ab6bfaf368262abd549d21 --- /dev/null +++ b/code/binary_bpr_ablation/script_ablation.py @@ -0,0 +1,9 @@ +import os + +name = ["assist0910_tkde","assist17_tkde","algebra", "math_1","math_2"] +for i in range(4): + print("Ablation (0 no ablation, 1 ablation L2, 2 ablation init, 3 both) ",i) + for a in range(5): + print(name[a]) + cmd = "python main.py --dataTrain ../../data/"+name[a]+"/train_0.csv --dataTest ../../data/"+name[a]+"/test_0.csv --ablation "+str(i) + os.system(cmd) diff --git a/code/nary_model/main_nary_cv.py b/code/nary_model/main_nary_cv.py index 609ed98d1df6a2b1a7237e1b13951ca1d4415b4d..6816f1d421833630274ad831bffc37d276dc3eac 100644 --- a/code/nary_model/main_nary_cv.py +++ b/code/nary_model/main_nary_cv.py @@ -48,7 +48,7 @@ def evaluate_all(dataTrain, dataTest, filename): new_embedding_items = bpr_model.item_embeddings.weight.clone().detach().cpu().numpy() write_file(filename+"embedding_items.csv", new_embedding_items[0:nb_item_train]) # Test - acc, precision, rappel, all_decisions = bpr_model.evaluate_model(test, len(dico_kc), y_test) + acc, precision, rappel, all_decisions, all_prefs = bpr_model.evaluate_model(test, 
len(dico_kc), y_test) ''' s = str(acc) +","+ str( precision)+ ","+str(rappel)+ ","+str(doa) for i in range(embedding_size): diff --git a/code/unsupervised_DT/decision_tree.py b/code/unsupervised_DT/decision_tree.py index bfba35be1143836458dd6cf1a684add864da03cd..a2f1a73bd8b693b76627a5608f64d45d9699d20f 100644 --- a/code/unsupervised_DT/decision_tree.py +++ b/code/unsupervised_DT/decision_tree.py @@ -302,8 +302,8 @@ if __name__ == '__main__': min_users = 50 os.getcwd() path = os.path.dirname(os.path.realpath(__file__)) -print(path+ "/dt/train_embed.csv") -H_users = fromDFtoArray(path+ "/dt/train_embed.csv",False,'f') +print(path+ "/files_for_dt/train_embed.csv") +H_users = fromDFtoArray(path+ "/files_for_dt/train_embed.csv",False,'f') dim = H_users.shape[1] n = H_users.shape[0] @@ -327,7 +327,7 @@ for i in range(len(partitions)): writer.writerow(row) f.close() ''' -partitions_init = fromDFtoArray(path+ "/dt/train_partitions.csv",False,'f') +partitions_init = fromDFtoArray(path+ "/files_for_dt/train_partitions.csv",False,'f') # Step 2: take the partition with max silhouette value best_init_part = np.argmax(partitions_init[:,2]) @@ -373,7 +373,7 @@ for i in range(1,len(theClusters)): # Validation with questionnaires print("") -file = path+ "/dt/train_user_quest_label.csv" +file = path+ "/files_for_dt/train_user_quest_label.csv" quest = fromDFtoArray(file,False,'f') questClusters = [] for i in range(len(quest)): @@ -392,4 +392,4 @@ compare_clusters(dtClust, questClusters, quest) #print(partitions_init) - \ No newline at end of file + diff --git a/data.zip b/data.zip deleted file mode 100644 index 95c40307b2783632095447bfec4bf902cce34e21..0000000000000000000000000000000000000000 Binary files a/data.zip and /dev/null differ