diff --git a/notebooks/Classification_Zero-Shot-Learning.ipynb b/notebooks/Classification_Zero-Shot-Learning.ipynb index 4a7ab888a7f66428cc6e486d2b1668d3636fa511..ad73e6a87ccb0cc4f6fc2f22426c6ee08e68e004 100644 --- a/notebooks/Classification_Zero-Shot-Learning.ipynb +++ b/notebooks/Classification_Zero-Shot-Learning.ipynb @@ -143,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -325,7 +325,7 @@ "4 INSPECTEUR, s. m. inspector ; () celui \\nà qui... 102 " ] }, - "execution_count": 18, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -337,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -346,7 +346,7 @@ "(15854, 13)" ] }, - "execution_count": 19, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -357,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -368,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -377,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -386,7 +386,7 @@ "(13441, 13)" ] }, - "execution_count": 23, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -397,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -443,7 +443,7 @@ " 'Spectacle']" ] }, - "execution_count": 32, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -471,13 +471,13 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 9, "metadata": {}, "outputs": [ { 
"data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e5a45c55993f47019fbdc0aceda84def", + "model_id": "cf58eccecc1847e48d520a83040e3ec7", "version_major": 2, "version_minor": 0 }, @@ -491,7 +491,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a7285f3fe7154920a0bb05fdb921d6f9", + "model_id": "2e91d1ab181f409f9c6263a255991c9f", "version_major": 2, "version_minor": 0 }, @@ -505,7 +505,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0cb38fe0e7934c49bbce246e88dd6e53", + "model_id": "4c23b1cb783d4c25b8332447bf25755a", "version_major": 2, "version_minor": 0 }, @@ -519,7 +519,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2c9dd205446a4b25848f1f06beeac8ae", + "model_id": "203385bbd7664b8b87250e4883cc300f", "version_major": 2, "version_minor": 0 }, @@ -533,7 +533,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "032334b5310d4588993b7d85329d916c", + "model_id": "c068998923a14abf8e38d8e0d89248ad", "version_major": 2, "version_minor": 0 }, @@ -583,7 +583,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -638,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -647,7 +647,7 @@ "\"ORNIS, s. m. toile des Indes, (Comm.) sortes de\\ntoiles de coton ou de mousseline, qui se font a Brampour ville de l'Indoustan, entre Surate & Agra. Ces\\ntoiles sont par bandes, moitié coton & moitié or &\\nargent. 
Il y en a depuis quinze jusqu'à vingt aunes.\"" ] }, - "execution_count": 49, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -658,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -671,47 +671,47 @@ { "data": { "text/plain": [ - "[('Commerce', 70.05109786987305),\n", - " ('Anatomie', 68.73835921287537),\n", - " ('Politique', 60.711854696273804),\n", - " ('Géographie', 59.15616154670715),\n", - " ('Architecture', 58.7415874004364),\n", - " ('Histoire', 57.4596107006073),\n", - " ('Agriculture - Economie rustique', 53.530728816986084),\n", - " ('Histoire naturelle', 48.45908284187317),\n", - " ('Antiquité', 46.68468534946442),\n", - " ('Beaux-arts', 42.85620450973511),\n", - " ('Mesure', 41.310253739356995),\n", - " ('Jeu', 41.221246123313904),\n", - " ('Droit - Jurisprudence', 41.13341271877289),\n", - " ('Minéralogie', 38.13730478286743),\n", - " ('Spectacle', 37.80337870121002),\n", - " ('Pêche', 37.214699387550354),\n", - " ('Superstition', 36.72800958156586),\n", - " ('Arts et métiers', 36.512187123298645),\n", - " ('Métiers', 36.505624651908875),\n", - " ('Monnaie', 35.89847385883331),\n", - " ('Musique', 32.74992108345032),\n", - " ('Mathématiques', 32.70103633403778),\n", - " ('Chasse', 29.351988434791565),\n", - " ('Economie domestique', 28.34642231464386),\n", - " ('Philosophie', 27.653351426124573),\n", - " ('Chimie', 25.783848762512207),\n", - " ('Physique - [Sciences physico-mathématiques]', 25.403589010238647),\n", - " ('Médailles', 24.586908519268036),\n", - " ('Grammaire', 22.362521290779114),\n", - " ('Caractères', 20.14842927455902),\n", - " ('Pharmacie', 19.72046047449112),\n", - " ('Militaire (Art) - Guerre - Arme', 19.68255490064621),\n", - " ('Médecine - Chirurgie', 18.616029620170593),\n", - " ('Marine', 18.207980692386627),\n", - " ('Belles-lettres - Poésie', 13.30692172050476),\n", - " ('Blason', 10.476682335138321),\n", - " ('Religion', 
9.701979160308838),\n", - " ('Maréchage - Manège', 4.211422428488731)]" + "[('Commerce', 70.05096077919006),\n", + " ('Anatomie', 68.73840689659119),\n", + " ('Politique', 60.71174740791321),\n", + " ('Géographie', 59.156250953674316),\n", + " ('Architecture', 58.74174237251282),\n", + " ('Histoire', 57.459235191345215),\n", + " ('Agriculture - Economie rustique', 53.53081226348877),\n", + " ('Histoire naturelle', 48.459288477897644),\n", + " ('Antiquité', 46.68458700180054),\n", + " ('Beaux-arts', 42.856183648109436),\n", + " ('Mesure', 41.31035804748535),\n", + " ('Jeu', 41.22118949890137),\n", + " ('Droit - Jurisprudence', 41.1332905292511),\n", + " ('Minéralogie', 38.137245178222656),\n", + " ('Spectacle', 37.80339956283569),\n", + " ('Pêche', 37.214648723602295),\n", + " ('Superstition', 36.727988719940186),\n", + " ('Arts et métiers', 36.511969566345215),\n", + " ('Métiers', 36.5054726600647),\n", + " ('Monnaie', 35.89862287044525),\n", + " ('Musique', 32.74966776371002),\n", + " ('Mathématiques', 32.70111680030823),\n", + " ('Chasse', 29.35197949409485),\n", + " ('Economie domestique', 28.346234560012817),\n", + " ('Philosophie', 27.653270959854126),\n", + " ('Chimie', 25.783824920654297),\n", + " ('Physique - [Sciences physico-mathématiques]', 25.4037082195282),\n", + " ('Médailles', 24.58679974079132),\n", + " ('Grammaire', 22.36253321170807),\n", + " ('Caractères', 20.14845609664917),\n", + " ('Pharmacie', 19.720394909381866),\n", + " ('Militaire (Art) - Guerre - Arme', 19.682711362838745),\n", + " ('Médecine - Chirurgie', 18.615825474262238),\n", + " ('Marine', 18.208028376102448),\n", + " ('Belles-lettres - Poésie', 13.306896388530731),\n", + " ('Blason', 10.476677119731903),\n", + " ('Religion', 9.702161699533463),\n", + " ('Maréchage - Manège', 4.211411997675896)]" ] }, - "execution_count": 50, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -732,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 51, + 
"execution_count": 13, "metadata": {}, "outputs": [ { @@ -741,7 +741,7 @@ "'Commerce'" ] }, - "execution_count": 51, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -752,1041 +752,110 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "pred_labels = []\n", - "prob_labels = []\n", - "\n", - "for content in tqdm(df[column_text].tolist()):\n", - "\n", - " true_probs = zero_shot_prediction(content[:1024], classes)\n", - " \n", - " pred_labels.append(get_highest_score(true_probs, classes)[0])\n", - " prob_labels.append(get_sorted_scores(true_probs, classes))\n" + "y_true = df[column_class].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, + "outputs": [], + "source": [ + "def get_tsv_content(y_true, prob_labels):\n", + " c = ''\n", + " for i, row in enumerate(prob_labels):\n", + " c += y_true[i] + '\\t'\n", + " for t in row:\n", + " c += t[0] + '\\t' + str(t[1])+'\\t'\n", + " c += '\\n'\n", + "\n", + " return c" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Commerce',\n", - " 'Marine',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Histoire',\n", - " 'Marine',\n", - " 'Belles-lettres - Poésie',\n", - " 'Economie domestique',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Beaux-arts',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Histoire naturelle',\n", - " 'Grammaire',\n", - " 'Philosophie',\n", - " 'Histoire',\n", - " 'Médecine - Chirurgie',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Arts et métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 
'Antiquité',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Pharmacie',\n", - " 'Grammaire',\n", - " 'Religion',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Médecine - Chirurgie',\n", - " 'Pêche',\n", - " 'Anatomie',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Musique',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Belles-lettres - Poésie',\n", - " 'Commerce',\n", - " 'Jeu',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Caractères',\n", - " 'Métiers',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Histoire naturelle',\n", - " 'Architecture',\n", - " 'Droit - Jurisprudence',\n", - " 'Belles-lettres - Poésie',\n", - " 'Métiers',\n", - " 'Histoire',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Maréchage - Manège',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Grammaire',\n", - " 'Grammaire',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Architecture',\n", - " 'Grammaire',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Chimie',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Blason',\n", - " 'Géographie',\n", - " 'Blason',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Grammaire',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Métiers',\n", - " 'Médecine - 
Chirurgie',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Antiquité',\n", - " 'Philosophie',\n", - " 'Grammaire',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Grammaire',\n", - " 'Médecine - Chirurgie',\n", - " 'Métiers',\n", - " 'Chasse',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Mathématiques',\n", - " 'Histoire naturelle',\n", - " 'Marine',\n", - " 'Architecture',\n", - " 'Philosophie',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Métiers',\n", - " 'Histoire',\n", - " 'Antiquité',\n", - " 'Commerce',\n", - " 'Histoire',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Philosophie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Architecture',\n", - " 'Commerce',\n", - " 'Architecture',\n", - " 'Droit - Jurisprudence',\n", - " 'Commerce',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Médailles',\n", - " 'Marine',\n", - " 'Superstition',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Marine',\n", - " 'Antiquité',\n", - " 'Histoire naturelle',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Marine',\n", - " 'Grammaire',\n", - " 'Médecine - Chirurgie',\n", - " 
'Métiers',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Architecture',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Beaux-arts',\n", - " 'Histoire naturelle',\n", - " 'Chasse',\n", - " 'Grammaire',\n", - " 'Métiers',\n", - " 'Agriculture - Economie rustique',\n", - " 'Marine',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Grammaire',\n", - " 'Architecture',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Chasse',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Commerce',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Commerce',\n", - " 'Belles-lettres - Poésie',\n", - " 'Belles-lettres - Poésie',\n", - " 'Histoire naturelle',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Métiers',\n", - " 'Marine',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Grammaire',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Belles-lettres - Poésie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Blason',\n", - " 'Anatomie',\n", - " 'Architecture',\n", - " 'Histoire naturelle',\n", - " 'Musique',\n", - " 'Droit - Jurisprudence',\n", - " 'Commerce',\n", - " 'Histoire',\n", - " 'Pharmacie',\n", - " 'Marine',\n", - " 'Commerce',\n", - " 'Histoire naturelle',\n", - " 'Jeu',\n", - " 'Droit - Jurisprudence',\n", - " 'Grammaire',\n", - " 
'Histoire',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Mathématiques',\n", - " 'Médecine - Chirurgie',\n", - " 'Religion',\n", - " 'Chimie',\n", - " 'Blason',\n", - " 'Antiquité',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Belles-lettres - Poésie',\n", - " 'Métiers',\n", - " 'Religion',\n", - " 'Droit - Jurisprudence',\n", - " 'Arts et métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Beaux-arts',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Economie domestique',\n", - " 'Blason',\n", - " 'Architecture',\n", - " 'Maréchage - Manège',\n", - " 'Pharmacie',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Marine',\n", - " 'Histoire naturelle',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Belles-lettres - Poésie',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Histoire',\n", - " 'Chimie',\n", - " 'Antiquité',\n", - " 'Religion',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Architecture',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Mesure',\n", - " 'Droit - Jurisprudence',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Marine',\n", - " 'Arts et 
métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Architecture',\n", - " 'Histoire naturelle',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Belles-lettres - Poésie',\n", - " 'Droit - Jurisprudence',\n", - " 'Marine',\n", - " 'Architecture',\n", - " 'Agriculture - Economie rustique',\n", - " 'Histoire naturelle',\n", - " 'Métiers',\n", - " 'Mathématiques',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Musique',\n", - " 'Chasse',\n", - " 'Médecine - Chirurgie',\n", - " 'Antiquité',\n", - " 'Médecine - Chirurgie',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Métiers',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Chasse',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Grammaire',\n", - " 'Grammaire',\n", - " 'Médecine - Chirurgie',\n", - " 'Mathématiques',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Mathématiques',\n", - " 'Histoire',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Grammaire',\n", - " 'Marine',\n", - " 'Marine',\n", - " 'Blason',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Histoire',\n", - " 'Marine',\n", - " 'Marine',\n", - " 'Métiers',\n", - " 'Belles-lettres - Poésie',\n", - " 'Religion',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 
'Musique',\n", - " 'Marine',\n", - " 'Marine',\n", - " 'Maréchage - Manège',\n", - " 'Religion',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Belles-lettres - Poésie',\n", - " 'Grammaire',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Chasse',\n", - " 'Métiers',\n", - " 'Histoire',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Antiquité',\n", - " 'Monnaie',\n", - " 'Anatomie',\n", - " 'Métiers',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Anatomie',\n", - " 'Histoire naturelle',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Jeu',\n", - " 'Agriculture - Economie rustique',\n", - " 'Commerce',\n", - " 'Philosophie',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Agriculture - Economie rustique',\n", - " 'Histoire naturelle',\n", - " 'Antiquité',\n", - " 'Histoire naturelle',\n", - " 'Métiers',\n", - " 'Métiers',\n", - " 'Arts et métiers',\n", - " 'Architecture',\n", - " 'Métiers',\n", - " 'Religion',\n", - " 'Médecine - Chirurgie',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Histoire',\n", - " 'Mathématiques',\n", - " 'Pêche',\n", - " 'Religion',\n", - " 'Commerce',\n", - " 'Histoire',\n", - " 'Commerce',\n", - " 'Métiers',\n", - " 'Religion',\n", - " 'Religion',\n", - " 'Médecine - Chirurgie',\n", - " 'Métiers',\n", - " 'Musique',\n", - " 'Monnaie',\n", - " 'Antiquité',\n", - " 'Grammaire',\n", - " 'Anatomie',\n", - " 'Marine',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Anatomie',\n", - " 'Métiers',\n", - " 'Métiers',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Architecture',\n", - " 'Belles-lettres - Poésie',\n", - " 'Droit - Jurisprudence',\n", - " 'Arts et métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 
'Mathématiques',\n", - " 'Beaux-arts',\n", - " 'Histoire',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Histoire naturelle',\n", - " 'Minéralogie',\n", - " 'Maréchage - Manège',\n", - " 'Métiers',\n", - " 'Blason',\n", - " 'Marine',\n", - " 'Droit - Jurisprudence',\n", - " 'Maréchage - Manège',\n", - " 'Caractères',\n", - " 'Métiers',\n", - " 'Histoire',\n", - " 'Droit - Jurisprudence',\n", - " 'Philosophie',\n", - " 'Anatomie',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Religion',\n", - " 'Arts et métiers',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Philosophie',\n", - " 'Agriculture - Economie rustique',\n", - " 'Commerce',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Musique',\n", - " 'Histoire naturelle',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire naturelle',\n", - " 'Arts et métiers',\n", - " 'Maréchage - Manège',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Droit - Jurisprudence',\n", - " 'Architecture',\n", - " 'Commerce',\n", - " 'Musique',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Politique',\n", - " 'Grammaire',\n", - " 'Commerce',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Médecine - Chirurgie',\n", - " 'Métiers',\n", - " 'Marine',\n", - " 'Histoire',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Monnaie',\n", - " 'Agriculture - Economie 
rustique',\n", - " 'Métiers',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Histoire naturelle',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Anatomie',\n", - " 'Anatomie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Chimie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Grammaire',\n", - " 'Pharmacie',\n", - " 'Grammaire',\n", - " 'Métiers',\n", - " 'Métiers',\n", - " 'Histoire naturelle',\n", - " 'Arts et métiers',\n", - " 'Antiquité',\n", - " 'Pharmacie',\n", - " 'Histoire naturelle',\n", - " 'Marine',\n", - " 'Religion',\n", - " 'Histoire',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Maréchage - Manège',\n", - " 'Jeu',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Agriculture - Economie rustique',\n", - " 'Antiquité',\n", - " 'Médecine - Chirurgie',\n", - " 'Chimie',\n", - " 'Métiers',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Géographie',\n", - " 'Politique',\n", - " 'Histoire',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Grammaire',\n", - " 'Marine',\n", - " 'Métiers',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Marine',\n", - " 'Commerce',\n", - " 'Grammaire',\n", - " 'Histoire naturelle',\n", - " 'Histoire naturelle',\n", - " 'Maréchage - Manège',\n", - " 'Métiers',\n", - " 'Grammaire',\n", - " 'Métiers',\n", - " 'Métiers',\n", - " 'Histoire naturelle',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Musique',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Militaire (Art) - Guerre 
- Arme',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Commerce',\n", - " 'Métiers',\n", - " 'Marine',\n", - " 'Marine',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Marine',\n", - " 'Religion',\n", - " 'Droit - Jurisprudence',\n", - " 'Musique',\n", - " 'Histoire naturelle',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Mesure',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Musique',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Médecine - Chirurgie',\n", - " 'Blason',\n", - " 'Belles-lettres - Poésie',\n", - " 'Métiers',\n", - " 'Economie domestique',\n", - " 'Géographie',\n", - " 'Mathématiques',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Chimie',\n", - " 'Géographie',\n", - " 'Belles-lettres - Poésie',\n", - " 'Marine',\n", - " 'Anatomie',\n", - " 'Commerce',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Métiers',\n", - " 'Marine',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Métiers',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Belles-lettres - Poésie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Histoire',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Agriculture - Economie rustique',\n", - " 'Histoire naturelle',\n", - " 'Médecine - Chirurgie',\n", - " 'Architecture',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Commerce',\n", - " 
'Antiquité',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Religion',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Histoire',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Jeu',\n", - " 'Marine',\n", - " 'Métiers',\n", - " 'Marine',\n", - " 'Architecture',\n", - " 'Architecture',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Musique',\n", - " 'Métiers',\n", - " 'Histoire naturelle',\n", - " 'Antiquité',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Marine',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Marine',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Métiers',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Histoire naturelle',\n", - " 'Marine',\n", - " 'Métiers',\n", - " 'Pêche',\n", - " 'Droit - Jurisprudence',\n", - " 'Marine',\n", - " 'Histoire naturelle',\n", - " 'Droit - Jurisprudence',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Agriculture - Economie rustique',\n", - " 'Histoire naturelle',\n", - " 'Grammaire',\n", - " 'Architecture',\n", - " 'Anatomie',\n", - " 'Commerce',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Histoire naturelle',\n", - " 'Antiquité',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Droit - Jurisprudence',\n", - " 'Chasse',\n", - " 'Métiers',\n", - " 'Médecine - Chirurgie',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire naturelle',\n", - " 'Métiers',\n", - " 'Grammaire',\n", - " 'Architecture',\n", - " 'Philosophie',\n", - " 'Géographie',\n", - " 'Chasse',\n", - " 'Militaire 
(Art) - Guerre - Arme',\n", - " 'Métiers',\n", - " 'Histoire naturelle',\n", - " 'Antiquité',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire naturelle',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Antiquité',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Chasse',\n", - " 'Architecture',\n", - " 'Histoire',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Histoire',\n", - " 'Droit - Jurisprudence',\n", - " 'Marine',\n", - " 'Agriculture - Economie rustique',\n", - " 'Architecture',\n", - " 'Commerce',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Anatomie',\n", - " 'Histoire',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Médecine - Chirurgie',\n", - " 'Histoire naturelle',\n", - " 'Architecture',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Marine',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Marine',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Géographie',\n", - " 'Architecture',\n", - " 'Géographie',\n", - " 'Chimie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Mathématiques',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Médecine - Chirurgie',\n", - " 'Chasse',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Religion',\n", - " 'Agriculture - Economie rustique',\n", - " 'Marine',\n", - " 'Commerce',\n", - " 'Anatomie',\n", - " 'Anatomie',\n", - 
" 'Agriculture - Economie rustique',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Musique',\n", - " 'Architecture',\n", - " 'Marine',\n", - " 'Histoire',\n", - " 'Géographie',\n", - " 'Grammaire',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Superstition',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Histoire naturelle',\n", - " 'Agriculture - Economie rustique',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire naturelle',\n", - " 'Métiers',\n", - " 'Pharmacie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Droit - Jurisprudence',\n", - " 'Grammaire',\n", - " 'Histoire',\n", - " 'Agriculture - Economie rustique',\n", - " 'Droit - Jurisprudence',\n", - " 'Histoire naturelle',\n", - " 'Histoire',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Histoire',\n", - " 'Histoire',\n", - " 'Belles-lettres - Poésie',\n", - " 'Histoire',\n", - " 'Architecture',\n", - " 'Physique - [Sciences physico-mathématiques]',\n", - " 'Philosophie',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Religion',\n", - " 'Droit - Jurisprudence',\n", - " 'Médecine - Chirurgie',\n", - " 'Droit - Jurisprudence',\n", - " 'Antiquité',\n", - " 'Grammaire',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Marine',\n", - " 'Mathématiques',\n", - " 'Antiquité',\n", - " 'Caractères',\n", - " 'Commerce',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Géographie',\n", - " 'Métiers',\n", - " 'Médecine - Chirurgie',\n", - " 'Belles-lettres - Poésie',\n", - " 'Métiers',\n", - " 'Médecine - Chirurgie',\n", 
- " 'Droit - Jurisprudence',\n", - " 'Antiquité',\n", - " 'Maréchage - Manège',\n", - " 'Economie domestique',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Agriculture - Economie rustique',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Commerce',\n", - " 'Beaux-arts',\n", - " 'Mathématiques',\n", - " 'Géographie',\n", - " 'Antiquité',\n", - " 'Géographie',\n", - " 'Histoire naturelle',\n", - " 'Médecine - Chirurgie',\n", - " 'Géographie',\n", - " 'Droit - Jurisprudence',\n", - " 'Droit - Jurisprudence',\n", - " 'Métiers',\n", - " 'Militaire (Art) - Guerre - Arme',\n", - " 'Chasse',\n", - " 'Marine',\n", - " 'Géographie',\n", - " 'Agriculture - Economie rustique',\n", - " 'Métiers',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " 'Géographie',\n", - " ...]" + "672.05" ] }, - "execution_count": 40, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "true_labels = df[column_class].tolist()" + "len(texts) / 20" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%| | 5/673 [10:28<23:19:26, 125.70s/it]\n" + ] + }, + { + "ename": "IndexError", + "evalue": "string index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[36], line 12\u001b[0m\n\u001b[1;32m 8\u001b[0m prob_labels \u001b[39m=\u001b[39m []\n\u001b[1;32m 10\u001b[0m \u001b[39mfor\u001b[39;00m j, content \u001b[39min\u001b[39;00m \u001b[39menumerate\u001b[39m(batch):\n\u001b[0;32m---> 12\u001b[0m true_probs \u001b[39m=\u001b[39m zero_shot_prediction(content[i][:\u001b[39m1024\u001b[39m], classes)\n\u001b[1;32m 14\u001b[0m 
\u001b[39m#pred_labels.append(get_highest_score(true_probs, classes)[0])\u001b[39;00m\n\u001b[1;32m 15\u001b[0m prob_labels\u001b[39m.\u001b[39mappend(get_sorted_scores(true_probs, classes))\n", + "\u001b[0;31mIndexError\u001b[0m: string index out of range" + ] + } + ], + "source": [ + "texts = df[column_text].tolist()\n", + "batch_size = 20\n", + "\n", + "for i in tqdm(range(0, len(texts), batch_size)):\n", + "    batch = texts[i:i+batch_size]\n", + "    batch_y_true = y_true[i:i+batch_size]\n", + "\n", + "    prob_labels = []\n", + "\n", + "    for content in batch:\n", + "        # Each `content` is already one text; cap it at 1024 characters.\n", + "        true_probs = zero_shot_prediction(content[:1024], classes)\n", + "\n", + "        #pred_labels.append(get_highest_score(true_probs, classes)[0])\n", + "        prob_labels.append(get_sorted_scores(true_probs, classes))\n", + "\n", + "    with open('zero-shot-classification.tsv', 'a', encoding='utf-8') as f:\n", + "        f.write(get_tsv_content(batch_y_true, prob_labels))\n", + "    #print(prob_labels) \n" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -1817,11 +886,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.15" }, "vscode": { "interpreter": { - "hash": "16fac9c2d845f8e1f8c6fffffe3d3a0def61c7e42da17a08d00f279ad4dea797" + "hash": "c5e52a88b09a5dede77937220424e7ea0969cc6c60aea64b9a5e920a2b19e73c" } } },