Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
MTA_Heritage
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ANR AAA
MTA_Heritage
Commits
abdb63a0
Commit
abdb63a0
authored
6 months ago
by
Tetiana Yemelianenko
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
b34de20d
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Retrieve_combined_features.ipynb
+309
-0
309 additions, 0 deletions
Retrieve_combined_features.ipynb
with
309 additions
and
0 deletions
Retrieve_combined_features.ipynb
0 → 100644
+
309
−
0
View file @
abdb63a0
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f8b76e74",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from PIL import Image\n",
"from annoy import AnnoyIndex\n",
"import random\n",
"\n",
"base_dir = 'path_to_the_dataset'\n",
"top_count = 100\n",
"top_K = 11"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43c9c917",
"metadata": {},
"outputs": [],
"source": [
"#the 1st column is the index, the 2nd is the path to the painting, the remaining columns are pre-calculated embeddings\n",
"df_artist = pd.read_csv('dino_features_wikiart_artist.csv')\n",
"df_genre = pd.read_csv('dino_features_wikiart_genre.csv')\n",
"df_style = pd.read_csv('dino_features_wikiart_style.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be5120fe",
"metadata": {},
"outputs": [],
"source": [
"def build_index(df, filename):\n",
" NUMBER_OF_TREES = 25\n",
" feature_dim = len(df.columns) - 2\n",
"\n",
" t = AnnoyIndex(feature_dim, metric='euclidean')\n",
"\n",
" for i in range(len(df)):\n",
" vector = df.loc[i][2:]\n",
" t.add_item(i, vector)\n",
" \n",
" _ = t.build(NUMBER_OF_TREES)\n",
"\n",
" #save indexes\n",
" t.save(base_dir + filename)\n",
" return t"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "afa13d41",
"metadata": {},
"outputs": [],
"source": [
"t_genre = build_index(df_genre, 'dino_genre.ann')\n",
"t_style = build_index(df_style, 'dino_style.ann')\n",
"t_artist = build_index(df_artist, 'dino_artist.ann')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "feff4c2f",
"metadata": {},
"outputs": [],
"source": [
"#load the saved indexes from disk\n",
"feature_dim = len(df_genre.columns) - 2\n",
"\n",
"t_genre = AnnoyIndex(feature_dim, metric='euclidean')\n",
"t_genre.load('dino_genre.ann')\n",
"\n",
"t_style = AnnoyIndex(feature_dim, metric='euclidean')\n",
"t_style.load('dino_style.ann')\n",
"\n",
"t_artist = AnnoyIndex(feature_dim, metric='euclidean')\n",
"t_artist.load('dino_artist.ann')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee4c29ab",
"metadata": {},
"outputs": [],
"source": [
"#retrieve the nearest-neighbour images (and their distances) using Annoy\n",
"def get_similar_images_annoy(vector, t):\n",
" indices, dists = t.get_nns_by_vector(vector, top_count+1, include_distances=True)\n",
" return indices, dists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b10c4d0",
"metadata": {},
"outputs": [],
"source": [
"def show_similar(img_str, img_list, k, distances):\n",
" img = Image.open(base_dir + img_str)\n",
" plt.title('Request painting')\n",
" plt.axis(\"off\")\n",
" plt.imshow(img)\n",
"\n",
" plt.figure(figsize=(30, 30))\n",
" ax = plt.subplot(1, k, 1)\n",
" \n",
" i = 0\n",
" for key in img_list:\n",
" ax = plt.subplot(1, k, i+1)\n",
" img = Image.open(base_dir + key)\n",
" plt.title(\"{}\".format(round(distances[i], 4)))\n",
" plt.imshow(img)\n",
" i+=1\n",
"\n",
" plt.axis(\"off\")\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29ccae77",
"metadata": {},
"outputs": [],
"source": [
"#request image\n",
"img_str = 'Impressionism/pierre-auguste-renoir_children-on-the-seashore-1883.jpg'\n",
"\n",
"#pre-calculated embeddings of the request painting\n",
"img_req_g = np.array(df_genre[df_genre['file_path']==img_str])[0][2:]\n",
"img_req_s = np.array(df_style[df_style['file_path']==img_str])[0][2:]\n",
"img_req_a = np.array(df_artist[df_artist['file_path']==img_str])[0][2:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c2c37c1",
"metadata": {},
"outputs": [],
"source": [
"similar_images_g = []\n",
"similar_images_s = []\n",
"similar_images_a = []\n",
"\n",
"similar_img_ids_g, distances_g = get_similar_images_annoy(img_req_g, t_genre)\n",
"similar_images_g = list(df_genre.iloc[similar_img_ids_g]['file_path'])\n",
"\n",
"similar_img_ids_s, distances_s = get_similar_images_annoy(img_req_s, t_style)\n",
"similar_images_s = list(df_style.iloc[similar_img_ids_s]['file_path'])\n",
"\n",
"similar_img_ids_a, distances_a = get_similar_images_annoy(img_req_a, t_artist)\n",
"similar_images_a = list(df_artist.iloc[similar_img_ids_a]['file_path'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eac7636a",
"metadata": {},
"outputs": [],
"source": [
"#pre-calculated values of mean and std of Euclidean distances on WikiArt dataset\n",
"mean_genre = 19.6129883775738\n",
"std_genre = 2.9717604654152274\n",
"mean_style = 18.328733753985198\n",
"std_style = 2.5707980486965547\n",
"mean_artist = 20.973407871326454\n",
"std_artist = 2.9063593849149183"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08e0941d",
"metadata": {},
"outputs": [],
"source": [
"# Collecting the Euclidean distances returned by Annoy (the indexes use metric='euclidean')\n",
"#request painting belongs to the dataset, so we ignore the first similar painting\n",
"df_genre_sim = df_genre[df_genre['file_path'].isin(similar_images_g)]\n",
"cosines_genre = distances_g[1:]\n",
"\n",
"df_style_sim = df_style[df_style['file_path'].isin(similar_images_s)]\n",
"cosines_style = distances_s[1:]\n",
"\n",
"df_artist_sim = df_artist[df_artist['file_path'].isin(similar_images_a)]\n",
"cosines_artist = distances_a[1:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b48542d3",
"metadata": {},
"outputs": [],
"source": [
"d = {'file_path': similar_images_g[1:], 'd_g': cosines_genre}\n",
"df_g = pd.DataFrame(data=d)\n",
"\n",
"d = {'file_path': similar_images_s[1:], 'd_s': cosines_style}\n",
"df_s = pd.DataFrame(data=d)\n",
"\n",
"d = {'file_path': similar_images_a[1:], 'd_a': cosines_artist}\n",
"df_a = pd.DataFrame(data=d)\n",
"\n",
"df_u = pd.merge(df_g, df_s, on='file_path', how='outer')\n",
"df_u = pd.merge(df_u, df_a, on='file_path', how='outer')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92bb15c8",
"metadata": {},
"outputs": [],
"source": [
"def func(x):\n",
" missing_indices = x.index[x.isnull()]\n",
" for index in missing_indices:\n",
" if x.name == 'd_g':\n",
" img_sim = df_u.loc[index, 'file_path']\n",
" eucl_distance = np.linalg.norm(img_req_g - df_genre[df_genre['file_path'] == img_sim].iloc[0, 2:])\n",
" x.loc[index] = eucl_distance\n",
" elif x.name == 'd_s': \n",
" img_sim = df_u.loc[index, 'file_path']\n",
" eucl_distance = np.linalg.norm(img_req_s - df_style[df_style['file_path'] == img_sim].iloc[0, 2:])\n",
" x.loc[index] = eucl_distance\n",
" elif x.name == 'd_a': \n",
" img_sim = df_u.loc[index, 'file_path']\n",
" eucl_distance = np.linalg.norm(img_req_a - df_artist[df_artist['file_path'] == img_sim].iloc[0, 2:])\n",
" x.loc[index] = eucl_distance\n",
"\n",
" return x\n",
"\n",
"df_u = df_u.apply(func, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e464ba0",
"metadata": {},
"outputs": [],
"source": [
"df_u['d_g'] = (df_u['d_g'] - mean_genre)/std_genre\n",
"df_u['d_s'] = (df_u['d_s'] - mean_style)/std_style\n",
"df_u['d_a'] = (df_u['d_a'] - mean_artist)/std_artist"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ea5a057",
"metadata": {},
"outputs": [],
"source": [
"#combined recommendations with the same weights\n",
"df_u['combined'] = df_u['d_g'] + df_u['d_s'] + df_u['d_a']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9e0d83f",
"metadata": {},
"outputs": [],
"source": [
"paintings = list(df_u.sort_values(by=['combined'])['file_path'])[:top_K]\n",
"dist = list(df_u.sort_values(by=['combined'])['combined'])[:top_K]\n",
"show_similar(img_str, paintings, top_K, dist)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:f8b76e74 tags:
```
python
import
pandas
as
pd
import
numpy
as
np
import
matplotlib.pyplot
as
plt
from
PIL
import
Image
from
annoy
import
AnnoyIndex
import
random
base_dir
=
'
path_to_the_dataset
'
top_count
=
100
top_K
=
11
```
%% Cell type:code id:43c9c917 tags:
```
python
#the 1st column is the index, the 2nd is the path to the painting, the remaining columns are pre-calculated embeddings
df_artist
=
pd
.
read_csv
(
'
dino_features_wikiart_artist.csv
'
)
df_genre
=
pd
.
read_csv
(
'
dino_features_wikiart_genre.csv
'
)
df_style
=
pd
.
read_csv
(
'
dino_features_wikiart_style.csv
'
)
```
%% Cell type:code id:be5120fe tags:
```
python
def
build_index
(
df
,
filename
):
NUMBER_OF_TREES
=
25
feature_dim
=
len
(
df
.
columns
)
-
2
t
=
AnnoyIndex
(
feature_dim
,
metric
=
'
euclidean
'
)
for
i
in
range
(
len
(
df
)):
vector
=
df
.
loc
[
i
][
2
:]
t
.
add_item
(
i
,
vector
)
_
=
t
.
build
(
NUMBER_OF_TREES
)
#save indexes
t
.
save
(
base_dir
+
filename
)
return
t
```
%% Cell type:code id:afa13d41 tags:
```
python
t_genre
=
build_index
(
df_genre
,
'
dino_genre.ann
'
)
t_style
=
build_index
(
df_style
,
'
dino_style.ann
'
)
t_artist
=
build_index
(
df_artist
,
'
dino_artist.ann
'
)
```
%% Cell type:code id:feff4c2f tags:
```
python
#load the saved indexes from disk
feature_dim
=
len
(
df_genre
.
columns
)
-
2
t_genre
=
AnnoyIndex
(
feature_dim
,
metric
=
'
euclidean
'
)
t_genre
.
load
(
'
dino_genre.ann
'
)
t_style
=
AnnoyIndex
(
feature_dim
,
metric
=
'
euclidean
'
)
t_style
.
load
(
'
dino_style.ann
'
)
t_artist
=
AnnoyIndex
(
feature_dim
,
metric
=
'
euclidean
'
)
t_artist
.
load
(
'
dino_artist.ann
'
)
```
%% Cell type:code id:ee4c29ab tags:
```
python
#retrieve the nearest-neighbour images (and their distances) using Annoy
def
get_similar_images_annoy
(
vector
,
t
):
indices
,
dists
=
t
.
get_nns_by_vector
(
vector
,
top_count
+
1
,
include_distances
=
True
)
return
indices
,
dists
```
%% Cell type:code id:1b10c4d0 tags:
```
python
def
show_similar
(
img_str
,
img_list
,
k
,
distances
):
img
=
Image
.
open
(
base_dir
+
img_str
)
plt
.
title
(
'
Request painting
'
)
plt
.
axis
(
"
off
"
)
plt
.
imshow
(
img
)
plt
.
figure
(
figsize
=
(
30
,
30
))
ax
=
plt
.
subplot
(
1
,
k
,
1
)
i
=
0
for
key
in
img_list
:
ax
=
plt
.
subplot
(
1
,
k
,
i
+
1
)
img
=
Image
.
open
(
base_dir
+
key
)
plt
.
title
(
"
{}
"
.
format
(
round
(
distances
[
i
],
4
)))
plt
.
imshow
(
img
)
i
+=
1
plt
.
axis
(
"
off
"
)
plt
.
show
()
```
%% Cell type:code id:29ccae77 tags:
```
python
#request image
img_str
=
'
Impressionism/pierre-auguste-renoir_children-on-the-seashore-1883.jpg
'
#pre-calculated embeddings of the request painting
img_req_g
=
np
.
array
(
df_genre
[
df_genre
[
'
file_path
'
]
==
img_str
])[
0
][
2
:]
img_req_s
=
np
.
array
(
df_style
[
df_style
[
'
file_path
'
]
==
img_str
])[
0
][
2
:]
img_req_a
=
np
.
array
(
df_artist
[
df_artist
[
'
file_path
'
]
==
img_str
])[
0
][
2
:]
```
%% Cell type:code id:9c2c37c1 tags:
```
python
similar_images_g
=
[]
similar_images_s
=
[]
similar_images_a
=
[]
similar_img_ids_g
,
distances_g
=
get_similar_images_annoy
(
img_req_g
,
t_genre
)
similar_images_g
=
list
(
df_genre
.
iloc
[
similar_img_ids_g
][
'
file_path
'
])
similar_img_ids_s
,
distances_s
=
get_similar_images_annoy
(
img_req_s
,
t_style
)
similar_images_s
=
list
(
df_style
.
iloc
[
similar_img_ids_s
][
'
file_path
'
])
similar_img_ids_a
,
distances_a
=
get_similar_images_annoy
(
img_req_a
,
t_artist
)
similar_images_a
=
list
(
df_artist
.
iloc
[
similar_img_ids_a
][
'
file_path
'
])
```
%% Cell type:code id:eac7636a tags:
```
python
#pre-calculated values of mean and std of Euclidean distances on WikiArt dataset
mean_genre
=
19.6129883775738
std_genre
=
2.9717604654152274
mean_style
=
18.328733753985198
std_style
=
2.5707980486965547
mean_artist
=
20.973407871326454
std_artist
=
2.9063593849149183
```
%% Cell type:code id:08e0941d tags:
```
python
# Collecting the Euclidean distances returned by Annoy (the indexes use metric='euclidean')
#request painting belongs to the dataset, so we ignore the first similar painting
df_genre_sim
=
df_genre
[
df_genre
[
'
file_path
'
].
isin
(
similar_images_g
)]
cosines_genre
=
distances_g
[
1
:]
df_style_sim
=
df_style
[
df_style
[
'
file_path
'
].
isin
(
similar_images_s
)]
cosines_style
=
distances_s
[
1
:]
df_artist_sim
=
df_artist
[
df_artist
[
'
file_path
'
].
isin
(
similar_images_a
)]
cosines_artist
=
distances_a
[
1
:]
```
%% Cell type:code id:b48542d3 tags:
```
python
d
=
{
'
file_path
'
:
list
(
df_genre_sim
[
'
file_path
'
].
iloc
[
1
:]),
'
d_g
'
:
cosines_genre
}
df_g
=
pd
.
DataFrame
(
data
=
d
)
d
=
{
'
file_path
'
:
list
(
df_style_sim
[
'
file_path
'
].
iloc
[
1
:]),
'
d_s
'
:
cosines_style
}
df_s
=
pd
.
DataFrame
(
data
=
d
)
d
=
{
'
file_path
'
:
list
(
df_artist_sim
[
'
file_path
'
].
iloc
[
1
:]),
'
d_a
'
:
cosines_artist
}
df_a
=
pd
.
DataFrame
(
data
=
d
)
df_u
=
pd
.
merge
(
df_g
,
df_s
,
on
=
'
file_path
'
,
how
=
'
outer
'
)
df_u
=
pd
.
merge
(
df_u
,
df_a
,
on
=
'
file_path
'
,
how
=
'
outer
'
)
```
%% Cell type:code id:92bb15c8 tags:
```
python
def
func
(
x
):
missing_indices
=
x
.
index
[
x
.
isnull
()]
for
index
in
missing_indices
:
if
x
.
name
==
'
d_g
'
:
img_sim
=
df_u
.
loc
[
index
,
'
file_path
'
]
eucl_distance
=
np
.
linalg
.
norm
(
img_req_g
-
df_genre
[
df_genre
[
'
file_path
'
]
==
img_sim
].
iloc
[
0
,
2
:])
x
.
loc
[
index
]
=
eucl_distance
elif
x
.
name
==
'
d_s
'
:
img_sim
=
df_u
.
loc
[
index
,
'
file_path
'
]
eucl_distance
=
np
.
linalg
.
norm
(
img_req_s
-
df_style
[
df_style
[
'
file_path
'
]
==
img_sim
].
iloc
[
0
,
2
:])
x
.
loc
[
index
]
=
eucl_distance
elif
x
.
name
==
'
d_a
'
:
img_sim
=
df_u
.
loc
[
index
,
'
file_path
'
]
eucl_distance
=
np
.
linalg
.
norm
(
img_req_a
-
df_artist
[
df_artist
[
'
file_path
'
]
==
img_sim
].
iloc
[
0
,
2
:])
x
.
loc
[
index
]
=
eucl_distance
return
x
df_u
=
df_u
.
apply
(
func
,
axis
=
0
)
```
%% Cell type:code id:7e464ba0 tags:
```
python
df_u
[
'
d_g
'
]
=
(
df_u
[
'
d_g
'
]
-
mean_genre
)
/
std_genre
df_u
[
'
d_s
'
]
=
(
df_u
[
'
d_s
'
]
-
mean_style
)
/
std_style
df_u
[
'
d_a
'
]
=
(
df_u
[
'
d_a
'
]
-
mean_artist
)
/
std_artist
```
%% Cell type:code id:5ea5a057 tags:
```
python
#combined recommendations with the same weights
df_u
[
'
combined
'
]
=
df_u
[
'
d_g
'
]
+
df_u
[
'
d_s
'
]
+
df_u
[
'
d_a
'
]
```
%% Cell type:code id:b9e0d83f tags:
```
python
paintings
=
list
(
df_u
.
sort_values
(
by
=
[
'
combined
'
])[
'
file_path
'
])[:
top_K
]
dist
=
list
(
df_u
.
sort_values
(
by
=
[
'
combined
'
])[
'
combined
'
])[:
top_K
]
show_similar
(
img_str
,
paintings
,
top_K
,
dist
)
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment