Skip to content
Snippets Groups Projects
Commit 53754093 authored by George Marchment's avatar George Marchment
Browse files

Add workflowhub stat

parent 3958adba
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Analysis of the crawler results
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
sns.set(style = 'darkgrid', palette = "Accent") sns.set(style = 'darkgrid', palette = "Accent")
taille = (9, 5) taille = (9, 5)
``` ```
%% Output %% Output
/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.1 /usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.1
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}" warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import json import json
import pandas as pd import pandas as pd
# Load the crawler results from disk. The file is opened explicitly as UTF-8
# instead of relying on the platform-dependent default encoding of open().
# NOTE: the name `dict` shadows the builtin; it is kept because the later
# cells of this notebook refer to the loaded mapping by that name.
with open('wf_crawl_nextflow.json', encoding='utf-8') as json_file:
    dict = json.load(json_file)

# Drop the "last_date" entry so that the remaining entries can be counted and
# tabulated as workflows. The throwaway assignment presumably keeps the
# notebook from echoing the popped value.
_ = dict.pop("last_date")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Report how many workflow entries the crawl produced.
# The message previously read "at least Nextflow file"; the missing "one" is restored.
print(f"The crawler found {len(dict)} Nextflow workflows with at least one Nextflow file at the root.")
``` ```
%% Output %% Output
The crawler found 752 Nextflow workflows with at least Nextflow file at the root. The crawler found 752 Nextflow workflows with at least Nextflow file at the root.
%% Cell type:markdown id: tags:
At the time of writing there are 52 Nextflow workflows integrated on WorkflowHub.
%% Cell type:code id: tags:
``` python
# Number of Nextflow workflows integrated on WorkflowHub at the time of writing.
nb_wfhub = 52

# Percentage of crawled workflows that cannot be on WorkflowHub. This is a
# lower bound: it is reached only if every WorkflowHub workflow is also among
# the crawled ones.
not_integrated_pct = (len(dict) - nb_wfhub) / len(dict) * 100
print(f"Hence, at least {not_integrated_pct:.1f}% of Nextflow workflows found on Github are not integrated into WorkflowHub")
```
%% Output
Hence, at least 93.1% of Nextflow workflows found on Github are not integrated into WorkflowHub
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Collect the per-workflow records (the values of the crawl mapping, in
# insertion order) and build a DataFrame with one row per record.
tab = list(dict.values())
df = pd.DataFrame(tab)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Histogram of first-commit dates (left y-axis) overlaid with the running
# total of workflows (right y-axis).
name = "Nextflow workflows"
group_by = 'First commit'

# Constant column: counting it per date gives the number of workflows whose
# first commit falls on that date.
df[name] = 1
df[group_by] = pd.to_datetime(df["first_commit_date"], format='%Y-%m-%d')

fig = plt.figure()
ax = fig.add_subplot(111)
ax2 = ax.twinx()  # second y-axis sharing the x-axis of `ax`

df[group_by].hist(ax=ax, bins=84)
(
    df[[name, group_by]]
    .groupby(group_by)
    .count()
    .cumsum()
    .plot(linewidth=2, color='orangered', ax=ax2, grid=False)
)

ax.set_xlabel('Date')
ax.set_ylabel('Counts')
ax2.set_ylabel('Cumulative sum')
plt.title("Evolution of the yearly and cumulative number of Nextflow workflows available on GitHub")
plt.show()
``` ```
%% Output %% Output
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
We only want to use open workflows, so we keep only the workflows that have an open license. We retain those released under one of the following licenses:
* Apache License 2.0 * Apache License 2.0
* GNU General Public License v3.0 * GNU General Public License v3.0
* MIT License * MIT License
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Count the workflows whose license is one of the three licenses this
# analysis treats as open.
nb_open = len(df[df["license"].isin([
    "Apache License 2.0",
    "GNU General Public License v3.0",
    "MIT License",
])])
print(f"Only keeping open workflows that leaves us with {nb_open} workflows.")
``` ```
%% Output %% Output
Only keeping open workflows that leaves us with 677 workflows. Only keeping open workflows that leaves us with 677 workflows.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment