Skip to content
Snippets Groups Projects
Commit 3958adba authored by George Marchment's avatar George Marchment
Browse files

Add results

parent 0ee1f5d9
No related merge requests found
# Github-Crawler
>This branch corresponds to the results of the crawler for the study linked with [BioFlow-Insight](https://gitlab.liris.cnrs.fr/sharefair/bioflow-insight)
[![MIT licensed](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
......
%% Cell type:markdown id: tags:
# Analysis of the crawler results
%% Cell type:code id: tags:
``` python
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Configure seaborn with the 'darkgrid' style and the "Accent" colour palette.
sns.set(style = 'darkgrid', palette = "Accent")
# 'taille' is French for 'size' -- presumably a (width, height) figure size;
# NOTE(review): it is not referenced in the cells visible here, confirm usage.
taille = (9, 5)
```
%% Output
/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.1
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
%% Cell type:code id: tags:
``` python
import json
import pandas as pd
# Load the crawler output. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale.
# NOTE(review): the name `dict` shadows the builtin; it is kept because the
# following cells refer to it under that name.
with open('wf_crawl_nextflow.json', encoding='utf-8') as json_file:
    dict = json.load(json_file)
# Drop the "last_date" entry so that only per-workflow entries remain
# (presumably crawl bookkeeping rather than a workflow -- confirm).
# Assigning to `_` keeps the notebook from echoing the popped value.
_ = dict.pop("last_date")
```
%% Cell type:code id: tags:
``` python
# Report how many entries the crawler mapping holds.
workflow_count = len(dict)
print(f"The crawler found {workflow_count} Nextflow workflows with at least Nextflow file at the root.")
```
%% Output
The crawler found 752 Nextflow workflows with at least Nextflow file at the root.
%% Cell type:code id: tags:
``` python
# Collect the per-workflow records (the mapping's values, in insertion order)
# and turn them into one DataFrame row per workflow.
tab = list(dict.values())
df = pd.DataFrame(tab)
```
%% Cell type:code id: tags:
``` python
# Column labels: a constant marker column used for counting, and the parsed
# first-commit date used for binning and grouping.
name = "Nextflow workflows"
group_by = 'First commit'

df[name] = 1
df[group_by] = pd.to_datetime(df["first_commit_date"], format='%Y-%m-%d')

# One figure with two y-axes sharing the same x-axis: histogram counts on the
# left axis, running total on the right axis.
fig, ax = plt.subplots()
ax2 = ax.twinx()

# Histogram of first-commit dates.
df[group_by].hist(ax=ax, bins=84)

# Number of workflows per first-commit date, accumulated over time.
cumulative = df[[name, group_by]].groupby(group_by).count().cumsum()
cumulative.plot(linewidth=2, color='orangered', ax=ax2, grid=False)

ax.set_ylabel('Counts')
ax2.set_ylabel('Cumulative sum')
ax.set_xlabel('Date')
plt.title("Evolution of the yearly and cumulative number of Nextflow workflows available on GitHub")
plt.show()
```
%% Output
%% Cell type:markdown id: tags:
We only want to use open workflows, so we keep only the workflows that have an open license, that is, one of the following:
* Apache License 2.0
* GNU General Public License v3.0
* MIT License
%% Cell type:code id: tags:
``` python
# Licenses considered open for this study.
open_licenses = ["Apache License 2.0", "GNU General Public License v3.0", "MIT License"]
# Count the rows whose license is one of the accepted ones.
nb_open = int(df["license"].isin(open_licenses).sum())
print(f"Only keeping open workflows that leaves us with {nb_open} workflows.")
```
%% Output
Only keeping open workflows that leaves us with 677 workflows.
%% Cell type:code id: tags:
``` python
```
import json
import os
import subprocess

# Load the crawler output; the encoding is explicit so decoding does not
# depend on the platform's default locale.
with open("./wf_crawl_nextflow.json", encoding="utf-8") as json_file:
    crawler = json.load(json_file)
# Drop the "last_date" entry so that only per-project entries are iterated.
crawler.pop("last_date")

#These are the licenses that we are keeping
LICENSE = ["Apache License 2.0", "GNU General Public License v3.0", "MIT License"]
# Directory under which every repository is cloned, as '<root>/<project>'.
CORPUS_ROOT = "../../../Workflow-Corpus-Open-License"

#Code for downloading the files
downloaded = 0
for index, project in enumerate(crawler, start=1):
    print(f'* {index}/{len(crawler)}')
    if crawler[project]["license"] in LICENSE:
        print(f"Downloading...'{project}'")
        downloaded += 1
        destination = os.path.join(CORPUS_ROOT, project)
        # The original ran `mkdir -p` with no operand, which always fails.
        # Creating the destination's parent directory is presumably what was
        # intended -- NOTE(review): confirm.
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        # Pass an argument list instead of a shell string so that a crawled
        # project name can never be interpreted as shell syntax.
        result = subprocess.run(
            ["git", "clone", f"https://github.com/{project}.git", destination],
            check=False,  # best effort: one failed clone must not stop the run
        )
        if result.returncode != 0:
            print(f"git clone failed for '{project}' (exit code {result.returncode})")
print('\n\n')
# Number of projects whose license matched (clone attempts, not successes).
print(downloaded)
\ No newline at end of file
source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment