Skip to content
Snippets Groups Projects
Commit 3958adba authored by George Marchment's avatar George Marchment
Browse files

Add results

parent 0ee1f5d9
No related branches found
No related tags found
No related merge requests found
# Github-Crawler # Github-Crawler
>This branch corresponds to the results of the crawler for the study linked with [BioFlow-Insight](https://gitlab.liris.cnrs.fr/sharefair/bioflow-insight)
[![MIT licensed](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) [![MIT licensed](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
......
%% Cell type:markdown id: tags:
# Analysis of the crawler results
%% Cell type:code id: tags:
``` python
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Notebook-wide plotting defaults: seaborn "darkgrid" style with the "Accent" palette.
sns.set(palette="Accent", style="darkgrid")
# "taille" is French for "size"; presumably a (width, height) figure size.
# It is not referenced by the cells visible here - TODO confirm it is still needed.
taille = 9, 5
```
%% Output
/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.1
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
%% Cell type:code id: tags:
``` python
import json
import pandas as pd
# Load the crawler results. The encoding is given explicitly so that decoding
# does not depend on the platform's default text encoding.
# NOTE(review): the name `dict` shadows the builtin; it is kept because the
# following cells refer to the loaded data by this name.
with open('wf_crawl_nextflow.json', encoding='utf-8') as json_file:
    dict = json.load(json_file)
# Remove the "last_date" key so that only the remaining entries are counted and
# tabulated below; the default avoids a KeyError when the key is absent.
_ = dict.pop("last_date", None)
```
%% Cell type:code id: tags:
``` python
# Report how many entries were loaded from the crawler file.
print(f"The crawler found {len(dict)} Nextflow workflows with at least one Nextflow file at the root.")
```
%% Output
The crawler found 752 Nextflow workflows with at least Nextflow file at the root.
%% Cell type:code id: tags:
``` python
# Collect the value stored for each entry of the loaded data (`dict` here is
# the mapping loaded from the JSON file, not the builtin type) and build one
# DataFrame from that list.
tab = list(dict.values())
df = pd.DataFrame(tab)
```
%% Cell type:code id: tags:
``` python
# Histogram of first-commit dates (left axis) overlaid with the running total
# of workflows (right axis).
name = "Nextflow workflows"
group_by = 'First commit'

# Constant column so that a per-date count can be taken after grouping.
df[name] = 1
df[group_by] = pd.to_datetime(df["first_commit_date"], format='%Y-%m-%d')

fig, ax = plt.subplots()
ax2 = ax.twinx()  # second y-axis drawn over the same x-axis

df[group_by].hist(ax=ax, bins=84)

# Number of workflows per first-commit date, then accumulated over time.
per_date = df[[name, group_by]].groupby(group_by).count()
per_date.cumsum().plot(linewidth=2, color='orangered', ax=ax2, grid=False)

ax.set_xlabel('Date')
ax.set_ylabel('Counts')
ax2.set_ylabel('Cumulative sum')
plt.title("Evolution of the yearly and cumulative number of Nextflow workflows available on GitHub")
plt.show()
```
%% Output
%% Cell type:markdown id: tags:
We only want to work with open workflows, so we keep only the workflows that are released under an open license. The licenses we keep are:
* Apache License 2.0
* GNU General Public License v3.0
* MIT License
%% Cell type:code id: tags:
``` python
# Count the workflows whose "license" value is one of the three accepted licenses.
open_licenses = ["Apache License 2.0", "GNU General Public License v3.0", "MIT License"]
nb_open = len(df[df["license"].isin(open_licenses)])
print(f"Only keeping open workflows that leaves us with {nb_open} workflows.")
```
%% Output
Only keeping open workflows that leaves us with 677 workflows.
%% Cell type:code id: tags:
``` python
```
"""Clone every crawled project whose license is in the accepted list.

Reads ``./wf_crawl_nextflow.json`` and, for each project whose ``"license"``
value is listed in ``LICENSE``, runs ``git clone`` from GitHub into
``../../../Workflow-Corpus-Open-License/<project>``. Progress is printed for
every project and the number of projects selected for download is printed at
the end.
"""
import json
import os
import subprocess

with open("./wf_crawl_nextflow.json", encoding="utf-8") as json_file:
    crawler = json.load(json_file)
# "last_date" is removed before iterating so it is not handled as a project;
# the default avoids a KeyError when the key is absent.
crawler.pop("last_date", None)

# These are the licenses that we are keeping
LICENSE = ["Apache License 2.0", "GNU General Public License v3.0", "MIT License"]
# Directory under which each selected project is cloned.
DESTINATION_ROOT = "../../../Workflow-Corpus-Open-License"

# Code for downloading the files
downloaded = 0  # number of projects selected for download (clone attempted)
for index, project in enumerate(crawler, start=1):
    print(f'* {index}/{len(crawler)}')
    if crawler[project]["license"] in LICENSE:
        print(f"Downloading...'{project}'")
        downloaded += 1
        destination = os.path.join(DESTINATION_ROOT, project)
        # Make sure the parent directory of the clone target exists.
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        # Argument list (no shell): the project name is never interpreted by a
        # shell, so unusual characters in it cannot alter the command.
        result = subprocess.run(
            ["git", "clone", f"https://github.com/{project}.git", destination],
            check=False,
        )
        if result.returncode != 0:
            # Keep going with the remaining projects, but do not hide the failure.
            print(f"git clone failed for '{project}' (exit code {result.returncode})")
print('\n\n')
print(downloaded)
\ No newline at end of file
source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment