"""Clone the open-license workflow repositories listed in a crawler result."""

import json
import os
import subprocess

# These are the licenses that we are keeping.
LICENSE = ("Apache License 2.0", "GNU General Public License v3.0", "MIT License")


def _clone(project, destination):
    """Clone the GitHub repository *project* into *destination*.

    Returns True when ``git clone`` exits with status 0, False otherwise.
    Failures are reported on stdout and never raised, so that one broken
    repository does not abort the whole download run.
    """
    url = f"https://github.com/{project}.git"
    try:
        # Create the parent (owner) directory up front; the original
        # ``mkdir -p`` call had no operand and therefore never did this.
        parent = os.path.dirname(destination)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Argument list instead of a shell string: project names and paths are
        # passed verbatim, with no word splitting or shell interpretation.
        # ``--`` stops git from reading a name starting with "-" as an option.
        result = subprocess.run(
            ["git", "clone", "--", url, destination], check=False
        )
    except OSError as err:
        print(f"Could not clone '{project}': {err}")
        return False
    if result.returncode != 0:
        print(f"git clone failed for '{project}' (exit code {result.returncode})")
        return False
    return True


def download(path, crawler, *, licenses=None):
    """Clone every project of *crawler* that has an accepted license.

    Args:
        path: Directory under which the repositories are cloned; each project
            ends up in ``<path>/<project>``.
        crawler: Mapping from a GitHub project name (``"owner/repo"``) to a
            dict holding at least a ``"license"`` entry. Entries that are not
            dicts or have no ``"license"`` entry are skipped.
        licenses: Optional iterable of accepted license names. Defaults to
            ``LICENSE``.

    Returns:
        The number of repositories that were cloned successfully. The same
        number is printed at the end of the run.

    Example (mirrors the former script-level usage of this module)::

        with open("./wf_crawl_nextflow.json") as json_file:
            crawler = json.load(json_file)
        crawler.pop("last_date")
        download("../../../Workflow-Corpus-Open-License", crawler)
    """
    accepted = tuple(LICENSE if licenses is None else licenses)
    total = len(crawler)
    downloaded = 0
    for index, (project, info) in enumerate(crawler.items(), start=1):
        print(f'* {index}/{total}')
        # Tolerate metadata entries (e.g. a "last_date" string) and records
        # without license information instead of crashing halfway through.
        license_name = info.get("license") if isinstance(info, dict) else None
        if license_name in accepted:
            print(f"Downloading...'{project}'")
            # Count only clones that actually succeeded.
            if _clone(project, os.path.join(path, project)):
                downloaded += 1
        print('\n\n')
    print(downloaded)
    return downloaded