import json
import os
import subprocess

# Example of how the ``crawler`` argument was previously prepared:
#with open("./wf_crawl_nextflow.json") as json_file:
#    crawler = json.load(json_file)
#
#crawler.pop("last_date")

# These are the licenses that we are keeping
LICENSE = (
    "Apache License 2.0",
    "GNU General Public License v3.0",
    "MIT License",
)


def download(path, crawler, *, licenses=None):
    """Clone every acceptably licensed project of ``crawler`` below ``path``.

    Args:
        path: Directory that receives the clones; each project is cloned
            into ``<path>/<project>``.
        crawler: Mapping from a project name (the part of the URL between
            ``https://github.com/`` and ``.git``) to a dict of metadata.
            Only the ``"license"`` value is read. Entries that are not
            dicts, or that have no ``"license"`` key, are skipped instead
            of raising.
        licenses: Optional collection of accepted license names. When
            ``None`` (the default) the module-level ``LICENSE`` is used.

    Returns:
        The number of projects whose license was accepted, i.e. for which
        a clone was attempted. The same number is printed at the end.
    """
    accepted = LICENSE if licenses is None else licenses
    total = len(crawler)
    downloaded = 0

    # Code for downloading the files
    for index, project in enumerate(crawler, start=1):
        print(f'* {index}/{total}')
        info = crawler[project]
        # Tolerate bookkeeping entries (e.g. a date string) and projects
        # without license information: they are simply not downloaded.
        project_license = info.get("license") if isinstance(info, dict) else None
        if project_license in accepted:
            print(f"Downloading...'{project}'")
            downloaded += 1
            destination = os.path.join(path, project)
            parent = os.path.dirname(destination)
            if parent:
                os.makedirs(parent, exist_ok=True)
            try:
                # An argument list (no shell) keeps unusual project names or
                # paths from being interpreted by a shell; "--" stops git
                # from parsing the following values as options.
                subprocess.run(
                    [
                        "git",
                        "clone",
                        "--",
                        f"https://github.com/{project}.git",
                        destination,
                    ],
                    check=False,
                )
            except OSError as error:
                # Best effort: a missing git executable must not abort the
                # remaining downloads.
                print(f"Could not run git for '{project}': {error}")
        print('\n\n')
    print(downloaded)
    return downloaded