Skip to content
Snippets Groups Projects
ro_crate.py 6.89 KiB
import json
import glob
import os
import re

from . import constant

#Need to add these things here
# self.datePublished = datePublished
# self.description = description
# self.license = license
# self.creativeWorkStatus = creativeWorkStatus
# self.authors = authors
# self.version = version
# self.keywords = keywords
# self.producer = producer
# self.publisher = publisher



class RO_Crate:
    def __init__(self, workflow):
        self.workflow = workflow
        self.directory = '/'.join(workflow.get_file_address().split('/')[:-1])
        self.files = []
        self.dico = {}
        self.dico["temp_directory"] = self.directory

    def get_files(self):
        self.files = glob.glob(f'{self.directory}/**/*.*', recursive=True)
        tab_files = []
        for file in self.files:
            tab_files.append({"@id":file[len(self.directory)+1:]})
        return tab_files

    def initialise_dico(self):
        self.dico["@context"] = "https://w3id.org/ro/crate/1.1/context"
        self.dico["@graph"] = []
        #GENERAL
        general = {}
        #general["@id"] = f"ro-crate-metadata-{self.workflow.get_name()}.json"
        general["@id"] = f"ro-crate-metadata.json"
        general["@type"] = "CreativeWork"
        general["about"] = {"@id":"./"}
        general["conformsTo"] = [{"@id":"https://w3id.org/ro/crate/1.1"}
                                 #, {"@id":"https://w3id.org/workflowhub/workflow-ro-crate/1.0"}#This description does not conform 
                                 ]
        self.dico["@graph"].append(general)
        #ROOT
        root = {}
        root["@id"] = "./"
        root["@type"] = "Dataset"
        root["name"] = self.workflow.get_name()
        root["datePublished"] = self.workflow.get_datePublished()
        root["description"] = self.workflow.get_description()
        root["mainEntity"] = {"@id": self.workflow.get_main_file()}
                              #, "@type":["File", "SoftwareSourceCode"]} #We do not consider a File as a "ComputationalWorkflow" since multiple (sub)workflows can be defined in a same file
        root["license"] = {"@id":self.workflow.get_license()}
        authors = self.workflow.get_authors()
        tab_authors = []
        for author in authors:
            try:
                #tab_authors.append({"@id":author["@id"], "email":author["email"]})
                tab_authors.append({"@id":f'#{"_".join(author["@id"].split())}', "@name":author["@id"],"email":author["email"]})
            except:
                #tab_authors.append({"@id":author["@id"]})
                tab_authors.append({"@id":f'#{"_".join(author["@id"].split())}', "@name":author["@id"]})
        root["author"] = tab_authors
        root["maintainer"] = tab_authors #Right now i'm assuming that all the authors are maintainers
        files = self.get_files()
        tab_files = []
        for file in files:
            tab_files.append({"@id":file["@id"]})
        root["hasPart"] = tab_files
        root["publisher"] = {"@id":self.workflow.get_publisher()}
        #subjectOf TODO
        root["subjectOf"] = None
        root["creativeWorkStatus"] = self.workflow.get_creativeWorkStatus()
        root["@version"] = self.workflow.get_version()
        root["keywords"] = self.workflow.get_keywords()
        root["producer"] = self.workflow.get_producer()
        self.dico["@graph"].append(root)

    #TODO 
    def get_programming_language(self, file):
        if(file[-3:]==".nf"):
            return "https://w3id.org/workflowhub/workflow-ro-crate#nextflow"
        return None
    
    def get_contentSize(self, file):
        file_stats = os.stat(file)
        return file_stats.st_size/1e3
    
    def fill_log_file(self, file, reverse = True):
        info = ""
        current_directory = os.getcwd()
        os.chdir("/".join(self.workflow.nextflow_file.get_file_address().split("/")[:-1]))
        try:           
            os.system(f"git log {'--reverse'*reverse} \"{file}\" > temp_{id(self)}.txt")
            with open(f'temp_{id(self)}.txt') as f:
                info = f.read()
            os.system(f"rm temp_{id(self)}.txt")
        except:
            None
        os.chdir(current_directory)
        return info

    def get_dateCreated(self, file):
        info = self.fill_log_file(file, reverse = True)
        for match in re.finditer(r"Date: +\w+ +(\w+) +(\d+) +\d+:\d+:\d+ +(\d+)", info):
            month = constant.month_mapping[match.group(1)]
            day = match.group(2)
            year = match.group(3)
            return f"{year}-{month}-{day}"
        return None
    
 
    def get_dateModified(self, file):
        info = self.fill_log_file(file, reverse = False)
        for match in re.finditer(r"Date: +\w+ +(\w+) +(\d+) +\d+:\d+:\d+ +(\d+)", info):
            month = constant.month_mapping[match.group(1)]
            day = match.group(2)
            year = match.group(3)
            return f"{year}-{month}-{day}"
        return None
    
    #TODO -> update this -> it's incomplet
    def get_url(self, file):
        if(self.workflow.dico!={}):
            return f"https://github.com/{self.workflow.get_file_address()}/blob/main/{file}"
        return None
    

    def get_creators(self, file):
        info = self.fill_log_file(file, reverse = True)
        for match in re.finditer(r"Author: ([^>]+)<([^>]+)>",info):
            return [{"@id": match.group(1).strip()}]
        return []


    def get_types(self, file):
        types = ["File"]
        if(file[-3:]==".nf"):
            types.append("SoftwareSourceCode")
        return types
        

    def initialise_file(self, file):
        key = file[len(self.directory)+1:]
        dico = {}
        dico["@id"] = key
        dico["name"] = key
        dico["@type"] = self.get_types(file)
        dico["programmingLanguage"] = {"@id":self.get_programming_language(file)}
        dico["contentSize"] = self.get_contentSize(file)
        dico["dateCreated"] = self.get_dateCreated(key)
        dico["dateModified"] = self.get_dateModified(key)
        dico["url"] = self.get_url(key)
        creators = self.get_creators(key)
        dico["creator"] = []
        for creator in creators:
            dico["creator"].append({"@id": creator["@id"]})
        dico["isPartOf"] = []
        dico["hasPart"] = []
        self.dico["@graph"].append(dico)    

    def fill_from_workflow(self):
        self.workflow.add_2_rocrate(self.dico)

    def initialise(self):
        self.initialise_dico()
        for file in self.files:
            self.initialise_file(file)
        self.fill_from_workflow()
        self.dico.pop("temp_directory")

        name = self.workflow.get_name()
        name = name.replace('github.com/', '')
        name = re.sub(r"^[ .]|[/<>:\"\\|?*]+|[ .]$", "-", name)

        #with open(f"{self.workflow.get_output_dir()}/ro-crate-metadata-{name}.json", 'w') as output_file :
        with open(f"{self.workflow.get_output_dir()}/ro-crate-metadata.json", 'w') as output_file :
            json.dump(self.dico, output_file, indent=2)