import pyarrow.parquet as pq
import matplotlib.pyplot as plt
from matplotlib_venn import venn2


def load_lib(path):
    """Read the Parquet file at *path* and return it as a pandas DataFrame."""
    # pyarrow reads the file into an Arrow table; convert it in one step.
    return pq.read_table(path).to_pandas()

if __name__ == '__main__':
    # Load the two prediction tables that are being compared.
    df1 = load_lib('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
    df2 = load_lib('fasta/steigerwaltii variants/uniparc_proteome_UP000033499_2025_03_14.predicted.parquet')

    # Reduce each table to the distinct values of its 'Stripped.Sequence'
    # column so the diagram counts unique entries, not rows.
    set1 = set(df1['Stripped.Sequence'].to_list())
    set2 = set(df2['Stripped.Sequence'].to_list())

    # Draw the two-set Venn diagram (left-only / shared / right-only counts).
    venn2((set1, set2), ('Group1', 'Group2'))

    # Save BEFORE showing: with interactive backends plt.show() blocks until
    # the window is closed and the figure is then torn down, so a savefig()
    # issued afterwards writes an empty image.
    plt.savefig('fasta_similarity_diann.png')
    plt.show()