From 524cb6bfc14690bd4be9616894ee539e1cdb40a6 Mon Sep 17 00:00:00 2001 From: lschneider <leo.schneider@univ-lyon1.fr> Date: Thu, 25 Jan 2024 16:00:10 +0100 Subject: [PATCH] pyteomics module local --- pyteomics/__init__.py | 16 + pyteomics/_schema_defaults.py | 635 ++++++ pyteomics/achrom.py | 1326 +++++++++++ pyteomics/auxiliary/__init__.py | 35 + pyteomics/auxiliary/constants.py | 3297 +++++++++++++++++++++++++++ pyteomics/auxiliary/file_helpers.py | 1250 ++++++++++ pyteomics/auxiliary/math.py | 97 + pyteomics/auxiliary/patch.py | 0 pyteomics/auxiliary/structures.py | 504 ++++ pyteomics/auxiliary/target_decoy.py | 997 ++++++++ pyteomics/auxiliary/utils.py | 317 +++ pyteomics/electrochem.py | 499 ++++ pyteomics/fasta.py | 1072 +++++++++ pyteomics/mass/__init__.py | 6 + pyteomics/mass/mass.py | 1231 ++++++++++ pyteomics/mass/unimod.py | 798 +++++++ pyteomics/mgf.py | 830 +++++++ pyteomics/ms1.py | 492 ++++ pyteomics/ms2.py | 396 ++++ pyteomics/mzid.py | 453 ++++ pyteomics/mzml.py | 546 +++++ pyteomics/mzmlb.py | 618 +++++ pyteomics/mztab.py | 783 +++++++ pyteomics/mzxml.py | 328 +++ pyteomics/openms/__init__.py | 1 + pyteomics/openms/featurexml.py | 115 + pyteomics/openms/idxml.py | 430 ++++ pyteomics/openms/trafoxml.py | 82 + pyteomics/parser.py | 1148 ++++++++++ pyteomics/peff.py | 277 +++ pyteomics/pepxml.py | 573 +++++ pyteomics/proforma.py | 2372 +++++++++++++++++++ pyteomics/protxml.py | 309 +++ pyteomics/pylab_aux.py | 831 +++++++ pyteomics/tandem.py | 384 ++++ pyteomics/traml.py | 235 ++ pyteomics/usi.py | 527 +++++ pyteomics/version.py | 66 + pyteomics/xml.py | 1335 +++++++++++ 39 files changed, 25211 insertions(+) create mode 100644 pyteomics/__init__.py create mode 100644 pyteomics/_schema_defaults.py create mode 100644 pyteomics/achrom.py create mode 100644 pyteomics/auxiliary/__init__.py create mode 100644 pyteomics/auxiliary/constants.py create mode 100644 pyteomics/auxiliary/file_helpers.py create mode 100644 pyteomics/auxiliary/math.py create mode 100644 pyteomics/auxiliary/patch.py create mode 100644 pyteomics/auxiliary/structures.py create mode 100644 pyteomics/auxiliary/target_decoy.py create mode 100644 pyteomics/auxiliary/utils.py create mode 100644 pyteomics/electrochem.py create mode 100644 pyteomics/fasta.py create mode 100644 pyteomics/mass/__init__.py create mode 100644 pyteomics/mass/mass.py create mode 100644 pyteomics/mass/unimod.py create mode 100644 pyteomics/mgf.py create mode 100644 pyteomics/ms1.py create mode 100644 pyteomics/ms2.py create mode 100644 pyteomics/mzid.py create mode 100644 pyteomics/mzml.py create mode 100644 pyteomics/mzmlb.py create mode 100644 pyteomics/mztab.py create mode 100644 pyteomics/mzxml.py create mode 100644 pyteomics/openms/__init__.py create mode 100644 pyteomics/openms/featurexml.py create mode 100644 pyteomics/openms/idxml.py create mode 100644 pyteomics/openms/trafoxml.py create mode 100644 pyteomics/parser.py create mode 100644 pyteomics/peff.py create mode 100644 pyteomics/pepxml.py create mode 100644 pyteomics/proforma.py create mode 100644 pyteomics/protxml.py create mode 100644 pyteomics/pylab_aux.py create mode 100644 pyteomics/tandem.py create mode 100644 pyteomics/traml.py create mode 100644 pyteomics/usi.py create mode 100644 pyteomics/version.py create mode 100644 pyteomics/xml.py diff --git a/pyteomics/__init__.py b/pyteomics/__init__.py new file mode 100644 index 0000000..fd278bf --- /dev/null +++ b/pyteomics/__init__.py @@ -0,0 +1,16 @@ +""" +Copyright 2012 Anton Goloborodko, Lev Levitsky + +Licensed under the 
Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" +__import__('pkg_resources').declare_namespace(__name__) diff --git a/pyteomics/_schema_defaults.py b/pyteomics/_schema_defaults.py new file mode 100644 index 0000000..e31701c --- /dev/null +++ b/pyteomics/_schema_defaults.py @@ -0,0 +1,635 @@ +_protxml_schema_defaults = {'bools': set(), + 'charlists': set(), + 'floatlists': set(), + 'floats': {('ASAPRatio', 'heavy2light_ratio_mean'), + ('ASAPRatio', 'heavy2light_ratio_standard_dev'), + ('ASAPRatio', 'ratio_mean'), + ('ASAPRatio', 'ratio_standard_dev'), + ('ASAPRatio_pvalue', 'adj_ratio_mean'), + ('ASAPRatio_pvalue', 'adj_ratio_standard_dev'), + ('ASAPRatio_pvalue', 'decimal_pvalue'), + ('ASAPRatio_pvalue', 'heavy2light_adj_ratio_mean'), + ('ASAPRatio_pvalue', 'heavy2light_adj_ratio_standard_dev'), + ('ASAPRatio_pvalue', 'pvalue'), + ('ASAP_Peak', 'heavy2light_ratio_mean'), + ('ASAP_Peak', 'heavy2light_ratio_standard_dev'), + ('ASAP_Peak', 'ratio_mean'), + ('ASAP_Peak', 'ratio_standard_dev'), + ('ASAP_Peak', 'weight'), + ('ASAP_Seq', 'heavy2light_ratio_mean'), + ('ASAP_Seq', 'heavy2light_ratio_standard_dev'), + ('ASAP_Seq', 'ratio_mean'), + ('ASAP_Seq', 'ratio_standard_dev'), + ('ASAP_Seq', 'weight'), + ('ASAP_prot_analysis_summary', 'min_peptide_probability'), + ('ASAP_prot_analysis_summary', 'min_peptide_weight'), + ('ASAP_prot_analysis_summary', 'min_protein_probability'), + ('ASAP_pvalue_analysis_summary', 'background_fitting_error'), + ('ASAP_pvalue_analysis_summary', 'background_ratio_mean'), + ('ASAP_pvalue_analysis_summary', 'background_ratio_stdev'), + ('StPeterQuant', 'SIn'), + ('StPeterQuant', 'ng'), + ('StPeterQuant_peptide', 'spectralIndex'), + ('StPeter_analysis_summary', 'FDR'), + ('StPeter_analysis_summary', 'probability'), + ('StPeter_analysis_summary', 'sampleLoad'), + ('StPeter_analysis_summary', 'tolerance'), + ('XPress_analysis_summary', 'min_peptide_probability'), + ('XPress_analysis_summary', 'min_peptide_weight'), + ('XPress_analysis_summary', 'min_protein_probability'), + ('affected_channel', 'correction'), + ('decoy_analysis_summary', 'decoy_ratio'), + ('error_point', 'error'), + ('error_point', 'min_prob'), + ('fpkm_distribution', 'alt_pos_to_neg_ratio'), + ('fpkm_distribution', 'fpkm_lower_bound_excl'), + ('fpkm_distribution', 'fpkm_lower_bound_incl'), + ('fpkm_distribution', 'neg_freq'), + ('fpkm_distribution', 'pos_freq'), + ('fpkm_distribution', 'pos_to_neg_ratio'), + ('fragment_masses', 'mz'), + ('indistinguishable_peptide', 'calc_neutral_pep_mass'), + ('intensity', 'error'), + ('intensity', 'mz'), + ('intensity', 'ratio'), + ('libra_summary', 'mass_tolerance'), + ('libra_summary', 'min_pep_prob'), + ('libra_summary', 'min_pep_wt'), + ('libra_summary', 'min_prot_prob'), + ('ni_distribution', 'alt_pos_to_neg_ratio'), + ('ni_distribution', 'neg_freq'), + ('ni_distribution', 'ni_lower_bound_excl'), + ('ni_distribution', 'ni_lower_bound_incl'), + ('ni_distribution', 'pos_freq'), + ('ni_distribution', 'pos_to_neg_ratio'), + ('nsp_distribution', 'alt_pos_to_neg_ratio'), + ('nsp_distribution', 'neg_freq'), + 
('nsp_distribution', 'nsp_lower_bound_excl'), + ('nsp_distribution', 'nsp_lower_bound_incl'), + ('nsp_distribution', 'pos_freq'), + ('nsp_distribution', 'pos_to_neg_ratio'), + ('peptide', 'calc_neutral_pep_mass'), + ('peptide', 'exp_sibling_ion_bin'), + ('peptide', 'exp_sibling_ion_instances'), + ('peptide', 'exp_tot_instances'), + ('peptide', 'fpkm_adjusted_probability'), + ('peptide', 'initial_probability'), + ('peptide', 'max_fpkm'), + ('peptide', 'n_sibling_peptides'), + ('peptide', 'ni_adjusted_probability'), + ('peptide', 'nsp_adjusted_probability'), + ('peptide', 'weight'), + ('point', 'fdr_pp'), + ('point', 'fdr_pp_decoy'), + ('point', 'logratio'), + ('point', 'model_distr'), + ('point', 'num_corr_pp'), + ('point', 'num_corr_pp_decoy'), + ('point', 'obs_distr'), + ('point', 'pp_decoy_uncert'), + ('point', 'pp_uncert'), + ('point', 'prob_cutoff'), + ('protein', 'confidence'), + ('protein', 'percent_coverage'), + ('protein', 'probability'), + ('protein_group', 'probability'), + ('protein_summary_data_filter', 'false_positive_error_rate'), + ('protein_summary_data_filter', 'min_probability'), + ('protein_summary_data_filter', 'predicted_num_correct'), + ('protein_summary_data_filter', 'predicted_num_incorrect'), + ('protein_summary_data_filter', 'sensitivity'), + ('protein_summary_header', 'initial_min_peptide_prob'), + ('protein_summary_header', 'min_peptide_probability'), + ('protein_summary_header', 'min_peptide_weight'), + ('protein_summary_header', 'num_predicted_correct_prots'), + ('protein_summary_header', 'total_no_spectrum_ids')}, + 'intlists': set(), + 'ints': {('ASAPRatio', 'ratio_number_peptides'), + ('ASAP_Peak', 'datanum'), + ('ASAP_Seq', 'datanum'), + ('ASAP_pvalue_analysis_summary', 'asap_prot_id'), + ('ASAP_pvalue_analysis_summary', 'asapratio_id'), + ('StPeterQuant_peptide', 'charge'), + ('affected_channel', 'channel'), + ('analysis_result', 'id'), + ('analysis_summary', 'id'), + ('contributing_channel', 'channel'), + ('error_point', 'num_corr'), + ('error_point', 'num_incorr'), + ('fpkm_distribution', 'bin_no'), + ('fragment_masses', 'channel'), + ('intensity', 'channel'), + ('libra_result', 'number'), + ('libra_summary', 'centroiding_preference'), + ('libra_summary', 'normalization'), + ('libra_summary', 'output_type'), + ('ni_distribution', 'bin_no'), + ('nsp_distribution', 'bin_no'), + ('peptide', 'charge'), + ('peptide', 'fpkm_bin'), + ('peptide', 'n_enzymatic_termini'), + ('peptide', 'n_instances'), + ('peptide', 'n_sibling_peptides_bin'), + ('protein', 'n_indistinguishable_proteins'), + ('protein', 'total_number_distinct_peptides'), + ('protein', 'total_number_peptides'), + ('protein_summary_header', 'num_input_1_spectra'), + ('protein_summary_header', 'num_input_2_spectra'), + ('protein_summary_header', 'num_input_3_spectra'), + ('protein_summary_header', 'num_input_4_spectra'), + ('protein_summary_header', 'num_input_5_spectra')}, + 'lists': {'ASAP_Dta', + 'ASAP_Peak', + 'ASAP_Seq', + 'StPeterQuant_peptide', + 'affected_channel', + 'analysis_result', + 'analysis_summary', + 'contributing_channel', + 'error_point', + 'fpkm_distribution', + 'fpkm_information', + 'fragment_masses', + 'indistinguishable_peptide', + 'indistinguishable_protein', + 'intensity', + 'mod_aminoacid_mass', + 'modification_info', + 'ni_distribution', + 'ni_information', + 'nsp_distribution', + 'parameter', + 'peptide', + 'peptide_parent_protein', + 'point', + 'protein', + 'protein_group', + 'protein_summary_data_filter'}} + +_mzid_schema_defaults = {'bools': {('Enzyme', 'semiSpecific'), 
+ ('Enzymes', 'independent'), + ('PeptideEvidence', 'isDecoy'), + ('ProteinDetectionHypothesis', 'passThreshold'), + ('SearchModification', 'fixedMod'), + ('SpectrumIdentificationItem', 'passThreshold')}, + 'charlists': {('Modification', 'residues'), + ('SearchModification', 'residues')}, + 'floatlists': {('FragmentArray', 'values')}, + 'floats': {('Modification', 'avgMassDelta'), + ('Modification', 'monoisotopicMassDelta'), + ('Residue', 'mass'), + ('SearchModification', 'massDelta'), + ('SpectrumIdentificationItem', 'calculatedMassToCharge'), + ('SpectrumIdentificationItem', 'calculatedPI'), + ('SpectrumIdentificationItem', 'experimentalMassToCharge'), + ('SubstitutionModification', 'avgMassDelta'), + ('SubstitutionModification', 'monoisotopicMassDelta')}, + 'intlists': {('IonType', 'index'), ('MassTable', 'msLevel')}, + 'ints': {('BibliographicReference', 'year'), + ('DBSequence', 'length'), + ('Enzyme', 'missedCleavages'), + ('IonType', 'charge'), + ('Modification', 'location'), + ('PeptideEvidence', 'end'), + ('PeptideEvidence', 'start'), + ('SearchDatabase', 'numDatabaseSequences'), + ('SearchDatabase', 'numResidues'), + ('SpectrumIdentificationItem', 'chargeState'), + ('SpectrumIdentificationItem', 'rank'), + ('SpectrumIdentificationList', 'numSequencesSearched'), + ('SubstitutionModification', 'location')}, + 'lists': {'Affiliation', + 'AmbiguousResidue', + 'AnalysisSoftware', + 'BibliographicReference', + 'ContactRole', + 'DBSequence', + 'Enzyme', + 'Filter', + 'FragmentArray', + 'InputSpectra', + 'InputSpectrumIdentifications', + 'IonType', + 'MassTable', + 'Measure', + 'Modification', + 'Peptide', + 'PeptideEvidence', + 'PeptideEvidenceRef', + 'PeptideHypothesis', + 'ProteinAmbiguityGroup', + 'ProteinDetectionHypothesis', + 'Residue', + 'Sample', + 'SearchDatabase', + 'SearchDatabaseRef', + 'SearchModification', + 'SourceFile', + 'SpecificityRules', + 'SpectraData', + 'SpectrumIdentification', + 'SpectrumIdentificationItem', + 'SpectrumIdentificationItemRef', + 'SpectrumIdentificationList', + 'SpectrumIdentificationProtocol', + 'SpectrumIdentificationResult', + 'SubSample', + 'SubstitutionModification', + 'TranslationTable', + 'cv', + 'cvParam'}} + +_trafoxml_schema_defaults = {'bools': set(), + 'charlists': set(), + 'floatlists': set(), + 'floats': {('Pair', 'from'), ('Pair', 'to'), ('TrafoXML', 'version')}, + 'intlists': set(), + 'ints': {('Pairs', 'count')}, + 'lists': {'Pair', 'Param'}} + +_featurexml_schema_defaults = { + 'ints': {('PeptideHit', 'charge'), + # ('PeptideIdentification', 'spectrum_reference'), + ('SearchParameters', 'missed_cleavages'), + # ('UnassignedPeptideIdentification', 'spectrum_reference'), + ('featureList', 'count'), + ('quality', 'dim'), + ('position', 'dim'), + ('feature', 'charge'), + ('convexhull', 'nr'), + }, + 'floats': {('PeptideHit', 'score'), + ('PeptideIdentification', 'MZ'), + ('PeptideIdentification', 'RT'), + ('PeptideIdentification', 'significance_threshold'), + ('ProteinHit', 'coverage'), + ('ProteinHit', 'score'), + ('ProteinIdentification', 'significance_threshold'), + ('SearchParameters', 'peak_mass_tolerance'), + ('SearchParameters', 'precursor_peak_tolerance'), + ('UnassignedPeptideIdentification', 'MZ'), + ('UnassignedPeptideIdentification', 'RT'), + ('UnassignedPeptideIdentification', 'significance_threshold'), + ('featureMap', 'version'), + ('pt', 'x'), + ('pt', 'y'), + ('quality', 'quality'), + ('position', 'position'), + ('feature', 'overallquality'), + ('feature', 'intensity'), + }, + 'bools': {('PeptideIdentification', 
'higher_score_better'), + ('ProteinIdentification', 'higher_score_better'), + ('SearchParameters', 'peak_mass_tolerance_ppm'), + ('SearchParameters', 'precursor_peak_tolerance_ppm'), + ('UnassignedPeptideIdentification', 'higher_score_better')}, + 'intlists': set(), + 'floatlists': set(), + 'charlists': set(), + 'lists': {'FixedModification', + 'IdentificationRun', + 'PeptideHit', + 'PeptideIdentification', + 'ProteinHit', + 'ProteinIdentification', + 'SearchParameters', + 'UnassignedPeptideIdentification', + 'UserParam', + 'VariableModification', + 'convexhull', + 'dataProcessing', + 'feature', + 'hposition', + 'hullpoint', + 'param', + 'position', + 'processingAction', + 'pt', + 'quality'}} + +_tandem_schema_defaults = {'ints': { + ('group', 'z'), ('aa', 'at')} | {('domain', k) for k in [ + 'missed_cleavages', 'start', 'end', 'y_ions', 'b_ions', + 'a_ions', 'x_ions', 'c_ions', 'z_ions']}, + + 'floats': {('group', k) for k in [ + 'fI', 'sumI', 'maxI', 'mh', 'expect']} | { + ('domain', k) for k in [ + 'expect', 'hyperscore', 'b_score', 'y_score', + 'a_score', 'x_score', 'c_score', 'z_score', + 'nextscore', 'delta', 'mh']} | { + ('protein', 'expect'), ('protein', 'sumI'), + ('aa', 'modified')}, + + 'bools': set(), + 'lists': {'group', 'trace', 'attribute', 'protein', 'aa', 'note'}, + 'floatlists': {('values', 'values')}, + 'intlists': set(), 'charlists': set(), 'duration': {('group', 'rt')}} + +_mzxml_schema_defaults = {'bools': {('dataProcessing', 'centroided'), + ('dataProcessing', 'chargeDeconvoluted'), + ('dataProcessing', 'deisotoped'), + ('dataProcessing', 'spotIntegration'), + ('maldi', 'collisionGas'), + ('scan', 'centroided'), + ('scan', 'chargeDeconvoluted'), + ('scan', 'deisotoped')}, + 'charlists': set(), + 'floatlists': set(), + 'floats': {('dataProcessing', 'intensityCutoff'), + ('precursorMz', 'precursorIntensity'), + ('precursorMz', 'windowWideness'), + ('precursorMz', 'precursorMz'), + ('scan', 'basePeakIntensity'), + ('scan', 'basePeakMz'), + ('scan', 'cidGasPressure'), + ('scan', 'collisionEnergy'), + ('scan', 'compensationVoltage'), + ('scan', 'endMz'), + ('scan', 'highMz'), + ('scan', 'ionisationEnergy'), + ('scan', 'lowMz'), + ('scan', 'startMz'), + ('scan', 'totIonCurrent')}, + 'duration': {("scan", "retentionTime") + }, + 'intlists': set(), + 'ints': {('msInstrument', 'msInstrumentID'), + ('peaks', 'compressedLen'), + ('precursorMz', 'precursorCharge'), + ('robot', 'deadVolume'), + ('scan', 'msInstrumentID'), + ('scan', 'peaksCount'), + ('scanOrigin', 'num'), + ('scan', 'msLevel')}, + 'lists': {'dataProcessing', + 'msInstrument', + 'parentFile', + 'peaks', + 'plate', + 'precursorMz', + 'scanOrigin', + 'spot'}} + +_mzml_schema_defaults = {'ints': { + ('spectrum', 'index'), + ('instrumentConfigurationList', 'count'), + ('binaryDataArray', 'encodedLength'), + ('cvList', 'count'), + ('binaryDataArray', 'arrayLength'), + ('scanWindowList', 'count'), + ('componentList', 'count'), + ('sourceFileList', 'count'), + ('productList', 'count'), + ('referenceableParamGroupList', 'count'), + ('scanList', 'count'), + ('spectrum', 'defaultArrayLength'), + ('dataProcessingList', 'count'), + ('sourceFileRefList', 'count'), + ('scanSettingsList', 'count'), + ('selectedIonList', 'count'), + ('chromatogram', 'defaultArrayLength'), + ('precursorList', 'count'), + ('chromatogram', 'index'), + ('processingMethod', 'order'), + ('targetList', 'count'), + ('sampleList', 'count'), + ('softwareList', 'count'), + ('binaryDataArrayList', 'count'), + ('spectrumList', 'count'), + ('chromatogramList', 
'count'), + ('selectedIon', 'charge state')}, + 'floats': {}, + 'bools': {}, + 'lists': {'scan', 'spectrum', 'sample', 'cv', 'dataProcessing', + 'cvParam', 'source', 'userParam', 'detector', 'product', + 'referenceableParamGroupRef', 'selectedIon', 'sourceFileRef', + 'binaryDataArray', 'analyzer', 'scanSettings', + 'instrumentConfiguration', 'chromatogram', 'target', + 'processingMethod', 'precursor', 'sourceFile', + 'referenceableParamGroup', 'contact', 'scanWindow', 'software'}, + 'intlists': {}, + 'floatlists': {}, + 'charlists': {}} + +_pepxml_schema_defaults = {'ints': + {('xpressratio_summary', 'xpress_light'), + ('distribution_point', 'obs_5_distr'), + ('distribution_point', 'obs_2_distr'), + ('enzymatic_search_constraint', 'max_num_internal_cleavages'), + ('asapratio_lc_heavypeak', 'right_valley'), + ('libra_summary', 'output_type'), + ('distribution_point', 'obs_7_distr'), + ('spectrum_query', 'index'), + ('data_filter', 'number'), + ('roc_data_point', 'num_incorr'), + ('search_hit', 'num_tol_term'), + ('search_hit', 'num_missed_cleavages'), + ('asapratio_lc_lightpeak', 'right_valley'), + ('libra_summary', 'normalization'), + ('specificity', 'min_spacing'), + ('database_refresh_timestamp', 'min_num_enz_term'), + ('enzymatic_search_constraint', 'min_number_termini'), + ('xpressratio_result', 'light_lastscan'), + ('distribution_point', 'obs_3_distr'), + ('spectrum_query', 'end_scan'), + ('analysis_result', 'id'), + ('search_database', 'size_in_db_entries'), + ('search_hit', 'hit_rank'), + ('alternative_protein', 'num_tol_term'), + ('search_hit', 'num_tot_proteins'), + ('asapratio_summary', 'elution'), + ('search_hit', 'tot_num_ions'), + ('error_point', 'num_incorr'), + ('mixture_model', 'precursor_ion_charge'), + ('roc_data_point', 'num_corr'), + ('search_hit', 'num_matched_ions'), + ('dataset_derivation', 'generation_no'), + ('xpressratio_result', 'heavy_firstscan'), + ('xpressratio_result', 'heavy_lastscan'), + ('error_point', 'num_corr'), + ('spectrum_query', 'assumed_charge'), + ('analysis_timestamp', 'id'), + ('xpressratio_result', 'light_firstscan'), + ('distribution_point', 'obs_4_distr'), + ('asapratio_lc_heavypeak', 'left_valley'), + ('fragment_masses', 'channel'), + ('distribution_point', 'obs_6_distr'), + ('affected_channel', 'channel'), + ('search_result', 'search_id'), + ('contributing_channel', 'channel'), + ('asapratio_lc_lightpeak', 'left_valley'), + ('asapratio_peptide_data', 'area_flag'), + ('search_database', 'size_of_residues'), + ('asapratio_peptide_data', 'cidIndex'), + ('mixture_model', 'num_iterations'), + ('mod_aminoacid_mass', 'position'), + ('spectrum_query', 'start_scan'), + ('asapratio_summary', 'area_flag'), + ('mixture_model', 'tot_num_spectra'), + ('search_summary', 'search_id'), + ('xpressratio_timestamp', 'xpress_light'), + ('distribution_point', 'obs_1_distr'), + ('intensity', 'channel'), + ('asapratio_contribution', 'charge'), + ('libra_summary', 'centroiding_preference')}, + 'floats': + {('asapratio_contribution', 'error'), + ('asapratio_lc_heavypeak', 'area_error'), + ('modification_info', 'mod_nterm_mass'), + ('distribution_point', 'model_4_neg_distr'), + ('distribution_point', 'model_5_pos_distr'), + ('spectrum_query', 'precursor_neutral_mass'), + ('asapratio_lc_heavypeak', 'time_width'), + ('xpressratio_summary', 'masstol'), + ('affected_channel', 'correction'), + ('distribution_point', 'model_7_neg_distr'), + ('error_point', 'error'), + ('intensity', 'target_mass'), + ('roc_data_point', 'sensitivity'), + ('distribution_point', 
'model_4_pos_distr'), + ('distribution_point', 'model_2_neg_distr'), + ('distribution_point', 'model_3_pos_distr'), + ('mixture_model', 'prior_probability'), + ('roc_data_point', 'error'), + ('intensity', 'normalized'), + ('modification_info', 'mod_cterm_mass'), + ('asapratio_lc_lightpeak', 'area_error'), + ('distribution_point', 'fvalue'), + ('distribution_point', 'model_1_neg_distr'), + ('peptideprophet_summary', 'min_prob'), + ('asapratio_result', 'mean'), + ('point', 'pos_dens'), + ('fragment_masses', 'mz'), + ('mod_aminoacid_mass', 'mass'), + ('distribution_point', 'model_6_neg_distr'), + ('asapratio_lc_lightpeak', 'time_width'), + ('asapratio_result', 'heavy2light_error'), + ('peptideprophet_result', 'probability'), + ('error_point', 'min_prob'), + ('peptideprophet_summary', 'est_tot_num_correct'), + ('roc_data_point', 'min_prob'), + ('asapratio_result', 'heavy2light_mean'), + ('distribution_point', 'model_5_neg_distr'), + ('mixturemodel', 'neg_bandwidth'), + ('asapratio_result', 'error'), + ('xpressratio_result', 'light_mass'), + ('point', 'neg_dens'), + ('asapratio_lc_lightpeak', 'area'), + ('distribution_point', 'model_1_pos_distr'), + ('xpressratio_result', 'mass_tol'), + ('mixturemodel', 'pos_bandwidth'), + ('xpressratio_result', 'light_area'), + ('asapratio_peptide_data', 'heavy_mass'), + ('distribution_point', 'model_2_pos_distr'), + ('search_hit', 'calc_neutral_pep_mass'), + ('intensity', 'absolute'), + ('asapratio_peptide_data', 'light_mass'), + ('distribution_point', 'model_3_neg_distr'), + ('aminoacid_modification', 'mass'), + ('asapratio_lc_heavypeak', 'time'), + ('asapratio_lc_lightpeak', 'time'), + ('asapratio_lc_lightpeak', 'background'), + ('mixture_model', 'est_tot_correct'), + ('point', 'value'), + ('asapratio_lc_heavypeak', 'background'), + ('terminal_modification', 'mass'), + ('fragment_masses', 'offset'), + ('xpressratio_result', 'heavy_mass'), + ('search_hit', 'protein_mw'), + ('libra_summary', 'mass_tolerance'), + ('spectrum_query', 'retention_time_sec'), + ('distribution_point', 'model_7_pos_distr'), + ('asapratio_lc_heavypeak', 'area'), + ('alternative_protein', 'protein_mw'), + ('asapratio_contribution', 'ratio'), + ('xpressratio_result', 'heavy_area'), + ('distribution_point', 'model_6_pos_distr')}, + 'bools': + {('sample_enzyme', 'independent'), + ('intensity', 'reject'), + ('libra_result', 'is_rejected')}, + 'intlists': set(), + 'floatlists': set(), + 'charlists': set(), + 'lists': {'point', 'aminoacid_modification', 'msms_run_summary', + 'mixturemodel', 'search_hit', 'mixturemodel_distribution', + 'sequence_search_constraint', 'specificity', 'alternative_protein', + 'analysis_result', 'data_filter', 'fragment_masses', 'error_point', + 'parameter', 'spectrum_query', 'search_result', 'affected_channel', + 'analysis_summary', 'roc_data_point', 'distribution_point', + 'search_summary', 'mod_aminoacid_mass', 'search_score', 'intensity', + 'analysis_timestamp', 'mixture_model', 'terminal_modification', + 'contributing_channel', 'inputfile'}} + + +_traml_schema_defaults = {'bools': set(), + 'charlists': set(), + 'floatlists': set(), + 'floats': {('Modification', 'averageMassDelta'), + ('Modification', 'monoisotopicMassDelta')}, + 'intlists': set(), + 'ints': {('Modification', 'location')}, + 'lists': {'Compound', + 'Configuration', + 'Contact', + 'Instrument', + 'IntermediateProduct', + 'Interpretation', + 'Modification', + 'Peptide', + 'Protein', + 'ProteinRef', + 'Publication', + 'RetentionTime', + 'RetentionTimeList', + 'Software', + 'SourceFile', + 
'Target', + 'Transition', + 'ValidationStatus', + 'cv', + 'cvParam', + 'userParam'}} + +_idxml_schema_defaults = { + 'ints': {('PeptideHit', 'charge'), ('SearchParameters', 'missed_cleavages'), + ('PeptideHit', 'NumMatchedMainIons'), ('PeptideHit', 'IsotopeError')}, + 'floats': {('IdXML', 'version'), + ('PeptideHit', 'score'), + ('PeptideIdentification', 'MZ'), + ('PeptideIdentification', 'RT'), + ('PeptideIdentification', 'significance_threshold'), + ('PeptideHit', 'MS2IonCurrent'), + ('PeptideHit', 'MeanErrorAll'), + ('PeptideHit', 'MeanErrorTop7'), + ('PeptideHit', 'MeanRelErrorAll'), + ('PeptideHit', 'MeanRelErrorTop7'), + ('PeptideHit', 'NTermIonCurrentRatio'), + ('PeptideHit', 'CTermIonCurrentRatio'), + ('PeptideHit', 'StdevErrorAll'), + ('PeptideHit', 'StdevErrorTop7'), + ('PeptideHit', 'StdevRelErrorAll'), + ('PeptideHit', 'StdevRelErrorTop7'), + ('PeptideHit', 'ExplainedIonCurrentRatio'), + ('ProteinHit', 'coverage'), + ('ProteinHit', 'score'), + ('ProteinIdentification', 'significance_threshold'), + ('SearchParameters', 'peak_mass_tolerance'), + ('SearchParameters', 'precursor_peak_tolerance')}, + 'bools': {('PeptideIdentification', 'higher_score_better'), + ('ProteinIdentification', 'higher_score_better'), + ('SearchParameters', 'peak_mass_tolerance_ppm'), + ('SearchParameters', 'precursor_peak_tolerance_ppm')}, + 'intlists': set(), + 'floatlists': set(), + 'charlists': set(), + 'lists': {'FixedModification', + 'IdentificationRun', + 'PeptideHit', + 'PeptideIdentification', + 'ProteinHit', + 'ProteinIdentification', + 'SearchParameters', + 'UserParam', + 'VariableModification'}} diff --git a/pyteomics/achrom.py b/pyteomics/achrom.py new file mode 100644 index 0000000..05a3224 --- /dev/null +++ b/pyteomics/achrom.py @@ -0,0 +1,1326 @@ +""" +achrom - additive model of polypeptide chromatography +===================================================== + +Summary +------- + +The additive model of polypeptide chromatography, or achrom, is the most basic +model for peptide retention time prediction. The main equation behind +achrom has the following form: + +.. math:: + + RT = (1 + m\\,\\ln N) \\sum_{i=1}^{i=N}{RC_i n_i} + RT_0 + + +Here, :math:`RC_i` is the retention coefficient of the amino acid +residues of the i-th type, :math:`n_i` corresponds to the number of amino acid +residues of type :math:`i` in the peptide sequence, N is the total number of +different *types* of amino acid residues present, +and :math:`RT_0` is a constant retention time shift. + +To use achrom, one needs to find the retention +coefficients using experimentally determined retention times for a training set +of peptides, i.e. to *calibrate* the model. + +Calibration +----------- + + :py:func:`get_RCs` - find a set of retention coefficients using a + given set of peptides with known retention times and a fixed value of + the length correction parameter. + + :py:func:`get_RCs_vary_lcp` - find the best length correction parameter + and a set of retention coefficients for a given peptide sample. + +Retention time calculation +-------------------------- + + :py:func:`calculate_RT` - calculate the retention time of a peptide + using a given set of retention coefficients. + +Data +---- + + :py:data:`RCs_guo_ph2_0` - a set of retention coefficients (RCs) + from [#Guo1]_. Conditions: Synchropak RP-P C18 column (250 x 4.1 mm + I.D.), gradient (A = 0.1% aq. TFA, pH 2.0; B = 0.1% TFA in acetonitrile) at + 1% B/min, flow rate 1 ml/min, 26 centigrades. 
+ + :py:data:`RCs_guo_ph7_0` - a set of retention coefficients (RCs) + from [#Guo1]_. Conditions: Synchropak RP-P C18 column (250 x 4.1 mm + I.D.), gradient (A = aq. 10 mM (NH4)2HPO4 - 0.1 M NaClO4, pH 7.0; B + = 0.1 M NaClO4 in 60% aq. acetonitrile) at 1.67% B/min, flow rate 1 + ml/min, 26 centigrades. + + :py:data:`RCs_meek_ph2_1` - a set of RCs from [#Meek]_. Conditions: Bio-Rad + "ODS" column, gradient (A = 0.1 M NaClO4, 0.1% phosphoric acid in + water; B = 0.1 M NaClO4, 0.1% phosphoric acid in 60% + aq. acetonitrile) at 1.25% B/min, room temperature. + + :py:data:`RCs_meek_ph7_4` - a set of RCs from [#Meek]_. Conditions: Bio-Rad + "ODS" column, gradient (A = 0.1 M NaClO4, 5 mM phosphate buffer in + water; B = 0.1 M NaClO4, 5 mM phosphate buffer in 60% + aq. acetonitrile) at 1.25% B/min, room temperature. + + :py:data:`RCs_browne_tfa` - a set of RCs found in + [#Browne]_. Conditions: Waters mjuBondapak C18 column, gradient (A = + 0.1% aq. TFA, B = 0.1% TFA in acetonitrile) at 0.33% B/min, flow + rate 1.5 ml/min. + + :py:data:`RCs_browne_hfba` - a set of RCs found in + [#Browne]_. Conditions: Waters mjuBondapak C18 column, gradient (A = + 0.13% aq. HFBA, B = 0.13% HFBA in acetonitrile) at 0.33% B/min, flow + rate 1.5 ml/min. + + :py:data:`RCs_palmblad` - a set of RCs from + [#Palmblad]_. Conditions: a fused silica column (80-100 x 0.200 mm + I.D.) packed in-house with C18 ODS-AQ; solvent A = 0.5% aq. HAc, + B = 0.5% HAc in acetonitrile. + + :py:data:`RCs_yoshida` - a set of RCs for normal phase chromatography + from [#Yoshida]_. Conditions: + TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = 0.1% TFA + in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at 0.6% + water/min, flow rate 1.0 ml/min, 40 centigrades. + + :py:data:`RCs_yoshida_lc` - a set of length-corrected RCs for normal phase + chromatography. The set was calculated in [#Moskovets]_ for the data from + [#Yoshida]_. + Conditions: + TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = 0.1% TFA + in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at 0.6% + water/min, flow rate 1.0 ml/min, 40 centigrades. + + :py:data:`RCs_zubarev` - a set of length-corrected RCs calculated + on a dataset used in [#Goloborodko]_. + Conditions: Reprosil-Pur C18-AQ column (150 x 0.075 mm I.D.), gradient (A = + 0.5% AA in water; B = 0.5% AA in ACN-water (90:10)) at + 0.5% water/min, flow rate 200.0 nl/min, room temperature. + + :py:data:`RCs_gilar_atlantis_ph3_0` - a set of retention coefficients obtained + in [#Gilar]_. + Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A, + gradient (A = water, B = ACN, C = 200 mM ammonium formate): + 0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C + at 0.2 ml/min, temperature 40 C, pH 3.0 + + :py:data:`RCs_gilar_atlantis_ph4_5` - a set of retention coefficients obtained + in [#Gilar]_. + Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A, + gradient (A = water, B = ACN, C = 200 mM ammonium formate): + 0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C + at 0.2 ml/min, temperature 40 C, pH 4.5 + + :py:data:`RCs_gilar_atlantis_ph10_0` - a set of retention coefficients + obtained in [#Gilar]_. + Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A, + gradient (A = water, B = ACN, C = 200 mM ammonium formate): + 0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C + at 0.2 ml/min, temperature 40 C, pH 10.0 + + :py:data:`RCs_gilar_beh` - a set of retention coefficients obtained in + [#Gilar]_. 
+ Conditions: ACQUITY UPLC BEH HILIC column (150 x 2.1 mm I.D.), 1.7 um, 130 A, + Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by + titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: + 90% ACN, 10% mobile phase A (v:v). + Gradient: 90-60% B in 50 min. + + :py:data:`RCs_gilar_beh_amide` - a set of retention coefficients obtained in + [#Gilar]_. + Conditions: ACQUITY UPLC BEH glycan column (150 x 2.1 mm I.D.), 1.7 um, 130 A, + Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by + titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: + 90% ACN, 10% mobile phase A (v:v). + Gradient: 90-60% B in 50 min. + + :py:data:`RCs_gilar_rp` - a set of retention coefficients obtained in + [#Gilar]_. + Conditions: ACQUITY UPLC BEH C18 column (100 mm x 2.1 mm I.D.), 1.7 um, 130 A. + Mobile phase A: 0.02% TFA in water, mobile phase B: 0.018% TFA in ACN. + Gradient: 0 to 50% B in 50 min, flow rate 0.2 ml/min, temperature 40 C., + pH 2.6. + + :py:data:`RCs_krokhin_100A_fa` - a set of retention coefficients obtained in + [#Krokhin]_. + Conditions: 300 um x 150mm PepMap100 (Dionex, 0.1% FA), packed with + 5-um Luna C18(2) (Phenomenex, Torrance, CA), pH=2.0. + Both eluents A (2% ACN in water) and B (98% ACN) contained + 0.1% FA as ion-pairing modifier. 0.33% ACN/min + linear gradient (0-30% B). + + :py:data:`RCs_krokhin_100A_tfa` - a set of retention coefficients obtained in + [#Krokhin]_. + Conditions: 300 um x 150mm PepMap100 (Dionex, 0.1% TFA), packed with + 5-um Luna C18(2) (Phenomenex, Torrance, CA), pH=2.0. + Both eluents A (2% ACN in water) and B (98% ACN) contained + 0.1% TFA as ion-pairing modifier. 0.33% ACN/min + linear gradient (0-30% B). + +Theory +------ + +The additive model of polypeptide chromatography, or the model of +retention coefficients, was the earliest attempt to describe the dependence of +the retention time of a polypeptide in liquid chromatography on its sequence +[#Meek]_, [#Guo1]_. In this model, each amino acid is assigned a number, or +a *retention coefficient* (RC), describing its retention properties. The +retention time (RT) during a gradient elution is then calculated as: + +.. math:: + + RT = \\sum_{i=1}^{i=N}{RC_i \\cdot n_i} + RT_0, + +which is the sum of the retention coefficients of all amino acid residues in a +polypeptide. This equation can also be expressed in terms of linear +algebra: + +.. math:: + + RT = \\bar{aa} \\cdot \\bar{RC} + RT_0, + +where :math:`\\bar{aa}` is a vector of amino acid composition, +i.e. :math:`\\bar{aa}_i` is the number of amino acid residues of the i-th +type in a polypeptide; :math:`\\bar{RC}` is a vector of the respective +retention coefficients. + +In this formulation, it is clear that the additive model gives the same results for +any two peptides with different sequences but the same amino acid +composition. In other words, **the additive model is not sequence-specific**. + +The additive model has two advantages over all other models of chromatography: +it is easy to understand and easy to use. The rule behind the additive model is as +simple as it could be: **each amino acid residue shifts retention time by a +fixed value, depending only on its type**. This rule allows a geometrical +interpretation. Each peptide may be represented by a point in 21-dimensional +space, with the first 20 coordinates equal to the amounts of the corresponding amino +acid residues in the peptide and the 21st coordinate equal to RT. The additive +model assumes that a line may be drawn through these points. 
Of course, this +assumption is only partially valid, and most points would not lie on the +line. But the line would describe the main trend and could be used to estimate +retention times for peptides with known amino acid composition. + +This best-fit line is described by the retention coefficients and :math:`RT_0`. +The procedure of finding these coefficients is called *calibration*. There is +`an analytical solution to the calibration of linear models +<http://en.wikipedia.org/wiki/Linear_regression>`_, which makes them +especially useful in real applications. + +Several attempts have been made to improve the prediction accuracy of +the additive model (for a review of the field, see [#Baczek]_ +and [#Babushok]_). The two implemented in this module are the logarithmic +length correction term described in [#MantLogLen]_ and additional sets of +retention coefficients for terminal amino acid residues [#Tripet]_. + +Logarithmic length correction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This enhancement was first described in [#MantLogLen]_. Briefly, it was +found that the following equation better describes the dependence of RT on the +peptide sequence: + +.. math:: + + RT = \\sum_{i=1}^{i=N}{RC_i} + m\\,\\ln N \\sum_{i=1}^{i=N}{RC_i} + RT_0 + +We call the second term, :math:`m\\,\\ln N \\sum_{i=1}^{i=N}{RC_i}`, *the +length correction term*, and m *the length correction parameter*. The +simplified, vectorized form of this equation is: + +.. math:: + + RT = (1 + m\\,\\ln N) \\, \\bar{RC} \\cdot \\bar{aa} + RT_0 + +This equation may be reduced to a linear form and solved by standard +methods. + +Terminal retention coefficients +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Another significant improvement may be obtained through the introduction of +separate sets of retention coefficients for terminal amino acid residues +[#Tripet]_. + +References +---------- + +.. [#Meek] Meek, J. L. `Prediction of peptide retention times in high-pressure + liquid chromatography on the basis of amino acid composition. + <http://www.ncbi.nlm.nih.gov/pubmed/6929513>`_ + PNAS, 1980, 77 (3), 1632-1636. + +.. [#Guo1] Guo, D.; Mant, C. T.; Taneja, A. K.; Parker, J. M. R.; Hodges, + R. S. `Prediction of peptide retention times in reversed-phase + high-performance liquid chromatography I. Determination of retention + coefficients of amino acid residues of model synthetic peptides. + <http://dx.doi.org/10.1016/0021-9673(86)80102-9>`_ + Journal of Chromatography A, 1986, 359, 499-518. + +.. [#Baczek] Baczek, T.; Kaliszan, R. `Predictions of peptides' retention times + in reversed-phase liquid chromatography as a new supportive tool to improve + protein identification in proteomics. + <http://dx.doi.org/10.1002/pmic.200800544>`_ + Proteomics, 2009, 9 (4), 835-47. + +.. [#Babushok] Babushok, V. I.; Zenkevich, I. G. `Retention Characteristics of + Peptides in RP-LC: Peptide Retention Prediction. + <http://dx.doi.org/10.1365/s10337-010-1721-8>`_ + Chromatographia, 2010, 72 (9-10), 781-797. + +.. [#MantLogLen] Mant, C. T.; Zhou, N. E.; Hodges, R. S. `Correlation of + protein retention times in reversed-phase chromatography with polypeptide + chain length and hydrophobicity. + <http://dx.doi.org/10.1016/S0021-9673(01)93882-8>`_ + Journal of Chromatography A, 1989, 476, 363-375. + +.. [#Tripet] Tripet, B.; Cepeniene, D.; Kovacs, J. M.; Mant, C. T.; Krokhin, + O. V.; Hodges, R. S. 
`Requirements for prediction of peptide retention time + in reversed-phase high-performance liquid chromatography: + hydrophilicity/hydrophobicity of side-chains at the N- and C-termini of + peptides are dramatically affected by the end-groups and location. + <http://dx.doi.org/10.1016/j.chroma.2006.12.024>`_ + Journal of chromatography A, 2007, 1141 (2), 212-25. + +.. [#Browne] Browne, C. A.; Bennett, H. P. J.; Solomon, S. `The + isolation of peptides by high-performance liquid chromatography + using predicted elution positions + <http://www.sciencedirect.com/science/article/pii/000326978290238X>`_. + Analytical Biochemistry, 1982, 124 (1), 201-208. + +.. [#Palmblad] Palmblad, M.; Ramstrom, M.; Markides, K. E.; Hakansson, + P.; Bergquist, J. `Prediction of Chromatographic Retention and + Protein Identification in Liquid Chromatography/Mass + Spectrometry + <http://pubs.acs.org/doi/abs/10.1021/ac0256890>`_. + Analytical Chemistry, 2002, 74 (22), 5826-5830. + +.. [#Yoshida] Yoshida, T. Calculation of peptide retention + coefficients in normal-phase liquid chromatography. Journal of + Chromatography A, 1998, 808 (1-2), 105-112. + +.. [#Moskovets] Moskovets, E.; Goloborodko A. A.; Gorshkov A. V.; Gorshkov M.V. + `Limitation of predictive 2-D liquid chromatography in reducing the database + search space in shotgun proteomics: In silico studies. + <http://dx.doi.org/10.1002/jssc.201100798>`_ + Journal of Separation Science, 2012, 35 (14), 1771-1778. + +.. [#Goloborodko] Goloborodko A. A.; Mayerhofer C.; Zubarev A. R.; + Tarasova I. A.; Gorshkov A. V.; Zubarev, R. A.; Gorshkov, M. V. + `Empirical approach to false discovery rate + estimation in shotgun proteomics. <http://dx.doi.org/10.1002/rcm.4417>`_ + Rapid communications in mass spectrometry, 2010, 24(4), 454-62. + +.. [#Gilar] Gilar, M., & Jaworski, A. (2011). `Retention behavior of peptides in + hydrophilic-interaction chromatography. + <http://dx.doi.org/10.1016/j.chroma.2011.04.005>`_ + Journal of chromatography A, 1218(49), 8890-6. + +.. [#Krokhin] Dwivedi, R. C.; Spicer, V.; Harder, M.; Antonovici, M.; Ens, W.; + Standing, K. G.; Wilkins, J. A.; Krokhin, O. V. (2008). `Practical + implementation of 2D HPLC scheme with accurate peptide retention prediction + in both dimensions for high-throughput bottom-up proteomics + <http://pubs.acs.org/doi/abs/10.1021/ac800984n>`_. + Analytical Chemistry, 80(18), 7036-42. + +Dependencies +------------ + +This module requires :py:mod:`numpy` and, optionally, :py:mod:`scikit-learn` +(for MAE regression). + +-------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from .auxiliary import linear_regression, PyteomicsError +try: + from sklearn.linear_model import QuantileRegressor +except ImportError: + QuantileRegressor = None + +from . 
import parser + +def get_RCs(sequences, RTs, lcp=-0.21, term_aa=False, metric='mse', **kwargs): + """Calculate the retention coefficients of amino acids using + retention times of a peptide sample and a fixed value of the length + correction parameter. + + Parameters + ---------- + sequences : list of str + List of peptide sequences. + RTs : list of float + List of corresponding retention times. + lcp : float, optional + A multiplier before the ln(L) term in the equation for the retention + time of a peptide. Set to -0.21 by default. + term_aa : bool, optional + If :py:const:`True`, terminal amino acids are treated as being + modified with 'ntermX'/'ctermX' modifications. :py:const:`False` + by default. + metric : str, optional + Metric for the regression problem. Set to "mse" (mean squared + error) by default. Alternative: "mae" (mean absolute error), + which uses quantile regression. + + .. note :: + `"mae"` requires :py:mod:`scikit-learn` for + `quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_. + + labels : list of str, optional + List of all possible amino acids and terminal groups. + If not given, any modX labels are allowed. + + Returns + ------- + RC_dict : dict + Dictionary with the calculated retention coefficients. + + - RC_dict['aa'] -- amino acid retention coefficients. + + - RC_dict['const'] -- constant retention time shift. + + - RC_dict['lcp'] -- length correction parameter. + + Examples + -------- + >>> RCs = get_RCs(['A','AA'], [1.0, 2.0], 0.0, labels=['A']) + >>> abs(RCs['aa']['A'] - 1) < 1e-6 and abs(RCs['const']) < 1e-6 + True + >>> RCs = get_RCs(['A','AA','B'], [1.0, 2.0, 2.0], 0.0, labels=['A','B']) + >>> abs(RCs['aa']['A'] - 1) + abs(RCs['aa']['B'] - 2) + \ + abs(RCs['const']) < 1e-6 + True + """ + + labels = kwargs.get('labels') + + # Make a list of all amino acids present in the sample. + peptide_dicts = [ + parser.amino_acid_composition(peptide, False, term_aa, + allow_unknown_modifications=True, + labels=labels) + if not isinstance(peptide, dict) else peptide + for peptide in sequences] + + detected_amino_acids = {aa for peptide_dict in peptide_dicts + for aa in peptide_dict} + + # Determine retention coefficients using multidimensional linear + # regression. + composition_array = [] + for pdict in peptide_dicts: + loglen = np.log(parser.length(pdict)) + composition_array.append([pdict.get(aa, 0.) + * (1. + lcp * loglen) + for aa in detected_amino_acids] + [1.]) + + # Add normalizing conditions for terminal retention coefficients. The + # condition we are using here is quite arbitrary. It implies that the sum + # of N- or C-terminal RCs minus the sum of corresponding internal RCs must + # be equal to zero. + if term_aa: + for term_label in ['nterm', 'cterm']: + normalizing_peptide = [] + for aa in detected_amino_acids: + if aa.startswith(term_label): + normalizing_peptide.append(1.0) + elif (term_label+aa) in detected_amino_acids: + normalizing_peptide.append(-1.0) + else: + normalizing_peptide.append(0.0) + normalizing_peptide.append(0.0) + composition_array.append(normalizing_peptide) + RTs.append(0.0) + + if metric == 'mse': + # Use least-squares linear regression. + RCs, _, _, _ = np.linalg.lstsq(np.array(composition_array), np.array(RTs), rcond=None) + + elif metric == 'mae': + if QuantileRegressor is None: + raise PyteomicsError("`metric='mae'` requires scikit-learn.") + # Use Quantile regression. 
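+ # With the default quantile=0.5, scikit-learn's QuantileRegressor fits the + # conditional median, i.e. it minimizes the mean absolute error; alpha=0 + # disables its L1 penalty, and the 'highs' solver requires scipy >= 1.6.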
+ QR = QuantileRegressor(fit_intercept=False, alpha=0, solver='highs') + QR.fit(np.array(composition_array), np.array(RTs)) + RCs = QR.coef_ + else: + raise PyteomicsError('Invalid metric "{}". Must be "mse" or "mae".'.format(metric)) + + # Remove normalizing elements from the RTs vector. + if term_aa: + for term_label in ['nterm', 'cterm']: + RTs.pop() + + # Form output. + RC_dict = {} + RC_dict['aa'] = dict( + zip(list(detected_amino_acids), + RCs[:len(detected_amino_acids)])) + RC_dict['aa'][parser.std_nterm] = 0.0 + RC_dict['aa'][parser.std_cterm] = 0.0 + RC_dict['const'] = RCs[len(detected_amino_acids)] + RC_dict['lcp'] = lcp + + # Find remaining terminal RCs. + if term_aa: + for term_label in ['nterm', 'cterm']: + # Check if there are terminal RCs remaining undefined. + undefined_term_RCs = [aa for aa in RC_dict['aa'] + if aa[1:5] != 'term' + and term_label + aa not in RC_dict['aa']] + if not undefined_term_RCs: + continue + + # Find a linear relationship between internal and terminal RCs. + defined_term_RCs = [aa for aa in RC_dict['aa'] + if aa[1:5] != 'term' + and term_label + aa in RC_dict['aa']] + + a, b, r, stderr = linear_regression( + [RC_dict['aa'][aa] for aa in defined_term_RCs], + [RC_dict['aa'][term_label+aa] for aa in defined_term_RCs]) + + # Define missing terminal RCs using this linear equation. + for aa in undefined_term_RCs: + RC_dict['aa'][term_label + aa] = a * RC_dict['aa'][aa] + b + + return RC_dict + + +def get_RCs_vary_lcp(sequences, RTs, term_aa=False, lcp_range=(-1.0, 1.0), metric='mse', **kwargs): + """Find the best combination of a length correction parameter and + retention coefficients for a given peptide sample. + + Parameters + ---------- + sequences : list of str + List of peptide sequences. + RTs : list of float + List of corresponding retention times. + term_aa : bool, optional + If True, terminal amino acids are treated as being + modified with 'ntermX'/'ctermX' modifications. False by default. + metric : str, optional + Metric for the regression problem. Set to "mse" (mean squared + error) by default. Alternative: "mae" (mean absolute error). + + .. note :: + `"mae"` requires :py:mod:`scikit-learn` for + `quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_. + + lcp_range : 2-tuple of float, optional + Range of possible values of the length correction parameter. + labels : list of str, optional + List of labels for all possible amino acids and terminal groups + If not given, any modX labels are allowed. + lcp_accuracy : float, optional + The accuracy of the length correction parameter calculation. + + Returns + ------- + RC_dict : dict + Dictionary with the calculated retention coefficients. + + - RC_dict['aa'] -- amino acid retention coefficients. + + - RC_dict['const'] -- constant retention time shift. + + - RC_dict['lcp'] -- length correction parameter. 
+ + Examples + -------- + >>> RCs = get_RCs_vary_lcp(['A', 'AA', 'AAA'], \ + [1.0, 2.0, 3.0], \ + labels=['A']) + >>> abs(RCs['aa']['A'] - 1) + abs(RCs['lcp']) + abs(RCs['const']) < 1e-6 + True + """ + labels = kwargs.get('labels') + + best_r = -1.1 + best_RC_dict = {} + lcp_accuracy = kwargs.get('lcp_accuracy', 0.1) + + min_lcp = lcp_range[0] + max_lcp = lcp_range[1] + step = (max_lcp - min_lcp) / 10.0 + peptide_dicts = [ + parser.amino_acid_composition(peptide, False, term_aa, + allow_unknown_modifications=True, + labels=labels) + if not isinstance(peptide, dict) else peptide + for peptide in sequences] + while step > lcp_accuracy: + lcp_grid = np.arange(min_lcp, max_lcp, + (max_lcp - min_lcp) / 10.0) + for lcp in lcp_grid: + RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels, metric=metric) + regression_coeffs = linear_regression( + RTs, + [calculate_RT(peptide, RC_dict) for peptide in peptide_dicts]) + if regression_coeffs[2] > best_r: + best_r = regression_coeffs[2] + best_RC_dict = dict(RC_dict) + min_lcp = best_RC_dict['lcp'] - step + max_lcp = best_RC_dict['lcp'] + step + step = (max_lcp - min_lcp) / 10.0 + + return best_RC_dict + + +def calculate_RT(peptide, RC_dict, raise_no_mod=True): + """Calculate the retention time of a peptide using a given set + of retention coefficients. + + Parameters + ---------- + peptide : str or dict + A peptide sequence or amino acid composition. + RC_dict : dict + A set of retention coefficients, length correction parameter and + a fixed retention time shift. Keys are: 'aa', 'lcp' and 'const'. + raise_no_mod : bool, optional + If :py:const:`True` then an exception is raised when a modified amino + acid from `peptides` is not found in `RC_dict`. If :py:const:`False`, + then the retention coefficient for the non-modified amino acid residue + is used instead. :py:const:`True` by default. + + Returns + ------- + RT : float + Calculated retention time. + + Examples + -------- + >>> RT = calculate_RT('AA', {'aa': {'A': 1.1}, 'lcp':0.0, 'const': 0.1}) + >>> abs(RT - 2.3) < 1e-6 # Float comparison + True + >>> RT = calculate_RT('AAA', {'aa': {'ntermA': 1.0, 'A': 1.1, 'ctermA': 1.2},\ + 'lcp': 0.0, 'const':0.1}) + >>> abs(RT - 3.4) < 1e-6 # Float comparison + True + >>> RT = calculate_RT({'A': 3}, {'aa': {'ntermA': 1.0, 'A': 1.1, 'ctermA': 1.2},\ + 'lcp': 0.0, 'const':0.1}) + >>> abs(RT - 3.4) < 1e-6 # Float comparison + True + """ + + amino_acids = [aa for aa in RC_dict['aa'] + if not (aa[:5] == 'nterm' or aa[:5] == 'cterm')] + + # Check if there are retention coefficients for terminal amino acids. + term_aa = False + for aa in RC_dict['aa']: + if aa[:5] == 'nterm' or aa[:5] == 'cterm': + term_aa = True + break + + # Calculate retention time. + if isinstance(peptide, dict): + peptide_dict = peptide + else: + peptide_dict = parser.amino_acid_composition(peptide, False, term_aa, + allow_unknown_modifications=True, labels=amino_acids) + RT = 0.0 + for aa in peptide_dict: + if aa not in RC_dict['aa']: + if len(aa) == 1: + raise PyteomicsError('No RC for residue "{}".'.format(aa)) + if (not raise_no_mod) and aa[-1] in RC_dict['aa']: + RT += peptide_dict[aa] * RC_dict['aa'][aa[-1]] + else: + raise PyteomicsError( + 'Residue "{0}" not found in RC_dict. 
'.format(aa) + + 'Set raise_no_mod=False to ignore this error ' + + 'and use the RC for "{0}"" instead.'.format(aa[-1])) + else: + RT += peptide_dict[aa] * RC_dict['aa'][aa] + + length_correction_term = ( + 1.0 + RC_dict.get('lcp', 0) * np.log(parser.length(peptide_dict))) + RT *= length_correction_term + + RT += RC_dict.get('const', 0) + + return RT + +RCs_guo_ph2_0 = {'aa':{'K': -2.1, + 'G': -0.2, + 'L': 8.1, + 'A': 2.0, + 'C': 2.6, + 'E': 1.1, + 'D': 0.2, + 'F': 8.1, + 'I': 7.4, + 'H': -2.1, + 'M': 5.5, + 'N': -0.6, + 'Q': 0.0, + 'P': 2.0, + 'S': -0.2, + 'R': -0.6, + 'T': 0.6, + 'W': 8.8, + 'V': 5.0, + 'Y': 4.5, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients from Guo, D.; Mant, C. T.; Taneja, +A. K.; Parker, J. M. R.; Hodges, R. S. Prediction of peptide +retention times in reversed-phase high-performance liquid +chromatography I. Determination of retention coefficients of amino +acid residues of model synthetic peptides. Journal of Chromatography +A, 1986, 359, 499-518. + +Conditions: Synchropak RP-P C18 column (250 x 4.1 mm I.D.), gradient +(A = 0.1% aq. TFA, pH 2.0; B = 0.1% TFA in acetonitrile) at 1% B/min, +flow rate 1 ml/min, 26 centigrades. +""" + +RCs_guo_ph7_0 = {'aa':{'K': -0.2, + 'G': -0.2, + 'L': 9.0, + 'A': 2.2, + 'C': 2.6, + 'E': -1.3, + 'D': -2.6, + 'F': 9.0, + 'I': 8.3, + 'H': 2.2, + 'M': 6.0, + 'N': -0.8, + 'Q': 0.0, + 'P': 2.2, + 'S': -0.5, + 'R': 0.9, + 'T': 0.3, + 'W': 9.5, + 'V': 5.7, + 'Y': 4.6, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients from Guo, D.; Mant, C. T.; Taneja, +A. K.; Parker, J. M. R.; Hodges, R. S. Prediction of peptide +retention times in reversed-phase high-performance liquid +chromatography I. Determination of retention coefficients of amino +acid residues of model synthetic peptides. Journal of Chromatography +A, 1986, 359, 499-518. + +Conditions: Synchropak RP-P C18 column (250 x 4.1 mm I.D.), gradient +(A = aq. 10 mM (NH4)2HPO4 - 0.1 M NaClO4, pH 7.0; B = 0.1 M NaClO4 in +60% aq. acetonitrile) at 1.67% B/min, flow rate 1 ml/min, 26 +centigrades. +""" + +RCs_meek_ph2_1 = {'aa':{'K': -3.2, + 'G': -0.5, + 'L': 10.0, + 'A': -0.1, + 'C': -2.2, + 'E': -7.5, + 'D': -2.8, + 'F': 13.9, + 'I': 11.8, + 'H': 0.8, + 'M': 7.1, + 'N': -1.6, + 'Q': -2.5, + 'P': 8.0, + 'S': -3.7, + 'R': -4.5, + 'T': 1.5, + 'W': 18.1, + 'V': 3.3, + 'Y': 8.2, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients determined in Meek, +J. L. Prediction of peptide retention times in high-pressure liquid +chromatography on the basis of amino acid composition. PNAS, 1980, 77 +(3), 1632-1636. + +.. note :: C stands for Cystine. + +Conditions: Bio-Rad "ODS" column, gradient (A = 0.1 M NaClO4, +0.1% phosphoric acid in water; B = 0.1 M NaClO4, 0.1% phosphoric acid +in 60% aq. acetonitrile) at 1.25% B/min, room temperature. +""" + +RCs_meek_ph7_4 = {'aa':{'K': 0.1, + 'G': 0.0, + 'L': 8.8, + 'A': 0.5, + 'C': -6.8, + 'E':-16.9, + 'D': -8.2, + 'F': 13.2, + 'I': 13.9, + 'H': -3.5, + 'M': 4.8, + 'N': 0.8, + 'Q': -4.8, + 'P': 6.1, + 'S': 1.2, + 'R': 0.8, + 'T': 2.7, + 'W': 14.9, + 'V': 2.7, + 'Y': 6.1, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients determined in Meek, +J. L. Prediction of peptide retention times in high-pressure liquid +chromatography on the basis of amino acid composition. PNAS, 1980, 77 +(3), 1632-1636. + +.. note :: C stands for Cystine. 
+ +Conditions: Bio-Rad "ODS" column, gradient (A = 0.1 M NaClO4, +5 mM phosphate buffer in water; B = 0.1 M NaClO4, 5 mM phosphate buffer +in 60% aq. acetonitrile) at 1.25% B/min, room temperature. +""" + +RCs_browne_tfa = {'aa':{'K': -3.7, + 'G': -1.2, + 'L': 20.0, + 'A': 7.3, + 'C': -9.2, + 'E': -7.1, + 'D': -2.9, + 'F': 19.2, + 'I': 6.6, + 'H': -2.1, + 'M': 5.6, + 'N': -5.7, + 'Q': -0.3, + 'P': 5.1, + 'S': -4.1, + 'pS':-6.5, + 'R': -3.6, + 'T': 0.8, + 'pT':-1.6, + 'W': 16.3, + 'V': 3.5, + 'Y': 5.9, + 'pY': 3.5, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients determined in Browne, C. A.; +Bennett, H. P. J.; Solomon, S. The isolation of peptides by +high-performance liquid chromatography using predicted elution +positions. Analytical Biochemistry, 1982, 124 (1), 201-208. + +Conditions: Waters mjuBondapak C18 column, gradient (A = 0.1% aq. TFA, +B = 0.1% TFA in acetonitrile) at 0.33% B/min, flow rate 1.5 ml/min. +""" + +RCs_browne_hfba = {'aa':{'K': -2.5, + 'G': -2.3, + 'L': 15.0, + 'A': 3.9, + 'C':-14.3, + 'E': -7.5, + 'D': -2.8, + 'F': 14.7, + 'I': 11.0, + 'H': 2.0, + 'M': 4.1, + 'N': -2.8, + 'Q': 1.8, + 'P': 5.6, + 'S': -3.5, + 'pS':-7.6, + 'R': 3.2, + 'T': 1.1, + 'pT':-3.0, + 'W': 17.8, + 'V': 2.1, + 'Y': 3.8, + 'pY':-0.3, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients determined in Browne, C. A.; +Bennett, H. P. J.; Solomon, S. The isolation of peptides by +high-performance liquid chromatography using predicted elution +positions. Analytical Biochemistry, 1982, 124 (1), 201-208. + +Conditions: Waters mjuBondapak C18 column, gradient (A = 0.13% aq. HFBA, +B = 0.13% HFBA in acetonitrile) at 0.33% B/min, flow rate 1.5 ml/min. +""" + +RCs_palmblad = {'aa':{'K': -0.66, + 'G': -0.29, + 'L': 2.28, + 'A': 0.41, + 'C': -1.32, + 'E': -0.26, + 'D': 0.04, + 'F': 2.68, + 'I': 2.70, + 'H': 0.57, + 'M': 0.98, + 'N': -0.54, + 'Q': 1.02, + 'P': 0.97, + 'S': -0.71, + 'R': -0.76, + 'T': 0.37, + 'W': 4.68, + 'V': 2.44, + 'Y': 2.78, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients determined in Palmblad, M.; +Ramstrom, M.; Markides, K. E.; Hakansson, P.; Bergquist, J. Prediction +of Chromatographic Retention and Protein Identification in Liquid +Chromatography/Mass Spectrometry. Analytical Chemistry, 2002, 74 (22), +5826-5830. + +Conditions: a fused silica column (80-100 x 0.200 mm I.D.) packed +in-house with C18 ODS-AQ; solvent A = 0.5% aq. HAc, B = 0.5% HAc in +acetonitrile. +""" + +RCs_yoshida = {'aa':{'K': 2.77, + 'G': -0.16, + 'L': -2.31, + 'A': 0.28, + 'C': 0.80, + 'camC': 0.80, + 'E': 1.58, + 'D': 2.45, + 'F': -2.94, + 'I': -1.34, + 'H': 3.44, + 'M': -0.14, + 'N': 3.25, + 'Q': 2.35, + 'P': 0.77, + 'S': 2.53, + 'R': 3.90, + 'T': 1.73, + 'W': -1.80, + 'V': -2.19, + 'Y': -0.11, + 'H-': 0.0, + '-OH':0.0}, + 'lcp': 0.0, + 'const': 0.0} +"""A set of retention coefficients determined in Yoshida, +T. Calculation of peptide retention coefficients in normal-phase +liquid chromatography. Journal of Chromatography A, 1998, 808 (1-2), +105-112. + +.. note:: Cysteine is Carboxymethylated. + +Conditions: TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = +0.1% TFA in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at +0.6% water/min, flow rate 1.0 ml/min, 40 centigrades. 
+""" + +RCs_yoshida_lc = {'aa': {'A': 1.29, + 'C': 0.94, + 'camC': 0.94, + 'D': 3.89, + 'E': 4.40, + 'F': -4.18, + 'G': 1.29, + 'H': 7.57, + 'I': -2.65, + 'K': 7.33, + 'L': -3.93, + 'M': -1.48, + 'N': 6.65, + 'P': 1.03, + 'Q': 6.68, + 'R': 7.08, + 'S': 5.09, + 'T': 3.46, + 'V': -2.52, + 'W': -1.87, + 'Y': -0.46, + 'H-': 0.0, + '-OH': 0.0}, + 'const': 0.0, + 'lcp': -0.2} +"""A set of retention coefficients from the length-corrected model +of normal-phase peptide chromatography. The dataset comes from Yoshida, T. +Calculation of peptide retention coefficients in normal-phase +liquid chromatography. Journal of Chromatography A, 1998, 808 (1-2), +105-112. The RCs were calculated in Moskovets, E.; Goloborodko A. A.; +Gorshkov A. V.; Gorshkov M.V. Limitation of predictive 2-D liquid chromatography +in reducing the database search space in shotgun proteomics: In silico studies. +Journal of Separation Science, 2012, 35 (14), 1771-1778. + +.. note:: Cysteine is Carboxymethylated. + +Conditions: TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = +0.1% TFA in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at +0.6% water/min, flow rate 1.0 ml/min, 40 centigrades. +""" + +RCs_zubarev = {'aa': {'A': 6.73, + 'E': 5.66, + 'C': 3.25, + 'D': 5.64, + 'G': 2.35, + 'F': 27.43, + 'I': 20.50, + 'H': -0.66, + 'K': -4.47, + 'M': 17.39, + 'L': 23.38, + 'N': 2.57, + 'Q': 2.93, + 'P': 5.66, + 'S': 3.58, + 'R': -2.55, + 'T': 4.88, + 'Y': 13.22, + 'W': 31.27, + 'V': 13.05, + 'camC': 3.25, + 'C': 3.25, + 'oxM': -7.61, + '-OH': 0.0, + 'H-': 0.0}, + 'const': 0.53, + 'lcp': -0.21} +"""A set of retention coefficients from the length-corrected model +of reversed-phase peptide chromatography. The dataset was taken from +Goloborodko A. A.; Mayerhofer C.; Zubarev A. R.; Tarasova I. A.; Gorshkov A. V.; +Zubarev, R. A.; Gorshkov, M. V. Empirical approach to false discovery rate +estimation in shotgun proteomics. Rapid communications in mass spectrometry, +2010, 24(4), 454-62. + +.. note:: Cysteine is Carbamidomethylated. + +Conditions: Reprosil-Pur C18-AQ column (150 x 0.075 mm I.D.), gradient (A = +0.5% AA in water; B = 0.5% AA in ACN-water (90:10)) at +0.5% water/min, flow rate 200.0 nl/min, room temperature. +""" + +RCs_gilar_atlantis_ph3_0 = {'aa': {'K': 15.90, + 'R': 13.64, + 'H': 12.94, + 'E': 2.97, + 'P': 4.77, + 'Q': 5.43, + 'D': 3.20, + 'C*': 4.87, + 'C': 4.87, + 'N': 3.91, + 'A': 3.34, + 'G': 3.33, + 'S': 3.04, + 'T': 2.71, + 'V': 1.75, + 'I': 0.65, + 'M': 1.13, + 'L': 0.13, + 'F': -1.17, + 'Y': -0.22, + 'W': -2.47}, + 'lcp': 0.0, + 'const': 21.33} +"""A set of retention coefficients for normal phase chromatography obtained in +Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in +hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), +8890-6. + +.. note:: Cysteine is Carbamidomethylated. 
+ +Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A, +gradient (A = water, B = ACN, C = 200 mM ammonium formate): +0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C +at 0.2 ml/min, temperature 40 C, pH 3.0""" + +RCs_gilar_atlantis_ph4_5 = {'aa': {'K': 15.49, + 'R': 13.33, + 'H': 12.19, + 'E': 6.93, + 'P': 5.89, + 'Q': 5.68, + 'D': 5.31, + 'C*': 5.23, + 'C': 5.23, + 'N': 4.07, + 'A': 3.6, + 'G': 3.46, + 'S': 2.62, + 'T': 2.33, + 'V': 1.42, + 'I': 0.84, + 'M': 0.34, + 'L': 0.29, + 'F': -1.21, + 'Y': -1.62, + 'W': -2.08}, + 'lcp': 0.0, + 'const': 23.95} +"""A set of retention coefficients for normal phase chromatography obtained in +Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in +hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), +8890-6. + +.. note:: Cysteine is Carbamidomethylated. + +Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A, +gradient (A = water, B = ACN, C = 200 mM ammonium formate): +0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C +at 0.2 ml/min, temperature 40 C, pH 4.5""" + +RCs_gilar_atlantis_ph10_0 = {'aa': {'K': 25.23, + 'R': 23.38, + 'H': 5.94, + 'E': 0.59, + 'P': 4.00, + 'Q': 3.53, + 'D': -0.84, + 'C*': 3.52, + 'C': 3.52, + 'N': 3.26, + 'A': 3.64, + 'G': 3.02, + 'S': 2.28, + 'T': 1.74, + 'V': 1.05, + 'I': 1.51, + 'M': -0.61, + 'L': 0.25, + 'F': -0.17, + 'Y': -0.79, + 'W': 0.23}, + 'lcp': 0.0, + 'const': 13.78} +"""A set of retention coefficients for normal phase chromatography obtained in +Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in +hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), +8890-6. + +.. note:: Cysteine is Carbamidomethylated. + +Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A, +gradient (A = water, B = ACN, C = 200 mM ammonium formate): +0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C +at 0.2 ml/min, temperature 40 C, pH 10.0""" + +RCs_gilar_beh = {'aa': {'K': 9.49, + 'R': 8.56, + 'H': 8.40, + 'E': 5.95, + 'P': 4.73, + 'Q': 4.65, + 'D': 4.97, + 'C': 3.47, + 'C*': 3.47, + 'N': 3.50, + 'A': 2.90, + 'G': 2.63, + 'S': 2.14, + 'T': 2.19, + 'V': 1.71, + 'I': 1.30, + 'M': 1.40, + 'L': 0.73, + 'F': -0.09, + 'Y': -0.40, + 'W': 0.11}, + 'lcp': 0.0, + 'const': 18.41} +"""A set of retention coefficients for normal phase chromatography obtained in +Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in +hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), +8890-6. + +.. note:: Cysteine is Carbamidomethylated. + +Conditions: ACQUITY UPLC BEH HILIC column (150 x 2.1 mm I.D.), 1.7 um, 130 A, +Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by +titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: +90% ACN, 10% mobile phase A (v:v). +Gradient: 90-60% B in 50 min.""" + +RCs_gilar_beh_amide = {'aa': {'K': 7.19, + 'R': 6.68, + 'H': 6.16, + 'E': 6.11, + 'P': 3.18, + 'Q': 5.19, + 'D': 6.02, + 'C*': 3.71, + 'C': 3.71, + 'N': 4.16, + 'A': 2.64, + 'G': 3.12, + 'S': 3.17, + 'T': 3.41, + 'V': 0.83, + 'I': -0.69, + 'M': -0.12, + 'L': -1.24, + 'F': -1.93, + 'Y': 0.46, + 'W': -2.11}, + 'lcp': 0.0, + 'const': 24.26} +"""A set of retention coefficients for normal phase chromatography obtained in +Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in +hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), +8890-6. + +.. note:: Cysteine is Carbamidomethylated. 
+
+Conditions: ACQUITY UPLC BEH glycan column (150 x 2.1 mm I.D.), 1.7 um, 130 A,
+Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by
+titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B:
+90% ACN, 10% mobile phase A (v:v).
+Gradient: 90-60% B in 50 min."""
+
+RCs_gilar_rp = {'aa': {'K': -1.015,
+                       'R': -0.681,
+                       'H': -1.937,
+                       'E': 1.475,
+                       'P': 3.496,
+                       'Q': 1.228,
+                       'D': 1.326,
+                       'C': 1.832,
+                       'C*': 1.832,
+                       'N': 0.299,
+                       'A': 2.322,
+                       'G': 1.172,
+                       'S': 1.165,
+                       'T': 1.894,
+                       'V': 5.695,
+                       'I': 8.343,
+                       'M': 5.128,
+                       'L': 9.069,
+                       'F': 10.877,
+                       'Y': 5.603,
+                       'W': 12.183},
+                'lcp': 0.0,
+                'const': -3.696}
+"""A set of retention coefficients for reversed-phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49),
+8890-6.
+
+.. note:: Cysteine is Carbamidomethylated.
+
+Conditions: ACQUITY UPLC BEH C18 column (100 x 2.1 mm I.D.), 1.7 um, 130 A.
+Mobile phase A: 0.02% TFA in water, mobile phase B: 0.018% TFA in ACN.
+Gradient: 0 to 50% B in 50 min, flow rate 0.2 ml/min, temperature 40 C, pH 2.6.
+"""
+
+RCs_krokhin_100A_fa = {'aa':{'K': -5.08,
+                             'G': -0.07,
+                             'L': 9.89,
+                             'A': 1.63,
+                             'C': 0.7,
+                             'camC': 0.7,
+                             'E': 1.75,
+                             'D': 0.95,
+                             'F': 11.92,
+                             'I': 9.06,
+                             'H': -5.05,
+                             'M': 6.96,
+                             'N': -0.59,
+                             'Q': 0.2,
+                             'P': 1.98,
+                             'S': 0.27,
+                             'R': -3.55,
+                             'T': 1.37,
+                             'W': 13.67,
+                             'V': 5.72,
+                             'Y': 5.97},
+                       'lcp': 0.0,
+                       'const': 0.0}
+"""A set of retention coefficients from R.C. Dwivedi, V. Spicer,
+M. Harder, M. Antonovici, W. Ens, K.G. Standing, J.A. Wilkins, and O.V. Krokhin;
+Analytical Chemistry 2008 80 (18), 7036-7042.
+Practical Implementation of 2D HPLC Scheme with Accurate Peptide
+Retention Prediction in Both Dimensions for High-Throughput Bottom-Up Proteomics.
+
+.. note:: Cysteine is Carbamidomethylated.
+
+Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% FA), packed with
+5-um Luna C18(2) (Phenomenex, Torrance, CA), pore size 100 A, pH=2.0.
+Both eluents A (2% ACN in water) and B (98% ACN) contained
+0.1% FA as ion-pairing modifier. 0.33% ACN/min
+linear gradient (0-30% B).
+"""
+
+RCs_krokhin_100A_tfa = {'aa':{'K': -3.53,
+                              'G': -0.35,
+                              'L': 9.44,
+                              'A': 1.11,
+                              'C': 0.04,
+                              'camC': 0.04,
+                              'E': 1.08,
+                              'D': -0.22,
+                              'F': 11.34,
+                              'I': 7.86,
+                              'H': -3.04,
+                              'M': 6.57,
+                              'N': -1.44,
+                              'Q': -0.53,
+                              'P': 1.62,
+                              'S': -0.33,
+                              'R': -2.58,
+                              'T': 0.48,
+                              'W': 13.12,
+                              'V': 4.86,
+                              'Y': 5.4},
+                        'lcp': 0.0,
+                        'const': 0.0}
+"""A set of retention coefficients from R.C. Dwivedi, V. Spicer,
+M. Harder, M. Antonovici, W. Ens, K.G. Standing, J.A. Wilkins, and O.V. Krokhin;
+Analytical Chemistry 2008 80 (18), 7036-7042.
+Practical Implementation of 2D HPLC Scheme with Accurate Peptide
+Retention Prediction in Both Dimensions for High-Throughput Bottom-Up Proteomics.
+
+.. note:: Cysteine is Carbamidomethylated.
+
+Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% TFA), packed with
+5-um Luna C18(2) (Phenomenex, Torrance, CA), pore size 100 A, pH=2.0.
+Both eluents A (2% ACN in water) and B (98% ACN) contained
+0.1% TFA as ion-pairing modifier. 0.33% ACN/min
+linear gradient (0-30% B).
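+
+A minimal comparison sketch (illustrative; assumes this file is importable as
+``pyteomics.achrom``, with ``calculate_RT`` as in upstream pyteomics). The FA
+and TFA sets describe the same column under different ion-pairing modifiers,
+so one peptide can be scored against both::
+
+    from pyteomics import achrom
+
+    # Same peptide, same column, two ion-pairing modifiers.
+    rt_fa = achrom.calculate_RT('PEPTIDE', achrom.RCs_krokhin_100A_fa)
+    rt_tfa = achrom.calculate_RT('PEPTIDE', achrom.RCs_krokhin_100A_tfa)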
+""" + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/pyteomics/auxiliary/__init__.py b/pyteomics/auxiliary/__init__.py new file mode 100644 index 0000000..79e3e5c --- /dev/null +++ b/pyteomics/auxiliary/__init__.py @@ -0,0 +1,35 @@ +try: + basestring = basestring +except NameError: + basestring = (str, bytes) + +from .structures import ( + PyteomicsError, Charge, ChargeList, + _parse_charge, _parse_ion, BasicComposition, + unitfloat, unitint, unitstr, cvstr, + cvquery) + +from .constants import _nist_mass + +from .file_helpers import ( + _file_obj, _keepstate, _keepstate_method, IteratorContextManager, + FileReader, IndexedTextReader, IndexedReaderMixin, TimeOrderedIndexedReaderMixin, + IndexSavingMixin, OffsetIndex, HierarchicalOffsetIndex, IndexSavingTextReader, + _file_reader, _file_writer, + _make_chain, _check_use_index, FileReadingProcess, TaskMappingMixin, + serializer, ChainBase, TableJoiner) + +from .math import ( + linear_regression, linear_regression_perpendicular, + linear_regression_vertical) + +from .target_decoy import ( + _calculate_qvalues, _qvalues_df, _decoy_or_pep_label, + _construct_dtype, _make_qvalues, _make_filter, + _itercontext, _iter, qvalues, filter, log_factorial, + _expectation, _confidence_value, _log_pi_r, + _log_pi, _make_fdr, fdr, sigma_T, sigma_fdr) + +from .utils import ( + print_tree, memoize, BinaryDataArrayTransformer, ArrayConversionMixin, BinaryArrayConversionMixin, + MaskedArrayConversionMixin, _decode_base64_data_array) diff --git a/pyteomics/auxiliary/constants.py b/pyteomics/auxiliary/constants.py new file mode 100644 index 0000000..7dc7664 --- /dev/null +++ b/pyteomics/auxiliary/constants.py @@ -0,0 +1,3297 @@ +### Bulky constants for other modules are defined below. + +_nist_mass = {'Ac': {0: (227, 1.0), + 206: (206.0145, 0.0), + 207: (207.01195, 0.0), + 208: (208.01155, 0.0), + 209: (209.00949, 0.0), + 210: (210.00944, 0.0), + 211: (211.00773, 0.0), + 212: (212.00781, 0.0), + 213: (213.00661, 0.0), + 214: (214.006902, 0.0), + 215: (215.006454, 0.0), + 216: (216.00872, 0.0), + 217: (217.009347, 0.0), + 218: (218.01164, 0.0), + 219: (219.01242, 0.0), + 220: (220.014763, 0.0), + 221: (221.01559, 0.0), + 222: (222.017844, 0.0), + 223: (223.019137, 0.0), + 224: (224.021723, 0.0), + 225: (225.02323, 0.0), + 226: (226.026098, 0.0), + 227: (227.0277521, 0.0), + 228: (228.0310211, 0.0), + 229: (229.03302, 0.0), + 230: (230.03629, 0.0), + 231: (231.03856, 0.0), + 232: (232.04203, 0.0), + 233: (233.04455, 0.0), + 234: (234.04842, 0.0), + 235: (235.05123, 0.0), + 236: (236.0553, 0.0)}, + 'Ag': {0: (106.905097, 1.0), + 93: (92.94978, 0.0), + 94: (93.94278, 0.0), + 95: (94.93548, 0.0), + 96: (95.93068, 0.0), + 97: (96.92397, 0.0), + 98: (97.92157, 0.0), + 99: (98.9176, 0.0), + 100: (99.9161, 0.0), + 101: (100.9128, 0.0), + 102: (101.91169, 0.0), + 103: (102.908973, 0.0), + 104: (103.908629, 0.0), + 105: (104.906529, 0.0), + 106: (105.906669, 0.0), + 107: (106.905097, 0.51839), + 108: (107.905956, 0.0), + 109: (108.904752, 0.48161), + 110: (109.906107, 0.0), + 111: (110.905291, 0.0), + 112: (111.907005, 0.0), + 113: (112.906567, 0.0), + 114: (113.908804, 0.0), + 115: (114.90876, 0.0), + 116: (115.91136, 0.0), + 117: (116.91168, 0.0), + 118: (117.91458, 0.0), + 119: (118.91567, 0.0), + 120: (119.91879, 0.0), + 121: (120.91985, 0.0), + 122: (121.92353, 0.0), + 123: (122.9249, 0.0), + 124: (123.92864, 0.0), + 125: (124.93043, 0.0), + 126: (125.9345, 0.0), + 127: (126.93677, 0.0), + 128: (127.94117, 0.0), + 129: (128.94369, 
0.0), + 130: (129.95045, 0.0)}, + 'Al': {0: (26.98153863, 1.0), + 21: (21.02804, 0.0), + 22: (22.01952, 0.0), + 23: (23.007267, 0.0), + 24: (23.9999389, 0.0), + 25: (24.9904281, 0.0), + 26: (25.98689169, 0.0), + 27: (26.98153863, 1.0), + 28: (27.98191031, 0.0), + 29: (28.980445, 0.0), + 30: (29.98296, 0.0), + 31: (30.983947, 0.0), + 32: (31.98812, 0.0), + 33: (32.99084, 0.0), + 34: (33.99685, 0.0), + 35: (34.99986, 0.0), + 36: (36.00621, 0.0), + 37: (37.01068, 0.0), + 38: (38.01723, 0.0), + 39: (39.02297, 0.0), + 40: (40.03145, 0.0), + 41: (41.03833, 0.0), + 42: (42.04689, 0.0)}, + 'Am': {0: (243, 1.0), + 231: (231.04556, 0.0), + 232: (232.04659, 0.0), + 233: (233.04635, 0.0), + 234: (234.04781, 0.0), + 235: (235.04795, 0.0), + 236: (236.04958, 0.0), + 237: (237.05, 0.0), + 238: (238.05198, 0.0), + 239: (239.0530245, 0.0), + 240: (240.0553, 0.0), + 241: (241.0568291, 0.0), + 242: (242.0595492, 0.0), + 243: (243.0613811, 0.0), + 244: (244.0642848, 0.0), + 245: (245.066452, 0.0), + 246: (246.069775, 0.0), + 247: (247.07209, 0.0), + 248: (248.07575, 0.0), + 249: (249.07848, 0.0)}, + 'Ar': {0: (39.9623831225, 1.0), + 30: (30.02156, 0.0), + 31: (31.01212, 0.0), + 32: (31.997638, 0.0), + 33: (32.9899257, 0.0), + 34: (33.9802712, 0.0), + 35: (34.9752576, 0.0), + 36: (35.967545106, 0.003365), + 37: (36.96677632, 0.0), + 38: (37.9627324, 0.000632), + 39: (38.964313, 0.0), + 40: (39.9623831225, 0.996003), + 41: (40.9645006, 0.0), + 42: (41.963046, 0.0), + 43: (42.965636, 0.0), + 44: (43.964924, 0.0), + 45: (44.96804, 0.0), + 46: (45.96809, 0.0), + 47: (46.97219, 0.0), + 48: (47.97454, 0.0), + 49: (48.98052, 0.0), + 50: (49.98443, 0.0), + 51: (50.99163, 0.0), + 52: (51.99678, 0.0), + 53: (53.00494, 0.0)}, + 'As': {0: (74.9215965, 1.0), + 60: (59.99313, 0.0), + 61: (60.98062, 0.0), + 62: (61.9732, 0.0), + 63: (62.96369, 0.0), + 64: (63.95757, 0.0), + 65: (64.94956, 0.0), + 66: (65.94471, 0.0), + 67: (66.93919, 0.0), + 68: (67.93677, 0.0), + 69: (68.93227, 0.0), + 70: (69.93092, 0.0), + 71: (70.927112, 0.0), + 72: (71.926752, 0.0), + 73: (72.923825, 0.0), + 74: (73.9239287, 0.0), + 75: (74.9215965, 1.0), + 76: (75.922394, 0.0), + 77: (76.9206473, 0.0), + 78: (77.921827, 0.0), + 79: (78.920948, 0.0), + 80: (79.922534, 0.0), + 81: (80.922132, 0.0), + 82: (81.9245, 0.0), + 83: (82.92498, 0.0), + 84: (83.92906, 0.0), + 85: (84.93202, 0.0), + 86: (85.9365, 0.0), + 87: (86.9399, 0.0), + 88: (87.94494, 0.0), + 89: (88.94939, 0.0), + 90: (89.9555, 0.0), + 91: (90.96043, 0.0), + 92: (91.9668, 0.0)}, + 'At': {0: (210, 1.0), + 193: (192.99984, 0.0), + 194: (193.99873, 0.0), + 195: (194.996268, 0.0), + 196: (195.99579, 0.0), + 197: (196.99319, 0.0), + 198: (197.99284, 0.0), + 199: (198.99053, 0.0), + 200: (199.990351, 0.0), + 201: (200.988417, 0.0), + 202: (201.98863, 0.0), + 203: (202.986942, 0.0), + 204: (203.987251, 0.0), + 205: (204.986074, 0.0), + 206: (205.986667, 0.0), + 207: (206.985784, 0.0), + 208: (207.98659, 0.0), + 209: (208.986173, 0.0), + 210: (209.987148, 0.0), + 211: (210.9874963, 0.0), + 212: (211.990745, 0.0), + 213: (212.992937, 0.0), + 214: (213.996372, 0.0), + 215: (214.998653, 0.0), + 216: (216.002423, 0.0), + 217: (217.004719, 0.0), + 218: (218.008694, 0.0), + 219: (219.011162, 0.0), + 220: (220.01541, 0.0), + 221: (221.01805, 0.0), + 222: (222.02233, 0.0), + 223: (223.02519, 0.0)}, + 'Au': {0: (196.9665687, 1.0), + 169: (168.99808, 0.0), + 170: (169.99612, 0.0), + 171: (170.991879, 0.0), + 172: (171.99004, 0.0), + 173: (172.986237, 0.0), + 174: (173.98476, 0.0), + 175: (174.98127, 0.0), + 
176: (175.9801, 0.0), + 177: (176.976865, 0.0), + 178: (177.97603, 0.0), + 179: (178.973213, 0.0), + 180: (179.972521, 0.0), + 181: (180.970079, 0.0), + 182: (181.969618, 0.0), + 183: (182.967593, 0.0), + 184: (183.967452, 0.0), + 185: (184.965789, 0.0), + 186: (185.965953, 0.0), + 187: (186.964568, 0.0), + 188: (187.965324, 0.0), + 189: (188.963948, 0.0), + 190: (189.9647, 0.0), + 191: (190.9637, 0.0), + 192: (191.964813, 0.0), + 193: (192.96415, 0.0), + 194: (193.965365, 0.0), + 195: (194.9650346, 0.0), + 196: (195.96657, 0.0), + 197: (196.9665687, 1.0), + 198: (197.9682423, 0.0), + 199: (198.9687652, 0.0), + 200: (199.97073, 0.0), + 201: (200.971657, 0.0), + 202: (201.97381, 0.0), + 203: (202.975155, 0.0), + 204: (203.97772, 0.0), + 205: (204.97987, 0.0)}, + 'B': {0: (11.0093054, 1.0), + 6: (6.04681, 0.0), + 7: (7.02992, 0.0), + 8: (8.0246072, 0.0), + 9: (9.0133288, 0.0), + 10: (10.012937, 0.199), + 11: (11.0093054, 0.801), + 12: (12.0143521, 0.0), + 13: (13.0177802, 0.0), + 14: (14.025404, 0.0), + 15: (15.031103, 0.0), + 16: (16.03981, 0.0), + 17: (17.04699, 0.0), + 18: (18.05617, 0.0), + 19: (19.06373, 0.0)}, + 'Ba': {0: (137.9052472, 1.0), + 114: (113.95068, 0.0), + 115: (114.94737, 0.0), + 116: (115.94138, 0.0), + 117: (116.9385, 0.0), + 118: (117.93304, 0.0), + 119: (118.93066, 0.0), + 120: (119.92604, 0.0), + 121: (120.92405, 0.0), + 122: (121.9199, 0.0), + 123: (122.918781, 0.0), + 124: (123.915094, 0.0), + 125: (124.914473, 0.0), + 126: (125.91125, 0.0), + 127: (126.911094, 0.0), + 128: (127.908318, 0.0), + 129: (128.908679, 0.0), + 130: (129.9063208, 0.00106), + 131: (130.906941, 0.0), + 132: (131.9050613, 0.00101), + 133: (132.9060075, 0.0), + 134: (133.9045084, 0.02417), + 135: (134.9056886, 0.06592), + 136: (135.9045759, 0.07854), + 137: (136.9058274, 0.11232), + 138: (137.9052472, 0.71698), + 139: (138.9088413, 0.0), + 140: (139.910605, 0.0), + 141: (140.914411, 0.0), + 142: (141.916453, 0.0), + 143: (142.920627, 0.0), + 144: (143.922953, 0.0), + 145: (144.92763, 0.0), + 146: (145.93022, 0.0), + 147: (146.93495, 0.0), + 148: (147.93772, 0.0), + 149: (148.94258, 0.0), + 150: (149.94568, 0.0), + 151: (150.95081, 0.0), + 152: (151.95427, 0.0), + 153: (152.95961, 0.0)}, + 'Be': {0: (9.0121822, 1.0), + 5: (5.04079, 0.0), + 6: (6.019726, 0.0), + 7: (7.01692983, 0.0), + 8: (8.0053051, 0.0), + 9: (9.0121822, 1.0), + 10: (10.0135338, 0.0), + 11: (11.021658, 0.0), + 12: (12.026921, 0.0), + 13: (13.03569, 0.0), + 14: (14.04289, 0.0), + 15: (15.05346, 0.0), + 16: (16.06192, 0.0)}, + 'Bh': {0: (272, 1.0), + 260: (260.12197, 0.0), + 261: (261.12166, 0.0), + 262: (262.12289, 0.0), + 263: (263.12304, 0.0), + 264: (264.1246, 0.0), + 265: (265.12515, 0.0), + 266: (266.12694, 0.0), + 267: (267.12765, 0.0), + 268: (268.12976, 0.0), + 269: (269.13069, 0.0), + 270: (270.13362, 0.0), + 271: (271.13518, 0.0), + 272: (272.13803, 0.0), + 273: (273.13962, 0.0), + 274: (274.14244, 0.0), + 275: (275.14425, 0.0)}, + 'Bi': {0: (208.9803987, 1.0), + 184: (184.00112, 0.0), + 185: (184.99763, 0.0), + 186: (185.9966, 0.0), + 187: (186.993158, 0.0), + 188: (187.99227, 0.0), + 189: (188.9892, 0.0), + 190: (189.9883, 0.0), + 191: (190.985786, 0.0), + 192: (191.98546, 0.0), + 193: (192.98296, 0.0), + 194: (193.98283, 0.0), + 195: (194.980651, 0.0), + 196: (195.980667, 0.0), + 197: (196.978864, 0.0), + 198: (197.97921, 0.0), + 199: (198.977672, 0.0), + 200: (199.978132, 0.0), + 201: (200.977009, 0.0), + 202: (201.977742, 0.0), + 203: (202.976876, 0.0), + 204: (203.977813, 0.0), + 205: (204.977389, 0.0), + 206: 
(205.978499, 0.0), + 207: (206.9784707, 0.0), + 208: (207.9797422, 0.0), + 209: (208.9803987, 1.0), + 210: (209.9841204, 0.0), + 211: (210.987269, 0.0), + 212: (211.9912857, 0.0), + 213: (212.994385, 0.0), + 214: (213.998712, 0.0), + 215: (215.00177, 0.0), + 216: (216.006306, 0.0), + 217: (217.00947, 0.0), + 218: (218.01432, 0.0)}, + 'Bk': {0: (247, 1.0), + 235: (235.05658, 0.0), + 236: (236.05733, 0.0), + 237: (237.057, 0.0), + 238: (238.05828, 0.0), + 239: (239.05828, 0.0), + 240: (240.05976, 0.0), + 241: (241.06023, 0.0), + 242: (242.06198, 0.0), + 243: (243.063008, 0.0), + 244: (244.065181, 0.0), + 245: (245.0663616, 0.0), + 246: (246.06867, 0.0), + 247: (247.070307, 0.0), + 248: (248.07309, 0.0), + 249: (249.0749867, 0.0), + 250: (250.078317, 0.0), + 251: (251.08076, 0.0), + 252: (252.08431, 0.0), + 253: (253.08688, 0.0), + 254: (254.0906, 0.0)}, + 'Br': {0: (78.9183371, 1.0), + 67: (66.96479, 0.0), + 68: (67.95852, 0.0), + 69: (68.95011, 0.0), + 70: (69.94479, 0.0), + 71: (70.93874, 0.0), + 72: (71.93664, 0.0), + 73: (72.93169, 0.0), + 74: (73.929891, 0.0), + 75: (74.925776, 0.0), + 76: (75.924541, 0.0), + 77: (76.921379, 0.0), + 78: (77.921146, 0.0), + 79: (78.9183371, 0.5069), + 80: (79.9185293, 0.0), + 81: (80.9162906, 0.4931), + 82: (81.9168041, 0.0), + 83: (82.91518, 0.0), + 84: (83.916479, 0.0), + 85: (84.915608, 0.0), + 86: (85.918798, 0.0), + 87: (86.920711, 0.0), + 88: (87.92407, 0.0), + 89: (88.92639, 0.0), + 90: (89.93063, 0.0), + 91: (90.93397, 0.0), + 92: (91.93926, 0.0), + 93: (92.94305, 0.0), + 94: (93.94868, 0.0), + 95: (94.95287, 0.0), + 96: (95.95853, 0.0), + 97: (96.9628, 0.0)}, + 'C': {0: (12.0, 1.0), + 8: (8.037675, 0.0), + 9: (9.0310367, 0.0), + 10: (10.0168532, 0.0), + 11: (11.0114336, 0.0), + 12: (12.0, 0.9893), + 13: (13.0033548378, 0.0107), + 14: (14.003241989, 0.0), + 15: (15.0105993, 0.0), + 16: (16.014701, 0.0), + 17: (17.022586, 0.0), + 18: (18.02676, 0.0), + 19: (19.03481, 0.0), + 20: (20.04032, 0.0), + 21: (21.04934, 0.0), + 22: (22.0572, 0.0)}, + 'Ca': {0: (39.96259098, 1.0), + 34: (34.01412, 0.0), + 35: (35.00494, 0.0), + 36: (35.99309, 0.0), + 37: (36.98587, 0.0), + 38: (37.976318, 0.0), + 39: (38.9707197, 0.0), + 40: (39.96259098, 0.96941), + 41: (40.96227806, 0.0), + 42: (41.95861801, 0.00647), + 43: (42.9587666, 0.00135), + 44: (43.9554818, 0.02086), + 45: (44.9561866, 0.0), + 46: (45.9536926, 4e-05), + 47: (46.954546, 0.0), + 48: (47.952534, 0.00187), + 49: (48.955674, 0.0), + 50: (49.957519, 0.0), + 51: (50.9615, 0.0), + 52: (51.9651, 0.0), + 53: (52.97005, 0.0), + 54: (53.97435, 0.0), + 55: (54.98055, 0.0), + 56: (55.98557, 0.0), + 57: (56.99236, 0.0)}, + 'Cd': {0: (113.9033585, 1.0), + 95: (94.94987, 0.0), + 96: (95.93977, 0.0), + 97: (96.93494, 0.0), + 98: (97.9274, 0.0), + 99: (98.92501, 0.0), + 100: (99.92029, 0.0), + 101: (100.91868, 0.0), + 102: (101.91446, 0.0), + 103: (102.913419, 0.0), + 104: (103.909849, 0.0), + 105: (104.909468, 0.0), + 106: (105.906459, 0.0125), + 107: (106.906618, 0.0), + 108: (107.904184, 0.0089), + 109: (108.904982, 0.0), + 110: (109.9030021, 0.1249), + 111: (110.9041781, 0.128), + 112: (111.9027578, 0.2413), + 113: (112.9044017, 0.1222), + 114: (113.9033585, 0.2873), + 115: (114.905431, 0.0), + 116: (115.904756, 0.0749), + 117: (116.907219, 0.0), + 118: (117.906915, 0.0), + 119: (118.90992, 0.0), + 120: (119.90985, 0.0), + 121: (120.91298, 0.0), + 122: (121.91333, 0.0), + 123: (122.917, 0.0), + 124: (123.91765, 0.0), + 125: (124.92125, 0.0), + 126: (125.92235, 0.0), + 127: (126.92644, 0.0), + 128: (127.92776, 
0.0), + 129: (128.93215, 0.0), + 130: (129.9339, 0.0), + 131: (130.94067, 0.0), + 132: (131.94555, 0.0)}, + 'Ce': {0: (139.9054387, 1.0), + 119: (118.95276, 0.0), + 120: (119.94664, 0.0), + 121: (120.94342, 0.0), + 122: (121.93791, 0.0), + 123: (122.9354, 0.0), + 124: (123.93041, 0.0), + 125: (124.92844, 0.0), + 126: (125.92397, 0.0), + 127: (126.92273, 0.0), + 128: (127.91891, 0.0), + 129: (128.9181, 0.0), + 130: (129.91474, 0.0), + 131: (130.91442, 0.0), + 132: (131.91146, 0.0), + 133: (132.911515, 0.0), + 134: (133.908925, 0.0), + 135: (134.909151, 0.0), + 136: (135.907172, 0.00185), + 137: (136.907806, 0.0), + 138: (137.905991, 0.00251), + 139: (138.906653, 0.0), + 140: (139.9054387, 0.8845), + 141: (140.9082763, 0.0), + 142: (141.909244, 0.11114), + 143: (142.912386, 0.0), + 144: (143.913647, 0.0), + 145: (144.91723, 0.0), + 146: (145.91876, 0.0), + 147: (146.92267, 0.0), + 148: (147.92443, 0.0), + 149: (148.9284, 0.0), + 150: (149.93041, 0.0), + 151: (150.93398, 0.0), + 152: (151.93654, 0.0), + 153: (152.94058, 0.0), + 154: (153.94342, 0.0), + 155: (154.94804, 0.0), + 156: (155.95126, 0.0), + 157: (156.95634, 0.0)}, + 'Cf': {0: (251, 1.0), + 237: (237.06207, 0.0), + 238: (238.06141, 0.0), + 239: (239.06242, 0.0), + 240: (240.0623, 0.0), + 241: (241.06373, 0.0), + 242: (242.0637, 0.0), + 243: (243.06543, 0.0), + 244: (244.066001, 0.0), + 245: (245.068049, 0.0), + 246: (246.0688053, 0.0), + 247: (247.071001, 0.0), + 248: (248.072185, 0.0), + 249: (249.0748535, 0.0), + 250: (250.0764061, 0.0), + 251: (251.079587, 0.0), + 252: (252.081626, 0.0), + 253: (253.085133, 0.0), + 254: (254.087323, 0.0), + 255: (255.09105, 0.0), + 256: (256.09344, 0.0)}, + 'Cl': {0: (34.96885268, 1.0), + 28: (28.02851, 0.0), + 29: (29.01411, 0.0), + 30: (30.00477, 0.0), + 31: (30.99241, 0.0), + 32: (31.98569, 0.0), + 33: (32.9774519, 0.0), + 34: (33.97376282, 0.0), + 35: (34.96885268, 0.7576), + 36: (35.96830698, 0.0), + 37: (36.96590259, 0.2424), + 38: (37.96801043, 0.0), + 39: (38.9680082, 0.0), + 40: (39.97042, 0.0), + 41: (40.97068, 0.0), + 42: (41.97325, 0.0), + 43: (42.97405, 0.0), + 44: (43.97828, 0.0), + 45: (44.98029, 0.0), + 46: (45.98421, 0.0), + 47: (46.98871, 0.0), + 48: (47.99495, 0.0), + 49: (49.00032, 0.0), + 50: (50.00784, 0.0), + 51: (51.01449, 0.0)}, + 'Cm': {0: (247, 1.0), + 233: (233.05077, 0.0), + 234: (234.05016, 0.0), + 235: (235.05143, 0.0), + 236: (236.05141, 0.0), + 237: (237.0529, 0.0), + 238: (238.05303, 0.0), + 239: (239.05496, 0.0), + 240: (240.0555295, 0.0), + 241: (241.057653, 0.0), + 242: (242.0588358, 0.0), + 243: (243.0613891, 0.0), + 244: (244.0627526, 0.0), + 245: (245.0654912, 0.0), + 246: (246.0672237, 0.0), + 247: (247.070354, 0.0), + 248: (248.072349, 0.0), + 249: (249.075953, 0.0), + 250: (250.078357, 0.0), + 251: (251.082285, 0.0), + 252: (252.08487, 0.0)}, + 'Cn': {0: (285, 1.0), + 277: (277.16394, 0.0), + 278: (278.16431, 0.0), + 279: (279.16655, 0.0), + 280: (280.16704, 0.0), + 281: (281.16929, 0.0), + 282: (282.16977, 0.0), + 283: (283.17179, 0.0), + 284: (284.17238, 0.0), + 285: (285.17411, 0.0)}, + 'Co': {0: (58.933195, 1.0), + 47: (47.01149, 0.0), + 48: (48.00176, 0.0), + 49: (48.98972, 0.0), + 50: (49.98154, 0.0), + 51: (50.97072, 0.0), + 52: (51.96359, 0.0), + 53: (52.954219, 0.0), + 54: (53.9484596, 0.0), + 55: (54.941999, 0.0), + 56: (55.9398393, 0.0), + 57: (56.9362914, 0.0), + 58: (57.9357528, 0.0), + 59: (58.933195, 1.0), + 60: (59.9338171, 0.0), + 61: (60.9324758, 0.0), + 62: (61.934051, 0.0), + 63: (62.933612, 0.0), + 64: (63.93581, 0.0), + 65: 
(64.936478, 0.0), + 66: (65.93976, 0.0), + 67: (66.94089, 0.0), + 68: (67.94487, 0.0), + 69: (68.94632, 0.0), + 70: (69.951, 0.0), + 71: (70.9529, 0.0), + 72: (71.95781, 0.0), + 73: (72.96024, 0.0), + 74: (73.96538, 0.0), + 75: (74.96833, 0.0)}, + 'Cr': {0: (51.9405075, 1.0), + 42: (42.00643, 0.0), + 43: (42.99771, 0.0), + 44: (43.98555, 0.0), + 45: (44.97964, 0.0), + 46: (45.968359, 0.0), + 47: (46.9629, 0.0), + 48: (47.954032, 0.0), + 49: (48.9513357, 0.0), + 50: (49.9460442, 0.04345), + 51: (50.9447674, 0.0), + 52: (51.9405075, 0.83789), + 53: (52.9406494, 0.09501), + 54: (53.9388804, 0.02365), + 55: (54.9408397, 0.0), + 56: (55.9406531, 0.0), + 57: (56.943613, 0.0), + 58: (57.94435, 0.0), + 59: (58.94859, 0.0), + 60: (59.95008, 0.0), + 61: (60.95472, 0.0), + 62: (61.95661, 0.0), + 63: (62.96186, 0.0), + 64: (63.96441, 0.0), + 65: (64.97016, 0.0), + 66: (65.97338, 0.0), + 67: (66.97955, 0.0)}, + 'Cs': {0: (132.905451933, 1.0), + 112: (111.9503, 0.0), + 113: (112.94449, 0.0), + 114: (113.94145, 0.0), + 115: (114.93591, 0.0), + 116: (115.93337, 0.0), + 117: (116.92867, 0.0), + 118: (117.926559, 0.0), + 119: (118.922377, 0.0), + 120: (119.920677, 0.0), + 121: (120.917229, 0.0), + 122: (121.91611, 0.0), + 123: (122.912996, 0.0), + 124: (123.912258, 0.0), + 125: (124.909728, 0.0), + 126: (125.909452, 0.0), + 127: (126.907418, 0.0), + 128: (127.907749, 0.0), + 129: (128.906064, 0.0), + 130: (129.906709, 0.0), + 131: (130.905464, 0.0), + 132: (131.9064343, 0.0), + 133: (132.905451933, 1.0), + 134: (133.906718475, 0.0), + 135: (134.905977, 0.0), + 136: (135.9073116, 0.0), + 137: (136.9070895, 0.0), + 138: (137.911017, 0.0), + 139: (138.913364, 0.0), + 140: (139.917282, 0.0), + 141: (140.920046, 0.0), + 142: (141.924299, 0.0), + 143: (142.927352, 0.0), + 144: (143.932077, 0.0), + 145: (144.935526, 0.0), + 146: (145.94029, 0.0), + 147: (146.94416, 0.0), + 148: (147.94922, 0.0), + 149: (148.95293, 0.0), + 150: (149.95817, 0.0), + 151: (150.96219, 0.0)}, + 'Cu': {0: (62.9295975, 1.0), + 52: (51.99718, 0.0), + 53: (52.98555, 0.0), + 54: (53.97671, 0.0), + 55: (54.96605, 0.0), + 56: (55.95856, 0.0), + 57: (56.949211, 0.0), + 58: (57.9445385, 0.0), + 59: (58.939498, 0.0), + 60: (59.937365, 0.0), + 61: (60.9334578, 0.0), + 62: (61.932584, 0.0), + 63: (62.9295975, 0.6915), + 64: (63.9297642, 0.0), + 65: (64.9277895, 0.3085), + 66: (65.9288688, 0.0), + 67: (66.9277303, 0.0), + 68: (67.9296109, 0.0), + 69: (68.9294293, 0.0), + 70: (69.9323923, 0.0), + 71: (70.9326768, 0.0), + 72: (71.9358203, 0.0), + 73: (72.936675, 0.0), + 74: (73.939875, 0.0), + 75: (74.9419, 0.0), + 76: (75.945275, 0.0), + 77: (76.94785, 0.0), + 78: (77.95196, 0.0), + 79: (78.95456, 0.0), + 80: (79.96087, 0.0)}, + 'Db': {0: (268, 1.0), + 255: (255.1074, 0.0), + 256: (256.10813, 0.0), + 257: (257.10772, 0.0), + 258: (258.10923, 0.0), + 259: (259.10961, 0.0), + 260: (260.1113, 0.0), + 261: (261.11206, 0.0), + 262: (262.11408, 0.0), + 263: (263.11499, 0.0), + 264: (264.1174, 0.0), + 265: (265.1186, 0.0), + 266: (266.12103, 0.0), + 267: (267.12238, 0.0), + 268: (268.12545, 0.0), + 269: (269.12746, 0.0), + 270: (270.13071, 0.0)}, + 'Ds': {0: (281, 1.0), + 267: (267.14434, 0.0), + 268: (268.1438, 0.0), + 269: (269.14512, 0.0), + 270: (270.14472, 0.0), + 271: (271.14606, 0.0), + 272: (272.14632, 0.0), + 273: (273.14886, 0.0), + 274: (274.14949, 0.0), + 275: (275.15218, 0.0), + 276: (276.15303, 0.0), + 277: (277.15565, 0.0), + 278: (278.15647, 0.0), + 279: (279.15886, 0.0), + 280: (280.1598, 0.0), + 281: (281.16206, 0.0)}, + 'Dy': {0: 
(163.9291748, 1.0), + 138: (137.96249, 0.0), + 139: (138.95954, 0.0), + 140: (139.95401, 0.0), + 141: (140.95135, 0.0), + 142: (141.94637, 0.0), + 143: (142.94383, 0.0), + 144: (143.93925, 0.0), + 145: (144.93743, 0.0), + 146: (145.932845, 0.0), + 147: (146.931092, 0.0), + 148: (147.92715, 0.0), + 149: (148.927305, 0.0), + 150: (149.925585, 0.0), + 151: (150.926185, 0.0), + 152: (151.924718, 0.0), + 153: (152.925765, 0.0), + 154: (153.924424, 0.0), + 155: (154.925754, 0.0), + 156: (155.924283, 0.00056), + 157: (156.925466, 0.0), + 158: (157.924409, 0.00095), + 159: (158.9257392, 0.0), + 160: (159.9251975, 0.02329), + 161: (160.9269334, 0.18889), + 162: (161.9267984, 0.25475), + 163: (162.9287312, 0.24896), + 164: (163.9291748, 0.2826), + 165: (164.9317033, 0.0), + 166: (165.9328067, 0.0), + 167: (166.93566, 0.0), + 168: (167.93713, 0.0), + 169: (168.94031, 0.0), + 170: (169.94239, 0.0), + 171: (170.9462, 0.0), + 172: (171.94876, 0.0), + 173: (172.953, 0.0)}, + 'Er': {0: (165.9302931, 1.0), + 143: (142.96634, 0.0), + 144: (143.96038, 0.0), + 145: (144.95739, 0.0), + 146: (145.952, 0.0), + 147: (146.94949, 0.0), + 148: (147.94455, 0.0), + 149: (148.94231, 0.0), + 150: (149.937914, 0.0), + 151: (150.937449, 0.0), + 152: (151.93505, 0.0), + 153: (152.935063, 0.0), + 154: (153.932783, 0.0), + 155: (154.933209, 0.0), + 156: (155.931065, 0.0), + 157: (156.93192, 0.0), + 158: (157.929893, 0.0), + 159: (158.930684, 0.0), + 160: (159.929083, 0.0), + 161: (160.929995, 0.0), + 162: (161.928778, 0.00139), + 163: (162.930033, 0.0), + 164: (163.9292, 0.01601), + 165: (164.930726, 0.0), + 166: (165.9302931, 0.33503), + 167: (166.9320482, 0.22869), + 168: (167.9323702, 0.26978), + 169: (168.9345904, 0.0), + 170: (169.9354643, 0.1491), + 171: (170.9380298, 0.0), + 172: (171.939356, 0.0), + 173: (172.9424, 0.0), + 174: (173.94423, 0.0), + 175: (174.94777, 0.0), + 176: (175.95008, 0.0), + 177: (176.95405, 0.0)}, + 'Es': {0: (252, 1.0), + 240: (240.06892, 0.0), + 241: (241.06854, 0.0), + 242: (242.06975, 0.0), + 243: (243.06955, 0.0), + 244: (244.07088, 0.0), + 245: (245.07132, 0.0), + 246: (246.0729, 0.0), + 247: (247.07366, 0.0), + 248: (248.07547, 0.0), + 249: (249.07641, 0.0), + 250: (250.07861, 0.0), + 251: (251.079992, 0.0), + 252: (252.08298, 0.0), + 253: (253.0848247, 0.0), + 254: (254.088022, 0.0), + 255: (255.090273, 0.0), + 256: (256.0936, 0.0), + 257: (257.09598, 0.0), + 258: (258.09952, 0.0)}, + 'Eu': {0: (152.9212303, 1.0), + 130: (129.96357, 0.0), + 131: (130.95775, 0.0), + 132: (131.95437, 0.0), + 133: (132.94924, 0.0), + 134: (133.94651, 0.0), + 135: (134.94182, 0.0), + 136: (135.9396, 0.0), + 137: (136.93557, 0.0), + 138: (137.93371, 0.0), + 139: (138.929792, 0.0), + 140: (139.92809, 0.0), + 141: (140.924931, 0.0), + 142: (141.92343, 0.0), + 143: (142.920298, 0.0), + 144: (143.918817, 0.0), + 145: (144.916265, 0.0), + 146: (145.917206, 0.0), + 147: (146.916746, 0.0), + 148: (147.918086, 0.0), + 149: (148.917931, 0.0), + 150: (149.919702, 0.0), + 151: (150.9198502, 0.4781), + 152: (151.9217445, 0.0), + 153: (152.9212303, 0.5219), + 154: (153.9229792, 0.0), + 155: (154.9228933, 0.0), + 156: (155.924752, 0.0), + 157: (156.925424, 0.0), + 158: (157.92785, 0.0), + 159: (158.929089, 0.0), + 160: (159.93197, 0.0), + 161: (160.93368, 0.0), + 162: (161.93704, 0.0), + 163: (162.93921, 0.0), + 164: (163.94299, 0.0), + 165: (164.94572, 0.0), + 166: (165.94997, 0.0), + 167: (166.95321, 0.0)}, + 'F': {0: (18.99840322, 1.0), + 14: (14.03506, 0.0), + 15: (15.01801, 0.0), + 16: (16.011466, 0.0), + 17: 
(17.00209524, 0.0), + 18: (18.000938, 0.0), + 19: (18.99840322, 1.0), + 20: (19.99998132, 0.0), + 21: (20.999949, 0.0), + 22: (22.002999, 0.0), + 23: (23.00357, 0.0), + 24: (24.00812, 0.0), + 25: (25.0121, 0.0), + 26: (26.01962, 0.0), + 27: (27.02676, 0.0), + 28: (28.03567, 0.0), + 29: (29.04326, 0.0), + 30: (30.0525, 0.0), + 31: (31.06043, 0.0)}, + 'Fe': {0: (55.9349375, 1.0), + 45: (45.01458, 0.0), + 46: (46.00081, 0.0), + 47: (46.99289, 0.0), + 48: (47.9805, 0.0), + 49: (48.97361, 0.0), + 50: (49.96299, 0.0), + 51: (50.95682, 0.0), + 52: (51.948114, 0.0), + 53: (52.9453079, 0.0), + 54: (53.9396105, 0.05845), + 55: (54.9382934, 0.0), + 56: (55.9349375, 0.91754), + 57: (56.935394, 0.02119), + 58: (57.9332756, 0.00282), + 59: (58.9348755, 0.0), + 60: (59.934072, 0.0), + 61: (60.936745, 0.0), + 62: (61.936767, 0.0), + 63: (62.94037, 0.0), + 64: (63.9412, 0.0), + 65: (64.94538, 0.0), + 66: (65.94678, 0.0), + 67: (66.95095, 0.0), + 68: (67.9537, 0.0), + 69: (68.95878, 0.0), + 70: (69.96146, 0.0), + 71: (70.96672, 0.0), + 72: (71.96962, 0.0)}, + 'Fm': {0: (257, 1.0), + 242: (242.07343, 0.0), + 243: (243.07435, 0.0), + 244: (244.07408, 0.0), + 245: (245.07539, 0.0), + 246: (246.0753, 0.0), + 247: (247.07685, 0.0), + 248: (248.077195, 0.0), + 249: (249.07903, 0.0), + 250: (250.079521, 0.0), + 251: (251.081575, 0.0), + 252: (252.082467, 0.0), + 253: (253.085185, 0.0), + 254: (254.0868542, 0.0), + 255: (255.089962, 0.0), + 256: (256.091773, 0.0), + 257: (257.095105, 0.0), + 258: (258.09708, 0.0), + 259: (259.1006, 0.0), + 260: (260.10268, 0.0)}, + 'Fr': {0: (223, 1.0), + 199: (199.00726, 0.0), + 200: (200.00657, 0.0), + 201: (201.00386, 0.0), + 202: (202.00337, 0.0), + 203: (203.000925, 0.0), + 204: (204.000653, 0.0), + 205: (204.998594, 0.0), + 206: (205.99867, 0.0), + 207: (206.99695, 0.0), + 208: (207.99714, 0.0), + 209: (208.995954, 0.0), + 210: (209.996408, 0.0), + 211: (210.995537, 0.0), + 212: (211.996202, 0.0), + 213: (212.996189, 0.0), + 214: (213.998971, 0.0), + 215: (215.000341, 0.0), + 216: (216.003198, 0.0), + 217: (217.004632, 0.0), + 218: (218.007578, 0.0), + 219: (219.009252, 0.0), + 220: (220.012327, 0.0), + 221: (221.014255, 0.0), + 222: (222.017552, 0.0), + 223: (223.0197359, 0.0), + 224: (224.02325, 0.0), + 225: (225.02557, 0.0), + 226: (226.02939, 0.0), + 227: (227.03184, 0.0), + 228: (228.03573, 0.0), + 229: (229.03845, 0.0), + 230: (230.04251, 0.0), + 231: (231.04544, 0.0), + 232: (232.04977, 0.0)}, + 'Ga': {0: (68.9255736, 1.0), + 56: (55.99491, 0.0), + 57: (56.98293, 0.0), + 58: (57.97425, 0.0), + 59: (58.96337, 0.0), + 60: (59.95706, 0.0), + 61: (60.94945, 0.0), + 62: (61.944175, 0.0), + 63: (62.9392942, 0.0), + 64: (63.9368387, 0.0), + 65: (64.9327348, 0.0), + 66: (65.931589, 0.0), + 67: (66.9282017, 0.0), + 68: (67.9279801, 0.0), + 69: (68.9255736, 0.60108), + 70: (69.926022, 0.0), + 71: (70.9247013, 0.39892), + 72: (71.9263663, 0.0), + 73: (72.9251747, 0.0), + 74: (73.926946, 0.0), + 75: (74.9265002, 0.0), + 76: (75.9288276, 0.0), + 77: (76.9291543, 0.0), + 78: (77.9316082, 0.0), + 79: (78.93289, 0.0), + 80: (79.93652, 0.0), + 81: (80.93775, 0.0), + 82: (81.94299, 0.0), + 83: (82.94698, 0.0), + 84: (83.95265, 0.0), + 85: (84.957, 0.0), + 86: (85.96312, 0.0)}, + 'Gd': {0: (157.9241039, 1.0), + 134: (133.95537, 0.0), + 135: (134.95257, 0.0), + 136: (135.94734, 0.0), + 137: (136.94502, 0.0), + 138: (137.94012, 0.0), + 139: (138.93824, 0.0), + 140: (139.93367, 0.0), + 141: (140.932126, 0.0), + 142: (141.92812, 0.0), + 143: (142.92675, 0.0), + 144: (143.92296, 0.0), + 145: 
(144.921709, 0.0), + 146: (145.918311, 0.0), + 147: (146.919094, 0.0), + 148: (147.918115, 0.0), + 149: (148.919341, 0.0), + 150: (149.918659, 0.0), + 151: (150.920348, 0.0), + 152: (151.919791, 0.002), + 153: (152.9217495, 0.0), + 154: (153.9208656, 0.0218), + 155: (154.922622, 0.148), + 156: (155.9221227, 0.2047), + 157: (156.9239601, 0.1565), + 158: (157.9241039, 0.2484), + 159: (158.9263887, 0.0), + 160: (159.9270541, 0.2186), + 161: (160.9296692, 0.0), + 162: (161.930985, 0.0), + 163: (162.93399, 0.0), + 164: (163.93586, 0.0), + 165: (164.93938, 0.0), + 166: (165.9416, 0.0), + 167: (166.94557, 0.0), + 168: (167.94836, 0.0), + 169: (168.95287, 0.0)}, + 'Ge': {0: (73.9211778, 1.0), + 58: (57.99101, 0.0), + 59: (58.98175, 0.0), + 60: (59.97019, 0.0), + 61: (60.96379, 0.0), + 62: (61.95465, 0.0), + 63: (62.94964, 0.0), + 64: (63.94165, 0.0), + 65: (64.93944, 0.0), + 66: (65.93384, 0.0), + 67: (66.932734, 0.0), + 68: (67.928094, 0.0), + 69: (68.9279645, 0.0), + 70: (69.9242474, 0.2038), + 71: (70.924951, 0.0), + 72: (71.9220758, 0.2731), + 73: (72.9234589, 0.0776), + 74: (73.9211778, 0.3672), + 75: (74.9228589, 0.0), + 76: (75.9214026, 0.0783), + 77: (76.9235486, 0.0), + 78: (77.922853, 0.0), + 79: (78.9254, 0.0), + 80: (79.92537, 0.0), + 81: (80.92882, 0.0), + 82: (81.92955, 0.0), + 83: (82.93462, 0.0), + 84: (83.93747, 0.0), + 85: (84.94303, 0.0), + 86: (85.94649, 0.0), + 87: (86.95251, 0.0), + 88: (87.95691, 0.0), + 89: (88.96383, 0.0)}, + 'H': {0: (1.00782503207, 1.0), + 1: (1.00782503207, 0.999885), + 2: (2.0141017778, 0.000115), + 3: (3.0160492777, 0.0), + 4: (4.02781, 0.0), + 5: (5.03531, 0.0), + 6: (6.04494, 0.0), + 7: (7.05275, 0.0)}, + 'H+': {0: (1.00727646677, 1.0), 1: (1.00727646677, 1.0)}, + 'He': {0: (4.00260325415, 1.0), + 3: (3.0160293191, 1.34e-06), + 4: (4.00260325415, 0.99999866), + 5: (5.01222, 0.0), + 6: (6.0188891, 0.0), + 7: (7.028021, 0.0), + 8: (8.033922, 0.0), + 9: (9.04395, 0.0), + 10: (10.0524, 0.0)}, + 'Hf': {0: (179.94655, 1.0), + 153: (152.97069, 0.0), + 154: (153.96486, 0.0), + 155: (154.96339, 0.0), + 156: (155.95936, 0.0), + 157: (156.9584, 0.0), + 158: (157.954799, 0.0), + 159: (158.953995, 0.0), + 160: (159.950684, 0.0), + 161: (160.950275, 0.0), + 162: (161.94721, 0.0), + 163: (162.94709, 0.0), + 164: (163.944367, 0.0), + 165: (164.94457, 0.0), + 166: (165.94218, 0.0), + 167: (166.9426, 0.0), + 168: (167.94057, 0.0), + 169: (168.94126, 0.0), + 170: (169.93961, 0.0), + 171: (170.94049, 0.0), + 172: (171.939448, 0.0), + 173: (172.94051, 0.0), + 174: (173.940046, 0.0016), + 175: (174.941509, 0.0), + 176: (175.9414086, 0.0526), + 177: (176.9432207, 0.186), + 178: (177.9436988, 0.2728), + 179: (178.9458161, 0.1362), + 180: (179.94655, 0.3508), + 181: (180.9491012, 0.0), + 182: (181.950554, 0.0), + 183: (182.95353, 0.0), + 184: (183.95545, 0.0), + 185: (184.95882, 0.0), + 186: (185.96089, 0.0), + 187: (186.96459, 0.0), + 188: (187.96685, 0.0)}, + 'Hg': {0: (201.970643, 1.0), + 171: (171.00376, 0.0), + 172: (171.99883, 0.0), + 173: (172.99724, 0.0), + 174: (173.992864, 0.0), + 175: (174.99142, 0.0), + 176: (175.987355, 0.0), + 177: (176.98628, 0.0), + 178: (177.982483, 0.0), + 179: (178.981834, 0.0), + 180: (179.978266, 0.0), + 181: (180.977819, 0.0), + 182: (181.97469, 0.0), + 183: (182.97445, 0.0), + 184: (183.971713, 0.0), + 185: (184.971899, 0.0), + 186: (185.969362, 0.0), + 187: (186.969814, 0.0), + 188: (187.967577, 0.0), + 189: (188.96819, 0.0), + 190: (189.966322, 0.0), + 191: (190.967157, 0.0), + 192: (191.965634, 0.0), + 193: (192.966665, 0.0), + 
194: (193.965439, 0.0), + 195: (194.96672, 0.0), + 196: (195.965833, 0.0015), + 197: (196.967213, 0.0), + 198: (197.966769, 0.0997), + 199: (198.9682799, 0.1687), + 200: (199.968326, 0.231), + 201: (200.9703023, 0.1318), + 202: (201.970643, 0.2986), + 203: (202.9728725, 0.0), + 204: (203.9734939, 0.0687), + 205: (204.976073, 0.0), + 206: (205.977514, 0.0), + 207: (206.98259, 0.0), + 208: (207.98594, 0.0), + 209: (208.99104, 0.0), + 210: (209.99451, 0.0)}, + 'Ho': {0: (164.9303221, 1.0), + 140: (139.96854, 0.0), + 141: (140.9631, 0.0), + 142: (141.95977, 0.0), + 143: (142.95461, 0.0), + 144: (143.95148, 0.0), + 145: (144.9472, 0.0), + 146: (145.94464, 0.0), + 147: (146.94006, 0.0), + 148: (147.93772, 0.0), + 149: (148.933775, 0.0), + 150: (149.933496, 0.0), + 151: (150.931688, 0.0), + 152: (151.931714, 0.0), + 153: (152.930199, 0.0), + 154: (153.930602, 0.0), + 155: (154.929103, 0.0), + 156: (155.92984, 0.0), + 157: (156.928256, 0.0), + 158: (157.928941, 0.0), + 159: (158.927712, 0.0), + 160: (159.928729, 0.0), + 161: (160.927855, 0.0), + 162: (161.929096, 0.0), + 163: (162.9287339, 0.0), + 164: (163.9302335, 0.0), + 165: (164.9303221, 1.0), + 166: (165.9322842, 0.0), + 167: (166.933133, 0.0), + 168: (167.93552, 0.0), + 169: (168.936872, 0.0), + 170: (169.93962, 0.0), + 171: (170.94147, 0.0), + 172: (171.94482, 0.0), + 173: (172.94729, 0.0), + 174: (173.95115, 0.0), + 175: (174.95405, 0.0)}, + 'Hs': {0: (270, 1.0), + 263: (263.12856, 0.0), + 264: (264.12839, 0.0), + 265: (265.13009, 0.0), + 266: (266.1301, 0.0), + 267: (267.13179, 0.0), + 268: (268.13216, 0.0), + 269: (269.13406, 0.0), + 270: (270.13465, 0.0), + 271: (271.13766, 0.0), + 272: (272.13905, 0.0), + 273: (273.14199, 0.0), + 274: (274.14313, 0.0), + 275: (275.14595, 0.0), + 276: (276.14721, 0.0), + 277: (277.14984, 0.0)}, + 'I': {0: (126.904473, 1.0), + 108: (107.94348, 0.0), + 109: (108.93815, 0.0), + 110: (109.93524, 0.0), + 111: (110.93028, 0.0), + 112: (111.92797, 0.0), + 113: (112.92364, 0.0), + 114: (113.92185, 0.0), + 115: (114.91805, 0.0), + 116: (115.91681, 0.0), + 117: (116.91365, 0.0), + 118: (117.913074, 0.0), + 119: (118.91007, 0.0), + 120: (119.910048, 0.0), + 121: (120.907367, 0.0), + 122: (121.907589, 0.0), + 123: (122.905589, 0.0), + 124: (123.9062099, 0.0), + 125: (124.9046302, 0.0), + 126: (125.905624, 0.0), + 127: (126.904473, 1.0), + 128: (127.905809, 0.0), + 129: (128.904988, 0.0), + 130: (129.906674, 0.0), + 131: (130.9061246, 0.0), + 132: (131.907997, 0.0), + 133: (132.907797, 0.0), + 134: (133.909744, 0.0), + 135: (134.910048, 0.0), + 136: (135.91465, 0.0), + 137: (136.917871, 0.0), + 138: (137.92235, 0.0), + 139: (138.9261, 0.0), + 140: (139.931, 0.0), + 141: (140.93503, 0.0), + 142: (141.94018, 0.0), + 143: (142.94456, 0.0), + 144: (143.94999, 0.0)}, + 'In': {0: (114.903878, 1.0), + 97: (96.94954, 0.0), + 98: (97.94214, 0.0), + 99: (98.93422, 0.0), + 100: (99.93111, 0.0), + 101: (100.92634, 0.0), + 102: (101.92409, 0.0), + 103: (102.919914, 0.0), + 104: (103.9183, 0.0), + 105: (104.914674, 0.0), + 106: (105.913465, 0.0), + 107: (106.910295, 0.0), + 108: (107.909698, 0.0), + 109: (108.907151, 0.0), + 110: (109.907165, 0.0), + 111: (110.905103, 0.0), + 112: (111.905532, 0.0), + 113: (112.904058, 0.0429), + 114: (113.904914, 0.0), + 115: (114.903878, 0.9571), + 116: (115.90526, 0.0), + 117: (116.904514, 0.0), + 118: (117.906354, 0.0), + 119: (118.905845, 0.0), + 120: (119.90796, 0.0), + 121: (120.907846, 0.0), + 122: (121.91028, 0.0), + 123: (122.910438, 0.0), + 124: (123.91318, 0.0), + 125: (124.9136, 
0.0), + 126: (125.91646, 0.0), + 127: (126.91735, 0.0), + 128: (127.92017, 0.0), + 129: (128.9217, 0.0), + 130: (129.92497, 0.0), + 131: (130.92685, 0.0), + 132: (131.93299, 0.0), + 133: (132.93781, 0.0), + 134: (133.94415, 0.0), + 135: (134.94933, 0.0)}, + 'Ir': {0: (192.9629264, 1.0), + 164: (163.9922, 0.0), + 165: (164.98752, 0.0), + 166: (165.98582, 0.0), + 167: (166.981665, 0.0), + 168: (167.97988, 0.0), + 169: (168.976295, 0.0), + 170: (169.97497, 0.0), + 171: (170.97163, 0.0), + 172: (171.97046, 0.0), + 173: (172.967502, 0.0), + 174: (173.966861, 0.0), + 175: (174.964113, 0.0), + 176: (175.963649, 0.0), + 177: (176.961302, 0.0), + 178: (177.961082, 0.0), + 179: (178.959122, 0.0), + 180: (179.959229, 0.0), + 181: (180.957625, 0.0), + 182: (181.958076, 0.0), + 183: (182.956846, 0.0), + 184: (183.95748, 0.0), + 185: (184.9567, 0.0), + 186: (185.957946, 0.0), + 187: (186.957363, 0.0), + 188: (187.958853, 0.0), + 189: (188.958719, 0.0), + 190: (189.960546, 0.0), + 191: (190.960594, 0.373), + 192: (191.962605, 0.0), + 193: (192.9629264, 0.627), + 194: (193.9650784, 0.0), + 195: (194.9659796, 0.0), + 196: (195.9684, 0.0), + 197: (196.969653, 0.0), + 198: (197.97228, 0.0), + 199: (198.9738, 0.0)}, + 'K': {0: (38.96370668, 1.0), + 32: (32.02192, 0.0), + 33: (33.00726, 0.0), + 34: (33.99841, 0.0), + 35: (34.98801, 0.0), + 36: (35.981292, 0.0), + 37: (36.97337589, 0.0), + 38: (37.9690812, 0.0), + 39: (38.96370668, 0.932581), + 40: (39.96399848, 0.000117), + 41: (40.96182576, 0.067302), + 42: (41.96240281, 0.0), + 43: (42.960716, 0.0), + 44: (43.96156, 0.0), + 45: (44.960699, 0.0), + 46: (45.961977, 0.0), + 47: (46.961678, 0.0), + 48: (47.965514, 0.0), + 49: (48.96745, 0.0), + 50: (49.97278, 0.0), + 51: (50.97638, 0.0), + 52: (51.98261, 0.0), + 53: (52.98712, 0.0), + 54: (53.9942, 0.0), + 55: (54.99971, 0.0)}, + 'Kr': {0: (83.911507, 1.0), + 69: (68.96518, 0.0), + 70: (69.95526, 0.0), + 71: (70.94963, 0.0), + 72: (71.942092, 0.0), + 73: (72.939289, 0.0), + 74: (73.9330844, 0.0), + 75: (74.930946, 0.0), + 76: (75.92591, 0.0), + 77: (76.92467, 0.0), + 78: (77.9203648, 0.00355), + 79: (78.920082, 0.0), + 80: (79.916379, 0.02286), + 81: (80.916592, 0.0), + 82: (81.9134836, 0.11593), + 83: (82.914136, 0.115), + 84: (83.911507, 0.56987), + 85: (84.9125273, 0.0), + 86: (85.91061073, 0.17279), + 87: (86.91335486, 0.0), + 88: (87.914447, 0.0), + 89: (88.91763, 0.0), + 90: (89.919517, 0.0), + 91: (90.92345, 0.0), + 92: (91.926156, 0.0), + 93: (92.93127, 0.0), + 94: (93.93436, 0.0), + 95: (94.93984, 0.0), + 96: (95.94307, 0.0), + 97: (96.94856, 0.0), + 98: (97.95191, 0.0), + 99: (98.9576, 0.0), + 100: (99.96114, 0.0)}, + 'La': {0: (138.9063533, 1.0), + 117: (116.95007, 0.0), + 118: (117.94673, 0.0), + 119: (118.94099, 0.0), + 120: (119.93807, 0.0), + 121: (120.93301, 0.0), + 122: (121.93071, 0.0), + 123: (122.92624, 0.0), + 124: (123.92457, 0.0), + 125: (124.920816, 0.0), + 126: (125.91951, 0.0), + 127: (126.916375, 0.0), + 128: (127.91559, 0.0), + 129: (128.912693, 0.0), + 130: (129.912369, 0.0), + 131: (130.91007, 0.0), + 132: (131.9101, 0.0), + 133: (132.90822, 0.0), + 134: (133.908514, 0.0), + 135: (134.906977, 0.0), + 136: (135.90764, 0.0), + 137: (136.906494, 0.0), + 138: (137.907112, 0.0009), + 139: (138.9063533, 0.9991), + 140: (139.9094776, 0.0), + 141: (140.910962, 0.0), + 142: (141.914079, 0.0), + 143: (142.916063, 0.0), + 144: (143.9196, 0.0), + 145: (144.92165, 0.0), + 146: (145.92579, 0.0), + 147: (146.92824, 0.0), + 148: (147.93223, 0.0), + 149: (148.93473, 0.0), + 150: (149.93877, 0.0), + 
151: (150.94172, 0.0), + 152: (151.94625, 0.0), + 153: (152.94962, 0.0), + 154: (153.9545, 0.0), + 155: (154.95835, 0.0)}, + 'Li': {0: (7.01600455, 1.0), + 3: (3.03078, 0.0), + 4: (4.02719, 0.0), + 5: (5.01254, 0.0), + 6: (6.015122795, 0.0759), + 7: (7.01600455, 0.9241), + 8: (8.02248736, 0.0), + 9: (9.0267895, 0.0), + 10: (10.035481, 0.0), + 11: (11.043798, 0.0), + 12: (12.05378, 0.0)}, + 'Lr': {0: (262, 1.0), + 251: (251.09436, 0.0), + 252: (252.09537, 0.0), + 253: (253.09521, 0.0), + 254: (254.09645, 0.0), + 255: (255.09668, 0.0), + 256: (256.09863, 0.0), + 257: (257.09956, 0.0), + 258: (258.10181, 0.0), + 259: (259.1029, 0.0), + 260: (260.1055, 0.0), + 261: (261.10688, 0.0), + 262: (262.10963, 0.0), + 263: (263.11129, 0.0), + 264: (264.11404, 0.0), + 265: (265.11584, 0.0), + 266: (266.11931, 0.0)}, + 'Lu': {0: (174.9407718, 1.0), + 150: (149.97323, 0.0), + 151: (150.96758, 0.0), + 152: (151.96412, 0.0), + 153: (152.95877, 0.0), + 154: (153.95752, 0.0), + 155: (154.954316, 0.0), + 156: (155.95303, 0.0), + 157: (156.950098, 0.0), + 158: (157.949313, 0.0), + 159: (158.94663, 0.0), + 160: (159.94603, 0.0), + 161: (160.94357, 0.0), + 162: (161.94328, 0.0), + 163: (162.94118, 0.0), + 164: (163.94134, 0.0), + 165: (164.939407, 0.0), + 166: (165.93986, 0.0), + 167: (166.93827, 0.0), + 168: (167.93874, 0.0), + 169: (168.937651, 0.0), + 170: (169.938475, 0.0), + 171: (170.9379131, 0.0), + 172: (171.939086, 0.0), + 173: (172.9389306, 0.0), + 174: (173.9403375, 0.0), + 175: (174.9407718, 0.9741), + 176: (175.9426863, 0.0259), + 177: (176.9437581, 0.0), + 178: (177.945955, 0.0), + 179: (178.947327, 0.0), + 180: (179.94988, 0.0), + 181: (180.95197, 0.0), + 182: (181.95504, 0.0), + 183: (182.95757, 0.0), + 184: (183.96091, 0.0)}, + 'Md': {0: (258, 1.0), + 245: (245.08083, 0.0), + 246: (246.08189, 0.0), + 247: (247.08164, 0.0), + 248: (248.08282, 0.0), + 249: (249.08301, 0.0), + 250: (250.08442, 0.0), + 251: (251.08484, 0.0), + 252: (252.08656, 0.0), + 253: (253.08728, 0.0), + 254: (254.08966, 0.0), + 255: (255.091083, 0.0), + 256: (256.09406, 0.0), + 257: (257.095541, 0.0), + 258: (258.098431, 0.0), + 259: (259.10051, 0.0), + 260: (260.10365, 0.0), + 261: (261.10572, 0.0), + 262: (262.10887, 0.0)}, + 'Mg': {0: (23.9850417, 1.0), + 19: (19.03547, 0.0), + 20: (20.018863, 0.0), + 21: (21.011713, 0.0), + 22: (21.9995738, 0.0), + 23: (22.9941237, 0.0), + 24: (23.9850417, 0.7899), + 25: (24.98583692, 0.1), + 26: (25.982592929, 0.1101), + 27: (26.98434059, 0.0), + 28: (27.9838768, 0.0), + 29: (28.9886, 0.0), + 30: (29.990434, 0.0), + 31: (30.996546, 0.0), + 32: (31.998975, 0.0), + 33: (33.005254, 0.0), + 34: (34.00946, 0.0), + 35: (35.01734, 0.0), + 36: (36.023, 0.0), + 37: (37.0314, 0.0), + 38: (38.03757, 0.0), + 39: (39.04677, 0.0), + 40: (40.05393, 0.0)}, + 'Mn': {0: (54.9380451, 1.0), + 44: (44.00687, 0.0), + 45: (44.99451, 0.0), + 46: (45.98672, 0.0), + 47: (46.9761, 0.0), + 48: (47.96852, 0.0), + 49: (48.959618, 0.0), + 50: (49.9542382, 0.0), + 51: (50.9482108, 0.0), + 52: (51.9455655, 0.0), + 53: (52.9412901, 0.0), + 54: (53.9403589, 0.0), + 55: (54.9380451, 1.0), + 56: (55.9389049, 0.0), + 57: (56.9382854, 0.0), + 58: (57.93998, 0.0), + 59: (58.94044, 0.0), + 60: (59.94291, 0.0), + 61: (60.94465, 0.0), + 62: (61.94843, 0.0), + 63: (62.95024, 0.0), + 64: (63.95425, 0.0), + 65: (64.95634, 0.0), + 66: (65.96108, 0.0), + 67: (66.96414, 0.0), + 68: (67.9693, 0.0), + 69: (68.97284, 0.0)}, + 'Mo': {0: (97.9054082, 1.0), + 83: (82.94874, 0.0), + 84: (83.94009, 0.0), + 85: (84.93655, 0.0), + 86: (85.9307, 
0.0), + 87: (86.92733, 0.0), + 88: (87.921953, 0.0), + 89: (88.91948, 0.0), + 90: (89.913937, 0.0), + 91: (90.91175, 0.0), + 92: (91.906811, 0.1477), + 93: (92.906813, 0.0), + 94: (93.9050883, 0.0923), + 95: (94.9058421, 0.159), + 96: (95.9046795, 0.1668), + 97: (96.9060215, 0.0956), + 98: (97.9054082, 0.2419), + 99: (98.9077119, 0.0), + 100: (99.907477, 0.0967), + 101: (100.910347, 0.0), + 102: (101.910297, 0.0), + 103: (102.91321, 0.0), + 104: (103.91376, 0.0), + 105: (104.91697, 0.0), + 106: (105.918137, 0.0), + 107: (106.92169, 0.0), + 108: (107.92345, 0.0), + 109: (108.92781, 0.0), + 110: (109.92973, 0.0), + 111: (110.93441, 0.0), + 112: (111.93684, 0.0), + 113: (112.94188, 0.0), + 114: (113.94492, 0.0), + 115: (114.95029, 0.0)}, + 'Mt': {0: (276, 1.0), + 265: (265.13615, 0.0), + 266: (266.1373, 0.0), + 267: (267.13731, 0.0), + 268: (268.13873, 0.0), + 269: (269.13906, 0.0), + 270: (270.14066, 0.0), + 271: (271.14114, 0.0), + 272: (272.14374, 0.0), + 273: (273.14491, 0.0), + 274: (274.14749, 0.0), + 275: (275.14865, 0.0), + 276: (276.15116, 0.0), + 277: (277.15242, 0.0), + 278: (278.15481, 0.0), + 279: (279.15619, 0.0)}, + 'N': {0: (14.0030740048, 1.0), + 10: (10.04165, 0.0), + 11: (11.02609, 0.0), + 12: (12.0186132, 0.0), + 13: (13.00573861, 0.0), + 14: (14.0030740048, 0.99636), + 15: (15.0001088982, 0.00364), + 16: (16.0061017, 0.0), + 17: (17.00845, 0.0), + 18: (18.014079, 0.0), + 19: (19.017029, 0.0), + 20: (20.02337, 0.0), + 21: (21.02711, 0.0), + 22: (22.03439, 0.0), + 23: (23.04122, 0.0), + 24: (24.05104, 0.0), + 25: (25.06066, 0.0)}, + 'Na': {0: (22.9897692809, 1.0), + 18: (18.02597, 0.0), + 19: (19.013877, 0.0), + 20: (20.007351, 0.0), + 21: (20.9976552, 0.0), + 22: (21.9944364, 0.0), + 23: (22.9897692809, 1.0), + 24: (23.99096278, 0.0), + 25: (24.989954, 0.0), + 26: (25.992633, 0.0), + 27: (26.994077, 0.0), + 28: (27.998938, 0.0), + 29: (29.002861, 0.0), + 30: (30.008976, 0.0), + 31: (31.01359, 0.0), + 32: (32.02047, 0.0), + 33: (33.02672, 0.0), + 34: (34.03517, 0.0), + 35: (35.04249, 0.0), + 36: (36.05148, 0.0), + 37: (37.05934, 0.0)}, + 'Nb': {0: (92.9063781, 1.0), + 81: (80.94903, 0.0), + 82: (81.94313, 0.0), + 83: (82.93671, 0.0), + 84: (83.93357, 0.0), + 85: (84.92791, 0.0), + 86: (85.92504, 0.0), + 87: (86.92036, 0.0), + 88: (87.91833, 0.0), + 89: (88.913418, 0.0), + 90: (89.911265, 0.0), + 91: (90.906996, 0.0), + 92: (91.907194, 0.0), + 93: (92.9063781, 1.0), + 94: (93.9072839, 0.0), + 95: (94.9068358, 0.0), + 96: (95.908101, 0.0), + 97: (96.9080986, 0.0), + 98: (97.910328, 0.0), + 99: (98.911618, 0.0), + 100: (99.914182, 0.0), + 101: (100.915252, 0.0), + 102: (101.91804, 0.0), + 103: (102.91914, 0.0), + 104: (103.92246, 0.0), + 105: (104.92394, 0.0), + 106: (105.92797, 0.0), + 107: (106.93031, 0.0), + 108: (107.93484, 0.0), + 109: (108.93763, 0.0), + 110: (109.94244, 0.0), + 111: (110.94565, 0.0), + 112: (111.95083, 0.0), + 113: (112.9547, 0.0)}, + 'Nd': {0: (141.9077233, 1.0), + 124: (123.95223, 0.0), + 125: (124.94888, 0.0), + 126: (125.94322, 0.0), + 127: (126.9405, 0.0), + 128: (127.93539, 0.0), + 129: (128.93319, 0.0), + 130: (129.92851, 0.0), + 131: (130.92725, 0.0), + 132: (131.923321, 0.0), + 133: (132.92235, 0.0), + 134: (133.91879, 0.0), + 135: (134.918181, 0.0), + 136: (135.914976, 0.0), + 137: (136.914567, 0.0), + 138: (137.91195, 0.0), + 139: (138.911978, 0.0), + 140: (139.90955, 0.0), + 141: (140.90961, 0.0), + 142: (141.9077233, 0.272), + 143: (142.9098143, 0.122), + 144: (143.9100873, 0.238), + 145: (144.9125736, 0.083), + 146: (145.9131169, 0.172), + 
147: (146.9161004, 0.0), + 148: (147.916893, 0.057), + 149: (148.920149, 0.0), + 150: (149.920891, 0.056), + 151: (150.923829, 0.0), + 152: (151.924682, 0.0), + 153: (152.927698, 0.0), + 154: (153.92948, 0.0), + 155: (154.93293, 0.0), + 156: (155.93502, 0.0), + 157: (156.93903, 0.0), + 158: (157.9416, 0.0), + 159: (158.94609, 0.0), + 160: (159.94909, 0.0), + 161: (160.95388, 0.0)}, + 'Ne': {0: (19.9924401754, 1.0), + 16: (16.025761, 0.0), + 17: (17.017672, 0.0), + 18: (18.0057082, 0.0), + 19: (19.0018802, 0.0), + 20: (19.9924401754, 0.9048), + 21: (20.99384668, 0.0027), + 22: (21.991385114, 0.0925), + 23: (22.9944669, 0.0), + 24: (23.9936108, 0.0), + 25: (24.997737, 0.0), + 26: (26.000461, 0.0), + 27: (27.00759, 0.0), + 28: (28.01207, 0.0), + 29: (29.01939, 0.0), + 30: (30.0248, 0.0), + 31: (31.03311, 0.0), + 32: (32.04002, 0.0), + 33: (33.04938, 0.0), + 34: (34.05703, 0.0)}, + 'Ni': {0: (57.9353429, 1.0), + 48: (48.01975, 0.0), + 49: (49.00966, 0.0), + 50: (49.99593, 0.0), + 51: (50.98772, 0.0), + 52: (51.97568, 0.0), + 53: (52.96847, 0.0), + 54: (53.95791, 0.0), + 55: (54.95133, 0.0), + 56: (55.942132, 0.0), + 57: (56.9397935, 0.0), + 58: (57.9353429, 0.680769), + 59: (58.9343467, 0.0), + 60: (59.9307864, 0.262231), + 61: (60.931056, 0.011399), + 62: (61.9283451, 0.036345), + 63: (62.9296694, 0.0), + 64: (63.927966, 0.009256), + 65: (64.9300843, 0.0), + 66: (65.9291393, 0.0), + 67: (66.931569, 0.0), + 68: (67.931869, 0.0), + 69: (68.93561, 0.0), + 70: (69.9365, 0.0), + 71: (70.94074, 0.0), + 72: (71.94209, 0.0), + 73: (72.94647, 0.0), + 74: (73.94807, 0.0), + 75: (74.95287, 0.0), + 76: (75.95533, 0.0), + 77: (76.96055, 0.0), + 78: (77.96318, 0.0)}, + 'No': {0: (259, 1.0), + 248: (248.0866, 0.0), + 249: (249.08783, 0.0), + 250: (250.08751, 0.0), + 251: (251.08901, 0.0), + 252: (252.088977, 0.0), + 253: (253.09068, 0.0), + 254: (254.090955, 0.0), + 255: (255.093241, 0.0), + 256: (256.094283, 0.0), + 257: (257.096877, 0.0), + 258: (258.09821, 0.0), + 259: (259.10103, 0.0), + 260: (260.10264, 0.0), + 261: (261.10575, 0.0), + 262: (262.1073, 0.0), + 263: (263.11055, 0.0), + 264: (264.11235, 0.0)}, + 'Np': {0: (237, 1.0), + 225: (225.03391, 0.0), + 226: (226.03515, 0.0), + 227: (227.03496, 0.0), + 228: (228.03618, 0.0), + 229: (229.03626, 0.0), + 230: (230.03783, 0.0), + 231: (231.03825, 0.0), + 232: (232.04011, 0.0), + 233: (233.04074, 0.0), + 234: (234.042895, 0.0), + 235: (235.0440633, 0.0), + 236: (236.04657, 0.0), + 237: (237.0481734, 0.0), + 238: (238.0509464, 0.0), + 239: (239.052939, 0.0), + 240: (240.056162, 0.0), + 241: (241.05825, 0.0), + 242: (242.06164, 0.0), + 243: (243.06428, 0.0), + 244: (244.06785, 0.0)}, + 'O': {0: (15.99491461956, 1.0), + 12: (12.034405, 0.0), + 13: (13.024812, 0.0), + 14: (14.00859625, 0.0), + 15: (15.0030656, 0.0), + 16: (15.99491461956, 0.99757), + 17: (16.9991317, 0.00038), + 18: (17.999161, 0.00205), + 19: (19.00358, 0.0), + 20: (20.0040767, 0.0), + 21: (21.008656, 0.0), + 22: (22.00997, 0.0), + 23: (23.01569, 0.0), + 24: (24.02047, 0.0), + 25: (25.02946, 0.0), + 26: (26.03834, 0.0), + 27: (27.04826, 0.0), + 28: (28.05781, 0.0)}, + 'Os': {0: (191.9614807, 1.0), + 162: (161.98443, 0.0), + 163: (162.98269, 0.0), + 164: (163.97804, 0.0), + 165: (164.97676, 0.0), + 166: (165.972691, 0.0), + 167: (166.97155, 0.0), + 168: (167.967804, 0.0), + 169: (168.967019, 0.0), + 170: (169.963577, 0.0), + 171: (170.963185, 0.0), + 172: (171.960023, 0.0), + 173: (172.959808, 0.0), + 174: (173.957062, 0.0), + 175: (174.956946, 0.0), + 176: (175.95481, 0.0), + 177: 
(176.954965, 0.0), + 178: (177.953251, 0.0), + 179: (178.953816, 0.0), + 180: (179.952379, 0.0), + 181: (180.95324, 0.0), + 182: (181.95211, 0.0), + 183: (182.95313, 0.0), + 184: (183.9524891, 0.0002), + 185: (184.9540423, 0.0), + 186: (185.9538382, 0.0159), + 187: (186.9557505, 0.0196), + 188: (187.9558382, 0.1324), + 189: (188.9581475, 0.1615), + 190: (189.958447, 0.2626), + 191: (190.9609297, 0.0), + 192: (191.9614807, 0.4078), + 193: (192.9641516, 0.0), + 194: (193.9651821, 0.0), + 195: (194.96813, 0.0), + 196: (195.96964, 0.0)}, + 'P': {0: (30.97376163, 1.0), + 24: (24.03435, 0.0), + 25: (25.02026, 0.0), + 26: (26.01178, 0.0), + 27: (26.99923, 0.0), + 28: (27.992315, 0.0), + 29: (28.9818006, 0.0), + 30: (29.9783138, 0.0), + 31: (30.97376163, 1.0), + 32: (31.97390727, 0.0), + 33: (32.9717255, 0.0), + 34: (33.973636, 0.0), + 35: (34.9733141, 0.0), + 36: (35.97826, 0.0), + 37: (36.97961, 0.0), + 38: (37.98416, 0.0), + 39: (38.98618, 0.0), + 40: (39.9913, 0.0), + 41: (40.99434, 0.0), + 42: (42.00101, 0.0), + 43: (43.00619, 0.0), + 44: (44.01299, 0.0), + 45: (45.01922, 0.0), + 46: (46.02738, 0.0)}, + 'Pa': {0: (231.035884, 1.0), + 212: (212.0232, 0.0), + 213: (213.02111, 0.0), + 214: (214.02092, 0.0), + 215: (215.01919, 0.0), + 216: (216.01911, 0.0), + 217: (217.01832, 0.0), + 218: (218.020042, 0.0), + 219: (219.01988, 0.0), + 220: (220.02188, 0.0), + 221: (221.02188, 0.0), + 222: (222.02374, 0.0), + 223: (223.02396, 0.0), + 224: (224.025626, 0.0), + 225: (225.02613, 0.0), + 226: (226.027948, 0.0), + 227: (227.028805, 0.0), + 228: (228.031051, 0.0), + 229: (229.0320968, 0.0), + 230: (230.034541, 0.0), + 231: (231.035884, 1.0), + 232: (232.038592, 0.0), + 233: (233.0402473, 0.0), + 234: (234.043308, 0.0), + 235: (235.04544, 0.0), + 236: (236.04868, 0.0), + 237: (237.05115, 0.0), + 238: (238.0545, 0.0), + 239: (239.05726, 0.0), + 240: (240.06098, 0.0)}, + 'Pb': {0: (207.9766521, 1.0), + 178: (178.00383, 0.0), + 179: (179.00215, 0.0), + 180: (179.997918, 0.0), + 181: (180.99662, 0.0), + 182: (181.992672, 0.0), + 183: (182.99187, 0.0), + 184: (183.988142, 0.0), + 185: (184.98761, 0.0), + 186: (185.984239, 0.0), + 187: (186.983918, 0.0), + 188: (187.980874, 0.0), + 189: (188.98081, 0.0), + 190: (189.978082, 0.0), + 191: (190.97827, 0.0), + 192: (191.975785, 0.0), + 193: (192.97617, 0.0), + 194: (193.974012, 0.0), + 195: (194.974542, 0.0), + 196: (195.972774, 0.0), + 197: (196.973431, 0.0), + 198: (197.972034, 0.0), + 199: (198.972917, 0.0), + 200: (199.971827, 0.0), + 201: (200.972885, 0.0), + 202: (201.972159, 0.0), + 203: (202.973391, 0.0), + 204: (203.9730436, 0.014), + 205: (204.9744818, 0.0), + 206: (205.9744653, 0.241), + 207: (206.9758969, 0.221), + 208: (207.9766521, 0.524), + 209: (208.9810901, 0.0), + 210: (209.9841885, 0.0), + 211: (210.988737, 0.0), + 212: (211.9918975, 0.0), + 213: (212.996581, 0.0), + 214: (213.9998054, 0.0), + 215: (215.00481, 0.0)}, + 'Pd': {0: (105.903486, 1.0), + 91: (90.94911, 0.0), + 92: (91.94042, 0.0), + 93: (92.93591, 0.0), + 94: (93.92877, 0.0), + 95: (94.92469, 0.0), + 96: (95.91816, 0.0), + 97: (96.91648, 0.0), + 98: (97.912721, 0.0), + 99: (98.911768, 0.0), + 100: (99.908506, 0.0), + 101: (100.908289, 0.0), + 102: (101.905609, 0.0102), + 103: (102.906087, 0.0), + 104: (103.904036, 0.1114), + 105: (104.905085, 0.2233), + 106: (105.903486, 0.2733), + 107: (106.905133, 0.0), + 108: (107.903892, 0.2646), + 109: (108.90595, 0.0), + 110: (109.905153, 0.1172), + 111: (110.907671, 0.0), + 112: (111.907314, 0.0), + 113: (112.91015, 0.0), + 114: (113.910363, 
0.0), + 115: (114.91368, 0.0), + 116: (115.91416, 0.0), + 117: (116.91784, 0.0), + 118: (117.91898, 0.0), + 119: (118.92311, 0.0), + 120: (119.92469, 0.0), + 121: (120.92887, 0.0), + 122: (121.93055, 0.0), + 123: (122.93493, 0.0), + 124: (123.93688, 0.0)}, + 'Pm': {0: (145, 1.0), + 126: (125.95752, 0.0), + 127: (126.95163, 0.0), + 128: (127.94842, 0.0), + 129: (128.94316, 0.0), + 130: (129.94045, 0.0), + 131: (130.93587, 0.0), + 132: (131.93375, 0.0), + 133: (132.92978, 0.0), + 134: (133.92835, 0.0), + 135: (134.92488, 0.0), + 136: (135.92357, 0.0), + 137: (136.920479, 0.0), + 138: (137.919548, 0.0), + 139: (138.916804, 0.0), + 140: (139.91604, 0.0), + 141: (140.913555, 0.0), + 142: (141.912874, 0.0), + 143: (142.910933, 0.0), + 144: (143.912591, 0.0), + 145: (144.912749, 0.0), + 146: (145.914696, 0.0), + 147: (146.9151385, 0.0), + 148: (147.917475, 0.0), + 149: (148.918334, 0.0), + 150: (149.920984, 0.0), + 151: (150.921207, 0.0), + 152: (151.923497, 0.0), + 153: (152.924117, 0.0), + 154: (153.92646, 0.0), + 155: (154.9281, 0.0), + 156: (155.93106, 0.0), + 157: (156.93304, 0.0), + 158: (157.93656, 0.0), + 159: (158.93897, 0.0), + 160: (159.94299, 0.0), + 161: (160.94586, 0.0), + 162: (161.95029, 0.0), + 163: (162.95368, 0.0)}, + 'Po': {0: (209, 1.0), + 188: (187.999422, 0.0), + 189: (188.998481, 0.0), + 190: (189.995101, 0.0), + 191: (190.994574, 0.0), + 192: (191.991335, 0.0), + 193: (192.99103, 0.0), + 194: (193.988186, 0.0), + 195: (194.98811, 0.0), + 196: (195.985535, 0.0), + 197: (196.98566, 0.0), + 198: (197.983389, 0.0), + 199: (198.983666, 0.0), + 200: (199.981799, 0.0), + 201: (200.98226, 0.0), + 202: (201.980758, 0.0), + 203: (202.98142, 0.0), + 204: (203.980318, 0.0), + 205: (204.981203, 0.0), + 206: (205.980481, 0.0), + 207: (206.981593, 0.0), + 208: (207.9812457, 0.0), + 209: (208.9824304, 0.0), + 210: (209.9828737, 0.0), + 211: (210.9866532, 0.0), + 212: (211.988868, 0.0), + 213: (212.992857, 0.0), + 214: (213.9952014, 0.0), + 215: (214.99942, 0.0), + 216: (216.001915, 0.0), + 217: (217.006335, 0.0), + 218: (218.008973, 0.0), + 219: (219.01374, 0.0), + 220: (220.0166, 0.0)}, + 'Pr': {0: (140.9076528, 1.0), + 121: (120.95536, 0.0), + 122: (121.95181, 0.0), + 123: (122.94596, 0.0), + 124: (123.94296, 0.0), + 125: (124.93783, 0.0), + 126: (125.93531, 0.0), + 127: (126.93083, 0.0), + 128: (127.92879, 0.0), + 129: (128.9251, 0.0), + 130: (129.92359, 0.0), + 131: (130.92026, 0.0), + 132: (131.91926, 0.0), + 133: (132.916331, 0.0), + 134: (133.91571, 0.0), + 135: (134.913112, 0.0), + 136: (135.912692, 0.0), + 137: (136.910705, 0.0), + 138: (137.910755, 0.0), + 139: (138.908938, 0.0), + 140: (139.909076, 0.0), + 141: (140.9076528, 1.0), + 142: (141.9100448, 0.0), + 143: (142.9108169, 0.0), + 144: (143.913305, 0.0), + 145: (144.914512, 0.0), + 146: (145.91764, 0.0), + 147: (146.918996, 0.0), + 148: (147.922135, 0.0), + 149: (148.92372, 0.0), + 150: (149.926673, 0.0), + 151: (150.928319, 0.0), + 152: (151.9315, 0.0), + 153: (152.93384, 0.0), + 154: (153.93752, 0.0), + 155: (154.94012, 0.0), + 156: (155.94427, 0.0), + 157: (156.94743, 0.0), + 158: (157.95198, 0.0), + 159: (158.9555, 0.0)}, + 'Pt': {0: (194.9647911, 1.0), + 166: (165.99486, 0.0), + 167: (166.99298, 0.0), + 168: (167.98815, 0.0), + 169: (168.98672, 0.0), + 170: (169.982495, 0.0), + 171: (170.98124, 0.0), + 172: (171.977347, 0.0), + 173: (172.97644, 0.0), + 174: (173.972819, 0.0), + 175: (174.972421, 0.0), + 176: (175.968945, 0.0), + 177: (176.968469, 0.0), + 178: (177.965649, 0.0), + 179: (178.965363, 0.0), + 180: 
(179.963031, 0.0), + 181: (180.963097, 0.0), + 182: (181.961171, 0.0), + 183: (182.961597, 0.0), + 184: (183.959922, 0.0), + 185: (184.96062, 0.0), + 186: (185.959351, 0.0), + 187: (186.96059, 0.0), + 188: (187.959395, 0.0), + 189: (188.960834, 0.0), + 190: (189.959932, 0.00014), + 191: (190.961677, 0.0), + 192: (191.961038, 0.00782), + 193: (192.9629874, 0.0), + 194: (193.9626803, 0.32967), + 195: (194.9647911, 0.33832), + 196: (195.9649515, 0.25242), + 197: (196.9673402, 0.0), + 198: (197.967893, 0.07163), + 199: (198.970593, 0.0), + 200: (199.971441, 0.0), + 201: (200.97451, 0.0), + 202: (201.97574, 0.0)}, + 'Pu': {0: (244, 1.0), + 228: (228.03874, 0.0), + 229: (229.04015, 0.0), + 230: (230.03965, 0.0), + 231: (231.041101, 0.0), + 232: (232.041187, 0.0), + 233: (233.043, 0.0), + 234: (234.043317, 0.0), + 235: (235.045286, 0.0), + 236: (236.046058, 0.0), + 237: (237.0484097, 0.0), + 238: (238.0495599, 0.0), + 239: (239.0521634, 0.0), + 240: (240.0538135, 0.0), + 241: (241.0568515, 0.0), + 242: (242.0587426, 0.0), + 243: (243.062003, 0.0), + 244: (244.064204, 0.0), + 245: (245.067747, 0.0), + 246: (246.070205, 0.0), + 247: (247.07407, 0.0)}, + 'Ra': {0: (226, 1.0), + 202: (202.00989, 0.0), + 203: (203.00927, 0.0), + 204: (204.0065, 0.0), + 205: (205.00627, 0.0), + 206: (206.003827, 0.0), + 207: (207.0038, 0.0), + 208: (208.00184, 0.0), + 209: (209.00199, 0.0), + 210: (210.000495, 0.0), + 211: (211.000898, 0.0), + 212: (211.999794, 0.0), + 213: (213.000384, 0.0), + 214: (214.000108, 0.0), + 215: (215.00272, 0.0), + 216: (216.003533, 0.0), + 217: (217.00632, 0.0), + 218: (218.00714, 0.0), + 219: (219.010085, 0.0), + 220: (220.011028, 0.0), + 221: (221.013917, 0.0), + 222: (222.015375, 0.0), + 223: (223.0185022, 0.0), + 224: (224.0202118, 0.0), + 225: (225.023612, 0.0), + 226: (226.0254098, 0.0), + 227: (227.0291778, 0.0), + 228: (228.0310703, 0.0), + 229: (229.034958, 0.0), + 230: (230.037056, 0.0), + 231: (231.04122, 0.0), + 232: (232.04364, 0.0), + 233: (233.04806, 0.0), + 234: (234.0507, 0.0)}, + 'Rb': {0: (84.911789738, 1.0), + 71: (70.96532, 0.0), + 72: (71.95908, 0.0), + 73: (72.95056, 0.0), + 74: (73.944265, 0.0), + 75: (74.93857, 0.0), + 76: (75.9350722, 0.0), + 77: (76.930408, 0.0), + 78: (77.928141, 0.0), + 79: (78.923989, 0.0), + 80: (79.922519, 0.0), + 81: (80.918996, 0.0), + 82: (81.9182086, 0.0), + 83: (82.91511, 0.0), + 84: (83.914385, 0.0), + 85: (84.911789738, 0.7217), + 86: (85.91116742, 0.0), + 87: (86.909180527, 0.2783), + 88: (87.91131559, 0.0), + 89: (88.912278, 0.0), + 90: (89.914802, 0.0), + 91: (90.916537, 0.0), + 92: (91.919729, 0.0), + 93: (92.922042, 0.0), + 94: (93.926405, 0.0), + 95: (94.929303, 0.0), + 96: (95.93427, 0.0), + 97: (96.93735, 0.0), + 98: (97.94179, 0.0), + 99: (98.94538, 0.0), + 100: (99.94987, 0.0), + 101: (100.9532, 0.0), + 102: (101.95887, 0.0)}, + 'Re': {0: (186.9557531, 1.0), + 160: (159.98212, 0.0), + 161: (160.97759, 0.0), + 162: (161.976, 0.0), + 163: (162.972081, 0.0), + 164: (163.97032, 0.0), + 165: (164.967089, 0.0), + 166: (165.96581, 0.0), + 167: (166.9626, 0.0), + 168: (167.96157, 0.0), + 169: (168.95879, 0.0), + 170: (169.95822, 0.0), + 171: (170.95572, 0.0), + 172: (171.95542, 0.0), + 173: (172.95324, 0.0), + 174: (173.95312, 0.0), + 175: (174.95138, 0.0), + 176: (175.95162, 0.0), + 177: (176.95033, 0.0), + 178: (177.95099, 0.0), + 179: (178.949988, 0.0), + 180: (179.950789, 0.0), + 181: (180.950068, 0.0), + 182: (181.95121, 0.0), + 183: (182.95082, 0.0), + 184: (183.952521, 0.0), + 185: (184.952955, 0.374), + 186: (185.9549861, 
0.0), + 187: (186.9557531, 0.626), + 188: (187.9581144, 0.0), + 189: (188.959229, 0.0), + 190: (189.96182, 0.0), + 191: (190.963125, 0.0), + 192: (191.96596, 0.0), + 193: (192.96747, 0.0), + 194: (193.97042, 0.0)}, + 'Rf': {0: (265, 1.0), + 253: (253.10069, 0.0), + 254: (254.10018, 0.0), + 255: (255.10134, 0.0), + 256: (256.101166, 0.0), + 257: (257.10299, 0.0), + 258: (258.10349, 0.0), + 259: (259.10564, 0.0), + 260: (260.10644, 0.0), + 261: (261.10877, 0.0), + 262: (262.10993, 0.0), + 263: (263.11255, 0.0), + 264: (264.11399, 0.0), + 265: (265.1167, 0.0), + 266: (266.11796, 0.0), + 267: (267.12153, 0.0), + 268: (268.12364, 0.0)}, + 'Rg': {0: (280, 1.0), + 272: (272.15362, 0.0), + 273: (273.15368, 0.0), + 274: (274.15571, 0.0), + 275: (275.15614, 0.0), + 276: (276.15849, 0.0), + 277: (277.15952, 0.0), + 278: (278.1616, 0.0), + 279: (279.16247, 0.0), + 280: (280.16447, 0.0), + 281: (281.16537, 0.0), + 282: (282.16749, 0.0), + 283: (283.16842, 0.0)}, + 'Rh': {0: (102.905504, 1.0), + 89: (88.94884, 0.0), + 90: (89.94287, 0.0), + 91: (90.93655, 0.0), + 92: (91.93198, 0.0), + 93: (92.92574, 0.0), + 94: (93.9217, 0.0), + 95: (94.9159, 0.0), + 96: (95.914461, 0.0), + 97: (96.91134, 0.0), + 98: (97.910708, 0.0), + 99: (98.908132, 0.0), + 100: (99.908122, 0.0), + 101: (100.906164, 0.0), + 102: (101.906843, 0.0), + 103: (102.905504, 1.0), + 104: (103.906656, 0.0), + 105: (104.905694, 0.0), + 106: (105.907287, 0.0), + 107: (106.906748, 0.0), + 108: (107.90873, 0.0), + 109: (108.908737, 0.0), + 110: (109.91114, 0.0), + 111: (110.91159, 0.0), + 112: (111.91439, 0.0), + 113: (112.91553, 0.0), + 114: (113.91881, 0.0), + 115: (114.92033, 0.0), + 116: (115.92406, 0.0), + 117: (116.92598, 0.0), + 118: (117.93007, 0.0), + 119: (118.93211, 0.0), + 120: (119.93641, 0.0), + 121: (120.93872, 0.0), + 122: (121.94321, 0.0)}, + 'Rn': {0: (222, 1.0), + 195: (195.00544, 0.0), + 196: (196.002115, 0.0), + 197: (197.00158, 0.0), + 198: (197.998679, 0.0), + 199: (198.99837, 0.0), + 200: (199.995699, 0.0), + 201: (200.99563, 0.0), + 202: (201.993263, 0.0), + 203: (202.993387, 0.0), + 204: (203.991429, 0.0), + 205: (204.99172, 0.0), + 206: (205.990214, 0.0), + 207: (206.990734, 0.0), + 208: (207.989642, 0.0), + 209: (208.990415, 0.0), + 210: (209.989696, 0.0), + 211: (210.990601, 0.0), + 212: (211.990704, 0.0), + 213: (212.993883, 0.0), + 214: (213.995363, 0.0), + 215: (214.998745, 0.0), + 216: (216.000274, 0.0), + 217: (217.003928, 0.0), + 218: (218.0056013, 0.0), + 219: (219.0094802, 0.0), + 220: (220.011394, 0.0), + 221: (221.015537, 0.0), + 222: (222.0175777, 0.0), + 223: (223.02179, 0.0), + 224: (224.02409, 0.0), + 225: (225.02844, 0.0), + 226: (226.03089, 0.0), + 227: (227.03541, 0.0), + 228: (228.03799, 0.0)}, + 'Ru': {0: (101.9043493, 1.0), + 87: (86.94918, 0.0), + 88: (87.94026, 0.0), + 89: (88.93611, 0.0), + 90: (89.92989, 0.0), + 91: (90.92629, 0.0), + 92: (91.92012, 0.0), + 93: (92.91705, 0.0), + 94: (93.91136, 0.0), + 95: (94.910413, 0.0), + 96: (95.907598, 0.0554), + 97: (96.907555, 0.0), + 98: (97.905287, 0.0187), + 99: (98.9059393, 0.1276), + 100: (99.9042195, 0.126), + 101: (100.9055821, 0.1706), + 102: (101.9043493, 0.3155), + 103: (102.9063238, 0.0), + 104: (103.905433, 0.1862), + 105: (104.907753, 0.0), + 106: (105.907329, 0.0), + 107: (106.90991, 0.0), + 108: (107.91017, 0.0), + 109: (108.9132, 0.0), + 110: (109.91414, 0.0), + 111: (110.9177, 0.0), + 112: (111.91897, 0.0), + 113: (112.92249, 0.0), + 114: (113.92428, 0.0), + 115: (114.92869, 0.0), + 116: (115.93081, 0.0), + 117: (116.93558, 0.0), + 
118: (117.93782, 0.0), + 119: (118.94284, 0.0), + 120: (119.94531, 0.0)}, + 'S': {0: (31.972071, 1.0), + 26: (26.02788, 0.0), + 27: (27.01883, 0.0), + 28: (28.00437, 0.0), + 29: (28.99661, 0.0), + 30: (29.984903, 0.0), + 31: (30.9795547, 0.0), + 32: (31.972071, 0.9499), + 33: (32.97145876, 0.0075), + 34: (33.9678669, 0.0425), + 35: (34.96903216, 0.0), + 36: (35.96708076, 0.0001), + 37: (36.97112557, 0.0), + 38: (37.971163, 0.0), + 39: (38.97513, 0.0), + 40: (39.97545, 0.0), + 41: (40.97958, 0.0), + 42: (41.98102, 0.0), + 43: (42.98715, 0.0), + 44: (43.99021, 0.0), + 45: (44.99651, 0.0), + 46: (46.00075, 0.0), + 47: (47.00859, 0.0), + 48: (48.01417, 0.0), + 49: (49.02362, 0.0)}, + 'Sb': {0: (120.9038157, 1.0), + 103: (102.93969, 0.0), + 104: (103.93647, 0.0), + 105: (104.93149, 0.0), + 106: (105.92879, 0.0), + 107: (106.92415, 0.0), + 108: (107.92216, 0.0), + 109: (108.918132, 0.0), + 110: (109.91675, 0.0), + 111: (110.91316, 0.0), + 112: (111.912398, 0.0), + 113: (112.909372, 0.0), + 114: (113.90927, 0.0), + 115: (114.906598, 0.0), + 116: (115.906794, 0.0), + 117: (116.904836, 0.0), + 118: (117.905529, 0.0), + 119: (118.903942, 0.0), + 120: (119.905072, 0.0), + 121: (120.9038157, 0.5721), + 122: (121.9051737, 0.0), + 123: (122.904214, 0.4279), + 124: (123.9059357, 0.0), + 125: (124.9052538, 0.0), + 126: (125.90725, 0.0), + 127: (126.906924, 0.0), + 128: (127.909169, 0.0), + 129: (128.909148, 0.0), + 130: (129.911656, 0.0), + 131: (130.911982, 0.0), + 132: (131.914467, 0.0), + 133: (132.915252, 0.0), + 134: (133.92038, 0.0), + 135: (134.92517, 0.0), + 136: (135.93035, 0.0), + 137: (136.93531, 0.0), + 138: (137.94079, 0.0), + 139: (138.94598, 0.0)}, + 'Sc': {0: (44.9559119, 1.0), + 36: (36.01492, 0.0), + 37: (37.00305, 0.0), + 38: (37.9947, 0.0), + 39: (38.98479, 0.0), + 40: (39.977967, 0.0), + 41: (40.96925113, 0.0), + 42: (41.96551643, 0.0), + 43: (42.9611507, 0.0), + 44: (43.9594028, 0.0), + 45: (44.9559119, 1.0), + 46: (45.9551719, 0.0), + 47: (46.9524075, 0.0), + 48: (47.952231, 0.0), + 49: (48.950024, 0.0), + 50: (49.952188, 0.0), + 51: (50.953603, 0.0), + 52: (51.95668, 0.0), + 53: (52.95961, 0.0), + 54: (53.96326, 0.0), + 55: (54.96824, 0.0), + 56: (55.97287, 0.0), + 57: (56.97779, 0.0), + 58: (57.98371, 0.0), + 59: (58.98922, 0.0), + 60: (59.99571, 0.0)}, + 'Se': {0: (79.9165213, 1.0), + 65: (64.96466, 0.0), + 66: (65.95521, 0.0), + 67: (66.95009, 0.0), + 68: (67.9418, 0.0), + 69: (68.93956, 0.0), + 70: (69.93339, 0.0), + 71: (70.93224, 0.0), + 72: (71.927112, 0.0), + 73: (72.926765, 0.0), + 74: (73.9224764, 0.0089), + 75: (74.9225234, 0.0), + 76: (75.9192136, 0.0937), + 77: (76.919914, 0.0763), + 78: (77.9173091, 0.2377), + 79: (78.9184991, 0.0), + 80: (79.9165213, 0.4961), + 81: (80.9179925, 0.0), + 82: (81.9166994, 0.0873), + 83: (82.919118, 0.0), + 84: (83.918462, 0.0), + 85: (84.92225, 0.0), + 86: (85.924272, 0.0), + 87: (86.92852, 0.0), + 88: (87.93142, 0.0), + 89: (88.93645, 0.0), + 90: (89.93996, 0.0), + 91: (90.94596, 0.0), + 92: (91.94992, 0.0), + 93: (92.95629, 0.0), + 94: (93.96049, 0.0)}, + 'Sg': {0: (271, 1.0), + 258: (258.11317, 0.0), + 259: (259.1145, 0.0), + 260: (260.11442, 0.0), + 261: (261.11612, 0.0), + 262: (262.1164, 0.0), + 263: (263.11832, 0.0), + 264: (264.11893, 0.0), + 265: (265.12111, 0.0), + 266: (266.12207, 0.0), + 267: (267.12443, 0.0), + 268: (268.12561, 0.0), + 269: (269.12876, 0.0), + 270: (270.13033, 0.0), + 271: (271.13347, 0.0), + 272: (272.13516, 0.0), + 273: (273.13822, 0.0)}, + 'Si': {0: (27.9769265325, 1.0), + 22: (22.03453, 0.0), + 23: 
(23.02552, 0.0), + 24: (24.011546, 0.0), + 25: (25.004106, 0.0), + 26: (25.99233, 0.0), + 27: (26.98670491, 0.0), + 28: (27.9769265325, 0.92223), + 29: (28.9764947, 0.04685), + 30: (29.97377017, 0.03092), + 31: (30.97536323, 0.0), + 32: (31.97414808, 0.0), + 33: (32.978, 0.0), + 34: (33.978576, 0.0), + 35: (34.98458, 0.0), + 36: (35.9866, 0.0), + 37: (36.99294, 0.0), + 38: (37.99563, 0.0), + 39: (39.00207, 0.0), + 40: (40.00587, 0.0), + 41: (41.01456, 0.0), + 42: (42.01979, 0.0), + 43: (43.02866, 0.0), + 44: (44.03526, 0.0)}, + 'Sm': {0: (151.9197324, 1.0), + 128: (127.95808, 0.0), + 129: (128.95464, 0.0), + 130: (129.94892, 0.0), + 131: (130.94611, 0.0), + 132: (131.94069, 0.0), + 133: (132.93867, 0.0), + 134: (133.93397, 0.0), + 135: (134.93252, 0.0), + 136: (135.928276, 0.0), + 137: (136.92697, 0.0), + 138: (137.923244, 0.0), + 139: (138.922297, 0.0), + 140: (139.918995, 0.0), + 141: (140.918476, 0.0), + 142: (141.915198, 0.0), + 143: (142.914628, 0.0), + 144: (143.911999, 0.0307), + 145: (144.91341, 0.0), + 146: (145.913041, 0.0), + 147: (146.9148979, 0.1499), + 148: (147.9148227, 0.1124), + 149: (148.9171847, 0.1382), + 150: (149.9172755, 0.0738), + 151: (150.9199324, 0.0), + 152: (151.9197324, 0.2675), + 153: (152.9220974, 0.0), + 154: (153.9222093, 0.2275), + 155: (154.9246402, 0.0), + 156: (155.925528, 0.0), + 157: (156.92836, 0.0), + 158: (157.92999, 0.0), + 159: (158.93321, 0.0), + 160: (159.93514, 0.0), + 161: (160.93883, 0.0), + 162: (161.94122, 0.0), + 163: (162.94536, 0.0), + 164: (163.94828, 0.0), + 165: (164.95298, 0.0)}, + 'Sn': {0: (119.9021947, 1.0), + 99: (98.94933, 0.0), + 100: (99.93904, 0.0), + 101: (100.93606, 0.0), + 102: (101.9303, 0.0), + 103: (102.9281, 0.0), + 104: (103.92314, 0.0), + 105: (104.92135, 0.0), + 106: (105.91688, 0.0), + 107: (106.91564, 0.0), + 108: (107.911925, 0.0), + 109: (108.911283, 0.0), + 110: (109.907843, 0.0), + 111: (110.907734, 0.0), + 112: (111.904818, 0.0097), + 113: (112.905171, 0.0), + 114: (113.902779, 0.0066), + 115: (114.903342, 0.0034), + 116: (115.901741, 0.1454), + 117: (116.902952, 0.0768), + 118: (117.901603, 0.2422), + 119: (118.903308, 0.0859), + 120: (119.9021947, 0.3258), + 121: (120.9042355, 0.0), + 122: (121.903439, 0.0463), + 123: (122.9057208, 0.0), + 124: (123.9052739, 0.0579), + 125: (124.9077841, 0.0), + 126: (125.907653, 0.0), + 127: (126.91036, 0.0), + 128: (127.910537, 0.0), + 129: (128.91348, 0.0), + 130: (129.913967, 0.0), + 131: (130.917, 0.0), + 132: (131.917816, 0.0), + 133: (132.92383, 0.0), + 134: (133.92829, 0.0), + 135: (134.93473, 0.0), + 136: (135.93934, 0.0), + 137: (136.94599, 0.0)}, + 'Sr': {0: (87.9056121, 1.0), + 73: (72.96597, 0.0), + 74: (73.95631, 0.0), + 75: (74.94995, 0.0), + 76: (75.94177, 0.0), + 77: (76.937945, 0.0), + 78: (77.93218, 0.0), + 79: (78.929708, 0.0), + 80: (79.924521, 0.0), + 81: (80.923212, 0.0), + 82: (81.918402, 0.0), + 83: (82.917557, 0.0), + 84: (83.913425, 0.0056), + 85: (84.912933, 0.0), + 86: (85.9092602, 0.0986), + 87: (86.9088771, 0.07), + 88: (87.9056121, 0.8258), + 89: (88.9074507, 0.0), + 90: (89.907738, 0.0), + 91: (90.910203, 0.0), + 92: (91.911038, 0.0), + 93: (92.914026, 0.0), + 94: (93.915361, 0.0), + 95: (94.919359, 0.0), + 96: (95.921697, 0.0), + 97: (96.926153, 0.0), + 98: (97.928453, 0.0), + 99: (98.93324, 0.0), + 100: (99.93535, 0.0), + 101: (100.94052, 0.0), + 102: (101.94302, 0.0), + 103: (102.94895, 0.0), + 104: (103.95233, 0.0), + 105: (104.95858, 0.0)}, + 'Ta': {0: (180.9479958, 1.0), + 155: (154.97459, 0.0), + 156: (155.9723, 0.0), + 157: 
(156.96819, 0.0), + 158: (157.9667, 0.0), + 159: (158.963018, 0.0), + 160: (159.96149, 0.0), + 161: (160.95842, 0.0), + 162: (161.95729, 0.0), + 163: (162.95433, 0.0), + 164: (163.95353, 0.0), + 165: (164.950773, 0.0), + 166: (165.95051, 0.0), + 167: (166.94809, 0.0), + 168: (167.94805, 0.0), + 169: (168.94601, 0.0), + 170: (169.94618, 0.0), + 171: (170.94448, 0.0), + 172: (171.9449, 0.0), + 173: (172.94375, 0.0), + 174: (173.94445, 0.0), + 175: (174.94374, 0.0), + 176: (175.94486, 0.0), + 177: (176.944472, 0.0), + 178: (177.945778, 0.0), + 179: (178.9459295, 0.0), + 180: (179.9474648, 0.00012), + 181: (180.9479958, 0.99988), + 182: (181.9501518, 0.0), + 183: (182.9513726, 0.0), + 184: (183.954008, 0.0), + 185: (184.955559, 0.0), + 186: (185.95855, 0.0), + 187: (186.96053, 0.0), + 188: (187.9637, 0.0), + 189: (188.96583, 0.0), + 190: (189.96923, 0.0)}, + 'Tb': {0: (158.9253468, 1.0), + 136: (135.96138, 0.0), + 137: (136.95598, 0.0), + 138: (137.95316, 0.0), + 139: (138.94829, 0.0), + 140: (139.94581, 0.0), + 141: (140.94145, 0.0), + 142: (141.93874, 0.0), + 143: (142.93512, 0.0), + 144: (143.93305, 0.0), + 145: (144.92927, 0.0), + 146: (145.92725, 0.0), + 147: (146.924045, 0.0), + 148: (147.924272, 0.0), + 149: (148.923246, 0.0), + 150: (149.92366, 0.0), + 151: (150.923103, 0.0), + 152: (151.92407, 0.0), + 153: (152.923435, 0.0), + 154: (153.92468, 0.0), + 155: (154.923505, 0.0), + 156: (155.924747, 0.0), + 157: (156.9240246, 0.0), + 158: (157.9254131, 0.0), + 159: (158.9253468, 1.0), + 160: (159.9271676, 0.0), + 161: (160.9275699, 0.0), + 162: (161.92949, 0.0), + 163: (162.930648, 0.0), + 164: (163.93335, 0.0), + 165: (164.93488, 0.0), + 166: (165.93799, 0.0), + 167: (166.94005, 0.0), + 168: (167.94364, 0.0), + 169: (168.94622, 0.0), + 170: (169.95025, 0.0), + 171: (170.9533, 0.0)}, + 'Tc': {0: (98, 1.0), + 85: (84.94883, 0.0), + 86: (85.94288, 0.0), + 87: (86.93653, 0.0), + 88: (87.93268, 0.0), + 89: (88.92717, 0.0), + 90: (89.92356, 0.0), + 91: (90.91843, 0.0), + 92: (91.91526, 0.0), + 93: (92.910249, 0.0), + 94: (93.909657, 0.0), + 95: (94.907657, 0.0), + 96: (95.907871, 0.0), + 97: (96.906365, 0.0), + 98: (97.907216, 0.0), + 99: (98.9062547, 0.0), + 100: (99.9076578, 0.0), + 101: (100.907315, 0.0), + 102: (101.909215, 0.0), + 103: (102.909181, 0.0), + 104: (103.91145, 0.0), + 105: (104.91166, 0.0), + 106: (105.914358, 0.0), + 107: (106.91508, 0.0), + 108: (107.91846, 0.0), + 109: (108.91998, 0.0), + 110: (109.92382, 0.0), + 111: (110.92569, 0.0), + 112: (111.92915, 0.0), + 113: (112.93159, 0.0), + 114: (113.93588, 0.0), + 115: (114.93869, 0.0), + 116: (115.94337, 0.0), + 117: (116.94648, 0.0), + 118: (117.95148, 0.0)}, + 'Te': {0: (129.9062244, 1.0), + 105: (104.94364, 0.0), + 106: (105.9375, 0.0), + 107: (106.93501, 0.0), + 108: (107.92944, 0.0), + 109: (108.92742, 0.0), + 110: (109.92241, 0.0), + 111: (110.92111, 0.0), + 112: (111.91701, 0.0), + 113: (112.91589, 0.0), + 114: (113.91209, 0.0), + 115: (114.9119, 0.0), + 116: (115.90846, 0.0), + 117: (116.908645, 0.0), + 118: (117.905828, 0.0), + 119: (118.906404, 0.0), + 120: (119.90402, 0.0009), + 121: (120.904936, 0.0), + 122: (121.9030439, 0.0255), + 123: (122.90427, 0.0089), + 124: (123.9028179, 0.0474), + 125: (124.9044307, 0.0707), + 126: (125.9033117, 0.1884), + 127: (126.9052263, 0.0), + 128: (127.9044631, 0.3174), + 129: (128.9065982, 0.0), + 130: (129.9062244, 0.3408), + 131: (130.9085239, 0.0), + 132: (131.908553, 0.0), + 133: (132.910955, 0.0), + 134: (133.911369, 0.0), + 135: (134.91645, 0.0), + 136: (135.9201, 0.0), + 
137: (136.92532, 0.0), + 138: (137.92922, 0.0), + 139: (138.93473, 0.0), + 140: (139.93885, 0.0), + 141: (140.94465, 0.0), + 142: (141.94908, 0.0)}, + 'Th': {0: (232.0380553, 1.0), + 209: (209.01772, 0.0), + 210: (210.015075, 0.0), + 211: (211.01493, 0.0), + 212: (212.01298, 0.0), + 213: (213.01301, 0.0), + 214: (214.0115, 0.0), + 215: (215.01173, 0.0), + 216: (216.011062, 0.0), + 217: (217.013114, 0.0), + 218: (218.013284, 0.0), + 219: (219.01554, 0.0), + 220: (220.015748, 0.0), + 221: (221.018184, 0.0), + 222: (222.018468, 0.0), + 223: (223.020811, 0.0), + 224: (224.021467, 0.0), + 225: (225.023951, 0.0), + 226: (226.024903, 0.0), + 227: (227.0277041, 0.0), + 228: (228.0287411, 0.0), + 229: (229.031762, 0.0), + 230: (230.0331338, 0.0), + 231: (231.0363043, 0.0), + 232: (232.0380553, 1.0), + 233: (233.0415818, 0.0), + 234: (234.043601, 0.0), + 235: (235.04751, 0.0), + 236: (236.04987, 0.0), + 237: (237.05389, 0.0), + 238: (238.0565, 0.0)}, + 'Ti': {0: (47.9479463, 1.0), + 38: (38.00977, 0.0), + 39: (39.00161, 0.0), + 40: (39.9905, 0.0), + 41: (40.98315, 0.0), + 42: (41.973031, 0.0), + 43: (42.968522, 0.0), + 44: (43.9596901, 0.0), + 45: (44.9581256, 0.0), + 46: (45.9526316, 0.0825), + 47: (46.9517631, 0.0744), + 48: (47.9479463, 0.7372), + 49: (48.94787, 0.0541), + 50: (49.9447912, 0.0518), + 51: (50.946615, 0.0), + 52: (51.946897, 0.0), + 53: (52.94973, 0.0), + 54: (53.95105, 0.0), + 55: (54.95527, 0.0), + 56: (55.9582, 0.0), + 57: (56.96399, 0.0), + 58: (57.96697, 0.0), + 59: (58.97293, 0.0), + 60: (59.97676, 0.0), + 61: (60.9832, 0.0), + 62: (61.98749, 0.0), + 63: (62.99442, 0.0)}, + 'Tl': {0: (204.9744275, 1.0), + 176: (176.00059, 0.0), + 177: (176.996427, 0.0), + 178: (177.9949, 0.0), + 179: (178.99109, 0.0), + 180: (179.98991, 0.0), + 181: (180.986257, 0.0), + 182: (181.98567, 0.0), + 183: (182.982193, 0.0), + 184: (183.98187, 0.0), + 185: (184.97879, 0.0), + 186: (185.97833, 0.0), + 187: (186.975906, 0.0), + 188: (187.97601, 0.0), + 189: (188.973588, 0.0), + 190: (189.97388, 0.0), + 191: (190.971786, 0.0), + 192: (191.97223, 0.0), + 193: (192.97067, 0.0), + 194: (193.9712, 0.0), + 195: (194.969774, 0.0), + 196: (195.970481, 0.0), + 197: (196.969575, 0.0), + 198: (197.97048, 0.0), + 199: (198.96988, 0.0), + 200: (199.970963, 0.0), + 201: (200.970819, 0.0), + 202: (201.972106, 0.0), + 203: (202.9723442, 0.2952), + 204: (203.9738635, 0.0), + 205: (204.9744275, 0.7048), + 206: (205.9761103, 0.0), + 207: (206.977419, 0.0), + 208: (207.9820187, 0.0), + 209: (208.985359, 0.0), + 210: (209.990074, 0.0), + 211: (210.99348, 0.0), + 212: (211.99823, 0.0)}, + 'Tm': {0: (168.9342133, 1.0), + 145: (144.97007, 0.0), + 146: (145.96643, 0.0), + 147: (146.96096, 0.0), + 148: (147.95784, 0.0), + 149: (148.95272, 0.0), + 150: (149.94996, 0.0), + 151: (150.945483, 0.0), + 152: (151.94442, 0.0), + 153: (152.942012, 0.0), + 154: (153.941568, 0.0), + 155: (154.939199, 0.0), + 156: (155.93898, 0.0), + 157: (156.93697, 0.0), + 158: (157.93698, 0.0), + 159: (158.93498, 0.0), + 160: (159.93526, 0.0), + 161: (160.93355, 0.0), + 162: (161.933995, 0.0), + 163: (162.932651, 0.0), + 164: (163.93356, 0.0), + 165: (164.932435, 0.0), + 166: (165.933554, 0.0), + 167: (166.9328516, 0.0), + 168: (167.934173, 0.0), + 169: (168.9342133, 1.0), + 170: (169.9358014, 0.0), + 171: (170.9364294, 0.0), + 172: (171.9384, 0.0), + 173: (172.939604, 0.0), + 174: (173.94217, 0.0), + 175: (174.94384, 0.0), + 176: (175.94699, 0.0), + 177: (176.94904, 0.0), + 178: (177.95264, 0.0), + 179: (178.95534, 0.0)}, + 'U': {0: (238.0507882, 
1.0), + 217: (217.02437, 0.0), + 218: (218.02354, 0.0), + 219: (219.02492, 0.0), + 220: (220.02472, 0.0), + 221: (221.0264, 0.0), + 222: (222.02609, 0.0), + 223: (223.02774, 0.0), + 224: (224.027605, 0.0), + 225: (225.029391, 0.0), + 226: (226.029339, 0.0), + 227: (227.031156, 0.0), + 228: (228.031374, 0.0), + 229: (229.033506, 0.0), + 230: (230.03394, 0.0), + 231: (231.036294, 0.0), + 232: (232.0371562, 0.0), + 233: (233.0396352, 0.0), + 234: (234.0409521, 5.4e-05), + 235: (235.0439299, 0.007204), + 236: (236.045568, 0.0), + 237: (237.0487302, 0.0), + 238: (238.0507882, 0.992742), + 239: (239.0542933, 0.0), + 240: (240.056592, 0.0), + 241: (241.06033, 0.0), + 242: (242.06293, 0.0)}, + 'Uuh': {0: (293, 1.0), + 289: (289.19886, 0.0), + 290: (290.19859, 0.0), + 291: (291.20001, 0.0), + 292: (292.19979, 0.0)}, + 'Uuo': {0: (294, 1.0), 293: (293.21467, 0.0)}, + 'Uup': {0: (288, 1.0), + 287: (287.19119, 0.0), + 288: (288.19249, 0.0), + 289: (289.19272, 0.0), + 290: (290.19414, 0.0), + 291: (291.19438, 0.0)}, + 'Uuq': {0: (289, 1.0), + 285: (285.1837, 0.0), + 286: (286.18386, 0.0), + 287: (287.1856, 0.0), + 288: (288.18569, 0.0), + 289: (289.18728, 0.0)}, + 'Uus': {0: (292, 1.0), 291: (291.20656, 0.0), 292: (292.20755, 0.0)}, + 'Uut': {0: (284, 1.0), + 283: (283.17645, 0.0), + 284: (284.17808, 0.0), + 285: (285.17873, 0.0), + 286: (286.18048, 0.0), + 287: (287.18105, 0.0)}, + 'V': {0: (50.9439595, 1.0), + 40: (40.01109, 0.0), + 41: (40.99978, 0.0), + 42: (41.99123, 0.0), + 43: (42.98065, 0.0), + 44: (43.97411, 0.0), + 45: (44.965776, 0.0), + 46: (45.9602005, 0.0), + 47: (46.9549089, 0.0), + 48: (47.9522537, 0.0), + 49: (48.9485161, 0.0), + 50: (49.9471585, 0.0025), + 51: (50.9439595, 0.9975), + 52: (51.9447755, 0.0), + 53: (52.944338, 0.0), + 54: (53.94644, 0.0), + 55: (54.94723, 0.0), + 56: (55.95053, 0.0), + 57: (56.95256, 0.0), + 58: (57.95683, 0.0), + 59: (58.96021, 0.0), + 60: (59.96503, 0.0), + 61: (60.96848, 0.0), + 62: (61.97378, 0.0), + 63: (62.97755, 0.0), + 64: (63.98347, 0.0), + 65: (64.98792, 0.0)}, + 'W': {0: (183.9509312, 1.0), + 158: (157.97456, 0.0), + 159: (158.97292, 0.0), + 160: (159.96848, 0.0), + 161: (160.96736, 0.0), + 162: (161.963497, 0.0), + 163: (162.96252, 0.0), + 164: (163.958954, 0.0), + 165: (164.95828, 0.0), + 166: (165.955027, 0.0), + 167: (166.954816, 0.0), + 168: (167.951808, 0.0), + 169: (168.951779, 0.0), + 170: (169.949228, 0.0), + 171: (170.94945, 0.0), + 172: (171.94729, 0.0), + 173: (172.94769, 0.0), + 174: (173.94608, 0.0), + 175: (174.94672, 0.0), + 176: (175.94563, 0.0), + 177: (176.94664, 0.0), + 178: (177.945876, 0.0), + 179: (178.94707, 0.0), + 180: (179.946704, 0.0012), + 181: (180.948197, 0.0), + 182: (181.9482042, 0.265), + 183: (182.950223, 0.1431), + 184: (183.9509312, 0.3064), + 185: (184.9534193, 0.0), + 186: (185.9543641, 0.2843), + 187: (186.9571605, 0.0), + 188: (187.958489, 0.0), + 189: (188.96191, 0.0), + 190: (189.96318, 0.0), + 191: (190.9666, 0.0), + 192: (191.96817, 0.0)}, + 'Xe': {0: (131.9041535, 1.0), + 110: (109.94428, 0.0), + 111: (110.9416, 0.0), + 112: (111.93562, 0.0), + 113: (112.93334, 0.0), + 114: (113.92798, 0.0), + 115: (114.926294, 0.0), + 116: (115.921581, 0.0), + 117: (116.920359, 0.0), + 118: (117.916179, 0.0), + 119: (118.915411, 0.0), + 120: (119.911784, 0.0), + 121: (120.911462, 0.0), + 122: (121.908368, 0.0), + 123: (122.908482, 0.0), + 124: (123.905893, 0.000952), + 125: (124.9063955, 0.0), + 126: (125.904274, 0.00089), + 127: (126.905184, 0.0), + 128: (127.9035313, 0.019102), + 129: (128.9047794, 0.264006), + 
130: (129.903508, 0.04071), + 131: (130.9050824, 0.212324), + 132: (131.9041535, 0.269086), + 133: (132.9059107, 0.0), + 134: (133.9053945, 0.104357), + 135: (134.907227, 0.0), + 136: (135.907219, 0.088573), + 137: (136.911562, 0.0), + 138: (137.91395, 0.0), + 139: (138.918793, 0.0), + 140: (139.92164, 0.0), + 141: (140.92665, 0.0), + 142: (141.92971, 0.0), + 143: (142.93511, 0.0), + 144: (143.93851, 0.0), + 145: (144.94407, 0.0), + 146: (145.94775, 0.0), + 147: (146.95356, 0.0)}, + 'Y': {0: (88.9058483, 1.0), + 76: (75.95845, 0.0), + 77: (76.94965, 0.0), + 78: (77.94361, 0.0), + 79: (78.93735, 0.0), + 80: (79.93428, 0.0), + 81: (80.92913, 0.0), + 82: (81.92679, 0.0), + 83: (82.92235, 0.0), + 84: (83.92039, 0.0), + 85: (84.916433, 0.0), + 86: (85.914886, 0.0), + 87: (86.9108757, 0.0), + 88: (87.9095011, 0.0), + 89: (88.9058483, 1.0), + 90: (89.9071519, 0.0), + 91: (90.907305, 0.0), + 92: (91.908949, 0.0), + 93: (92.909583, 0.0), + 94: (93.911595, 0.0), + 95: (94.912821, 0.0), + 96: (95.915891, 0.0), + 97: (96.918134, 0.0), + 98: (97.922203, 0.0), + 99: (98.924636, 0.0), + 100: (99.92776, 0.0), + 101: (100.93031, 0.0), + 102: (101.93356, 0.0), + 103: (102.93673, 0.0), + 104: (103.94105, 0.0), + 105: (104.94487, 0.0), + 106: (105.94979, 0.0), + 107: (106.95414, 0.0), + 108: (107.95948, 0.0)}, + 'Yb': {0: (173.9388621, 1.0), + 148: (147.96742, 0.0), + 149: (148.96404, 0.0), + 150: (149.95842, 0.0), + 151: (150.9554, 0.0), + 152: (151.95029, 0.0), + 153: (152.94948, 0.0), + 154: (153.946394, 0.0), + 155: (154.945782, 0.0), + 156: (155.942818, 0.0), + 157: (156.942628, 0.0), + 158: (157.939866, 0.0), + 159: (158.94005, 0.0), + 160: (159.937552, 0.0), + 161: (160.937902, 0.0), + 162: (161.935768, 0.0), + 163: (162.936334, 0.0), + 164: (163.934489, 0.0), + 165: (164.93528, 0.0), + 166: (165.933882, 0.0), + 167: (166.93495, 0.0), + 168: (167.933897, 0.0013), + 169: (168.93519, 0.0), + 170: (169.9347618, 0.0304), + 171: (170.9363258, 0.1428), + 172: (171.9363815, 0.2183), + 173: (172.9382108, 0.1613), + 174: (173.9388621, 0.3183), + 175: (174.9412765, 0.0), + 176: (175.9425717, 0.1276), + 177: (176.9452608, 0.0), + 178: (177.946647, 0.0), + 179: (178.95017, 0.0), + 180: (179.95233, 0.0), + 181: (180.95615, 0.0)}, + 'Zn': {0: (63.9291422, 1.0), + 54: (53.99295, 0.0), + 55: (54.98398, 0.0), + 56: (55.97238, 0.0), + 57: (56.96479, 0.0), + 58: (57.95459, 0.0), + 59: (58.94926, 0.0), + 60: (59.941827, 0.0), + 61: (60.939511, 0.0), + 62: (61.93433, 0.0), + 63: (62.9332116, 0.0), + 64: (63.9291422, 0.48268), + 65: (64.929241, 0.0), + 66: (65.9260334, 0.27975), + 67: (66.9271273, 0.04102), + 68: (67.9248442, 0.19024), + 69: (68.9265503, 0.0), + 70: (69.9253193, 0.00631), + 71: (70.927722, 0.0), + 72: (71.926858, 0.0), + 73: (72.92978, 0.0), + 74: (73.92946, 0.0), + 75: (74.93294, 0.0), + 76: (75.93329, 0.0), + 77: (76.93696, 0.0), + 78: (77.93844, 0.0), + 79: (78.94265, 0.0), + 80: (79.94434, 0.0), + 81: (80.95048, 0.0), + 82: (81.95442, 0.0), + 83: (82.96103, 0.0)}, + 'Zr': {0: (89.9047044, 1.0), + 78: (77.95523, 0.0), + 79: (78.94916, 0.0), + 80: (79.9404, 0.0), + 81: (80.93721, 0.0), + 82: (81.93109, 0.0), + 83: (82.92865, 0.0), + 84: (83.92325, 0.0), + 85: (84.92147, 0.0), + 86: (85.91647, 0.0), + 87: (86.914816, 0.0), + 88: (87.910227, 0.0), + 89: (88.90889, 0.0), + 90: (89.9047044, 0.5145), + 91: (90.9056458, 0.1122), + 92: (91.9050408, 0.1715), + 93: (92.906476, 0.0), + 94: (93.9063152, 0.1738), + 95: (94.9080426, 0.0), + 96: (95.9082734, 0.028), + 97: (96.9109531, 0.0), + 98: (97.912735, 0.0), + 
99: (98.916512, 0.0), + 100: (99.91776, 0.0), + 101: (100.92114, 0.0), + 102: (101.92298, 0.0), + 103: (102.9266, 0.0), + 104: (103.92878, 0.0), + 105: (104.93305, 0.0), + 106: (105.93591, 0.0), + 107: (106.94075, 0.0), + 108: (107.94396, 0.0), + 109: (108.94924, 0.0), + 110: (109.95287, 0.0)}, + 'e*': {0: (0.00054857990943, 1.0)}} diff --git a/pyteomics/auxiliary/file_helpers.py b/pyteomics/auxiliary/file_helpers.py new file mode 100644 index 0000000..d29e106 --- /dev/null +++ b/pyteomics/auxiliary/file_helpers.py @@ -0,0 +1,1250 @@ +import sys +import codecs +import re +from functools import wraps +from contextlib import contextmanager +from collections import OrderedDict, defaultdict +import json +import multiprocessing as mp +import threading +import warnings +import os +from abc import ABCMeta + +try: + basestring +except NameError: + basestring = (str, bytes) + +try: + import pandas as pd +except ImportError: + pd = None + +try: + import numpy as np +except ImportError: + np = None + +try: + import dill +except ImportError: + dill = None + try: + import cPickle as pickle + except ImportError: + import pickle + serializer = pickle +else: + serializer = dill + +try: + from queue import Empty +except ImportError: + from Queue import Empty + +try: + from collections.abc import Sequence +except ImportError: + from collections import Sequence + +from .structures import PyteomicsError +from .utils import add_metaclass + + +def _keepstate(func): + """Decorator to help keep the position in open files passed as + positional arguments to functions""" + @wraps(func) + def wrapped(*args, **kwargs): + positions = [getattr(arg, 'seek', None) and getattr(arg, 'tell', type(None))() for arg in args] + for arg, pos in zip(args, positions): + if pos is not None: + arg.seek(0) + res = func(*args, **kwargs) + for arg, pos in zip(args, positions): + if pos is not None: + try: + arg.seek(pos) + except ValueError: + pass + return res + return wrapped + + +def _keepstate_method(func): + """Decorator for :py:class:`FileReader` methods to help keep the position + in the underlying file. + """ + @wraps(func) + def wrapped(self, *args, **kwargs): + position = self.tell() + self.seek(0) + try: + return func(self, *args, **kwargs) + finally: + self.seek(position) + return wrapped + + +class _file_obj(object): + """Check if `f` is a file name and open the file in `mode`. 
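+ If `f` is an open file-like object, it is used as-is and is not closed on exit.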
+ A context manager.""" + + def __init__(self, f, mode, encoding=None): + self._file_spec = None + self.mode = mode + if f is None: + self.file = {'r': sys.stdin, 'a': sys.stdout, 'w': sys.stdout + }[mode[0]] + self._file_spec = None + elif isinstance(f, basestring): + self.file = codecs.open(f, mode, encoding) + self._file_spec = f + else: + self._file_spec = f + self.file = f + self.encoding = getattr(self.file, 'encoding', encoding) + self.close_file = (self.file is not f) + + def __enter__(self): + return self + + def __reduce_ex__(self, protocol): + return self.__class__, (self._file_spec, self.mode, self.encoding) + + def __exit__(self, *args, **kwargs): + if (not self.close_file) or self._file_spec is None: + return # do nothing + # clean up + exit = getattr(self.file, '__exit__', None) + if exit is not None: + return exit(*args, **kwargs) + else: + exit = getattr(self.file, 'close', None) + if exit is not None: + exit() + + def __getattr__(self, attr): + return getattr(self.file, attr) + + def __iter__(self): + return iter(self.file) + + +class NoOpBaseReader(object): + def __init__(self, *args, **kwargs): + pass + + +class IteratorContextManager(NoOpBaseReader): + def __init__(self, *args, **kwargs): + self._func = kwargs.pop('parser_func') + self._args = args + self._kwargs = kwargs + if type(self) == IteratorContextManager: + self.reset() + super(IteratorContextManager, self).__init__(*args, **kwargs) + + def __getstate__(self): + state = {} + state['_iterator_args'] = self._args + state['_iterator_kwargs'] = self._kwargs + return state + + def __setstate__(self, state): + self._args = state['_iterator_args'] + self._kwargs = state['_iterator_kwargs'] + + def reset(self): + """Resets the iterator to its initial state.""" + try: + self._reader = self._func(*self._args, **self._kwargs) + except Exception: + self.__exit__(*sys.exc_info()) + raise + + def __enter__(self): + return self + + def __exit__(self, *args, **kwargs): + pass + + def __iter__(self): + return self + + def __next__(self): + # try: + return next(self._reader) + # except StopIteration: + # self.__exit__(None, None, None) + # raise + + next = __next__ + + +@add_metaclass(ABCMeta) +class FileReader(IteratorContextManager): + """Abstract class implementing context manager protocol + for file readers. 
+ """ + + def __init__(self, source, **kwargs): + func = kwargs['parser_func'] + super(FileReader, self).__init__(*kwargs['args'], parser_func=func, **kwargs['kwargs']) + self._pass_file = kwargs['pass_file'] + self._source_init = source + self._mode = kwargs['mode'] + self._encoding = kwargs.get('encoding') + self.reset() + + def reset(self): + if hasattr(self, '_source'): + self._source.__exit__(None, None, None) + self._source = _file_obj(self._source_init, self._mode, self._encoding) + try: + if self._pass_file: + self._reader = self._func( + self._source, *self._args, **self._kwargs) + else: + self._reader = self._func(*self._args, **self._kwargs) + except Exception: # clean up on any error + self.__exit__(*sys.exc_info()) + raise + + def __exit__(self, *args, **kwargs): + self._source.__exit__(*args, **kwargs) + + # delegate everything else to file object + def __getattr__(self, attr): + if attr == '_source': + raise AttributeError + return getattr(self._source, attr) + + +def remove_bom(bstr): + return bstr.replace(codecs.BOM_LE, b'').lstrip(b"\x00") + + +class IndexedReaderMixin(NoOpBaseReader): + """Common interface for :py:class:`IndexedTextReader` and :py:class:`IndexedXML`.""" + @property + def index(self): + return self._offset_index + + @property + def default_index(self): + return self._offset_index + + def __len__(self): + return len(self._offset_index) + + def __contains__(self, key): + return key in self._offset_index + + def _item_from_offsets(self, offsets): + raise NotImplementedError + + def get_by_id(self, elem_id): + index = self.default_index + if index is None: + raise PyteomicsError('Access by ID requires building an offset index.') + offsets = index[elem_id] + return self._item_from_offsets(offsets) + + def get_by_ids(self, ids): + return [self.get_by_id(key) for key in ids] + + def get_by_index(self, i): + try: + key = self.default_index.from_index(i, False) + except AttributeError: + raise PyteomicsError('Positional access requires building an offset index.') + return self.get_by_id(key) + + def get_by_indexes(self, indexes): + return [self.get_by_index(i) for i in indexes] + + def get_by_index_slice(self, s): + try: + keys = self.default_index.from_slice(s, False) + except AttributeError: + raise PyteomicsError('Positional access requires building an offset index.') + return self.get_by_ids(keys) + + def get_by_key_slice(self, s): + keys = self.default_index.between(s.start, s.stop) + if s.step: + keys = keys[::s.step] + return self.get_by_ids(keys) + + def __getitem__(self, key): + if isinstance(key, basestring): + return self.get_by_id(key) + if isinstance(key, int): + return self.get_by_index(key) + if isinstance(key, Sequence): + if not key: + return [] + if isinstance(key[0], int): + return self.get_by_indexes(key) + if isinstance(key[0], basestring): + return self.get_by_ids(key) + if isinstance(key, slice): + for item in (key.start, key.stop, key.step): + if item is not None: + break + if isinstance(item, int): + return self.get_by_index_slice(key) + if isinstance(item, basestring): + return self.get_by_key_slice(key) + if item is None: + return list(self) + raise PyteomicsError('Unsupported query key: {}'.format(key)) + + +class RTLocator(): + def __init__(self, reader): + self._reader = reader + + def _get_scan_by_time(self, time): + """Retrieve the scan object for the specified scan time. 
+ + Parameters + ---------- + time : float + The time to get the nearest scan from + Returns + ------- + tuple: (scan_id, scan, scan_time) + """ + if not self._reader.default_index: + raise PyteomicsError("This method requires the index. Please pass `use_index=True` during initialization") + + scan_ids = tuple(self._reader.default_index) + lo = 0 + hi = len(scan_ids) + + best_match = None + best_error = float('inf') + best_time = None + best_id = None + + if time == float('inf'): + scan = self._reader.get_by_id(scan_ids[-1]) + return scan_ids[-1], scan, self._reader._get_time(scan) + + while hi != lo: + mid = (hi + lo) // 2 + sid = scan_ids[mid] + scan = self._reader.get_by_id(sid) + scan_time = self._reader._get_time(scan) + err = abs(scan_time - time) + if err < best_error: + best_error = err + best_match = scan + best_time = scan_time + best_id = sid + if scan_time == time: + return sid, scan, scan_time + elif (hi - lo) == 1: + return best_id, best_match, best_time + elif scan_time > time: + hi = mid + else: + lo = mid + + def __getitem__(self, key): + if isinstance(key, (int, float)): + return self._get_scan_by_time(key)[1] + if isinstance(key, Sequence): + return [self._get_scan_by_time(t)[1] for t in key] + if isinstance(key, slice): + if key.start is None: + start_index = self._reader.default_index.from_index(0) + else: + start_index = self._get_scan_by_time(key.start)[0] + if key.stop is None: + stop_index = self._reader.default_index.from_index(-1) + else: + stop_index = self._get_scan_by_time(key.stop)[0] + return self._reader[start_index:stop_index:key.step] + + +class TimeOrderedIndexedReaderMixin(IndexedReaderMixin): + @property + def time(self): + return self._time + + def __init__(self, *args, **kwargs): + super(TimeOrderedIndexedReaderMixin, self).__init__(*args, **kwargs) + self._time = RTLocator(self) + + @staticmethod + def _get_time(scan): + raise NotImplementedError + + +class IndexedTextReader(IndexedReaderMixin, FileReader): + """Abstract class for text file readers that keep an index of records for random access. 
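+ Subclasses set the :attr:`delimiter` and :attr:`label` patterns used to split the file into records and extract record IDs.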
+ This requires reading the file in binary mode.""" + + delimiter = None + label = None + block_size = 1000000 + label_group = 1 + _kw_keys = ['delimiter', 'label', 'block_size', 'label_group'] + + def __init__(self, source, **kwargs): + # the underlying _file_obj gets None as encoding + # to avoid transparent decoding of StreamReader on read() calls + encoding = kwargs.pop('encoding', 'utf-8') + super(IndexedTextReader, self).__init__(source, mode='rb', encoding=None, **kwargs) + self.encoding = encoding + for attr in self._kw_keys: + if attr in kwargs: + setattr(self, attr, kwargs.pop(attr)) + self._offset_index = None + if not kwargs.pop('_skip_index', False): + self._offset_index = self.build_byte_index() + + def __getstate__(self): + state = super(IndexedTextReader, self).__getstate__() + state['offset_index'] = self._offset_index + for key in self._kw_keys: + state[key] = getattr(self, key) + return state + + def __setstate__(self, state): + super(IndexedTextReader, self).__setstate__(state) + self._offset_index = state['offset_index'] + for key in self._kw_keys: + if key in state: + setattr(self, key, state[key]) + + def _chunk_iterator(self): + fh = self._source.file + delim = remove_bom(self.delimiter.encode(self.encoding)) + buff = fh.read(self.block_size) + parts = buff.split(delim) + started_with_delim = buff.startswith(delim) + tail = parts[-1] + front = parts[:-1] + i = 0 + for part in front: + i += 1 + if part == b"": + continue + if i == 1: + if started_with_delim: + yield delim + part + else: + yield part + else: + yield delim + part + running = True + while running: + buff = fh.read(self.block_size) + if len(buff) == 0: + running = False + buff = tail + else: + buff = tail + buff + parts = buff.split(delim) + tail = parts[-1] + front = parts[:-1] + for part in front: + yield delim + part + yield delim + tail + + def _generate_offsets(self): + i = 0 + pattern = re.compile(remove_bom(self.label.encode(self.encoding))) + for chunk in self._chunk_iterator(): + match = pattern.search(chunk) + if match: + label = match.group(self.label_group) + yield i, label.decode(self.encoding), match + i += len(chunk) + yield i, None, None + + def build_byte_index(self): + index = OffsetIndex() + g = self._generate_offsets() + last_offset = 0 + last_label = None + for offset, label, keyline in g: + if last_label is not None: + index[last_label] = (last_offset, offset) + last_label = label + last_offset = offset + assert last_label is None + return index + + def _read_lines_from_offsets(self, start, end): + self._source.seek(start) + lines = self._source.read(end - start).decode(self.encoding).split('\n') + return lines + + +class IndexSavingMixin(NoOpBaseReader): + """Common interface for :py:class:`IndexSavingXML` and :py:class:`IndexSavingTextReader`.""" + _index_class = NotImplemented + + @property + def _byte_offset_filename(self): + try: + path = self._source.name + except AttributeError: + return None + name, ext = os.path.splitext(path) + byte_offset_filename = '{}-{}-byte-offsets.json'.format(name, ext[1:]) + return byte_offset_filename + + def _check_has_byte_offset_file(self): + """Check if the file at :attr:`_byte_offset_filename` exists + + Returns + ------- + bool + Whether the file exists + """ + path = self._byte_offset_filename + if path is None: + return False + return os.path.exists(path) + + @classmethod + def prebuild_byte_offset_file(cls, path): + """Construct a new XML reader, build its byte offset index and + write it to file + + Parameters + ---------- + path : str + 
The path to the file to parse + """ + with cls(path) as inst: + inst.write_byte_offsets() + + def write_byte_offsets(self): + """Write the byte offsets in :attr:`_offset_index` to the file + at :attr:`_byte_offset_filename` + """ + with open(self._byte_offset_filename, 'w') as f: + self._offset_index.save(f) + + @_keepstate_method + def _build_index(self): + """Build the byte offset index by either reading these offsets + from the file at :attr:`_byte_offset_filename`, or falling back + to the method used by :class:`IndexedXML` if this operation fails + due to an IOError + """ + if not self._use_index: return + try: + self._read_byte_offsets() + except (IOError, AttributeError, TypeError): + super(IndexSavingMixin, self)._build_index() + + def _read_byte_offsets(self): + """Read the byte offset index JSON file at :attr:`_byte_offset_filename` + and populate :attr:`_offset_index` + """ + with open(self._byte_offset_filename, 'r') as f: + index = self._index_class.load(f) + self._offset_index = index + + +def _file_reader(_mode='r'): + # a lot of the code below is borrowed from + # http://stackoverflow.com/a/14095585/1258041 + def decorator(_func): + """A decorator implementing the context manager protocol for functions + that read files. + + Note: 'close' must be in kwargs! Otherwise it won't be respected. + """ + @wraps(_func) + def helper(*args, **kwargs): + if args: + return FileReader(args[0], mode=_mode, parser_func=_func, pass_file=True, args=args[1:], kwargs=kwargs, + encoding=kwargs.pop('encoding', None)) + source = kwargs.pop('source', None) + return FileReader(source, mode=_mode, parser_func=_func, pass_file=True, args=(), kwargs=kwargs, encoding=kwargs.pop('encoding', None)) + return helper + return decorator + + +def _file_writer(_mode='w'): + def decorator(_func): + """A decorator that opens output files for writer functions. 
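+ + A minimal sketch of a decorated writer (``write_lines`` is a hypothetical example, not part of the module):: + + @_file_writer() + def write_lines(lines, output=None): + # the decorator replaces 'output' with an open file object + for line in lines: + output.write('%s\n' % line) + + # callers may pass a path, an open file object, or None (stdout): + write_lines(['foo', 'bar'], output='out.txt')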
+ """ + @wraps(_func) + def helper(*args, **kwargs): + m = kwargs.pop('file_mode', _mode) + enc = kwargs.pop('encoding', None) + if len(args) > 1: + out_arg = args[1] + else: + out_arg = kwargs.pop('output', None) + + with _file_obj(out_arg, m, encoding=enc) as out: + if len(args) > 1: + call_args = (args[0], out) + args[2:] + call_kwargs = kwargs + else: + call_args = args + call_kwargs = dict(output=out, **kwargs) + return _func(*call_args, **call_kwargs) + return helper + return decorator + + +class WritableIndex(object): + schema_version = (1, 0, 0) + _schema_version_tag_key = "@pyteomics_schema_version" + + def _serializable_container(self): + container = {'index': list(self.items())} + return container + + def save(self, fp): + container = self._serializable_container() + container[self._schema_version_tag_key] = self.schema_version + json.dump(container, fp) + + @classmethod + def load(cls, fp): + container = json.load(fp, object_hook=OrderedDict) + version_tag = container.get(cls._schema_version_tag_key) + if version_tag is None: + # The legacy case, no special processing yet + inst = cls() + inst.schema_version = None + return inst + version_tag = tuple(version_tag) + index = container.get("index") + if version_tag < cls.schema_version: + # schema upgrade case, no special processing yet + inst = cls(index) + inst.schema_version = version_tag + return inst + # no need to upgrade + return cls(index) + + +class OffsetIndex(OrderedDict, WritableIndex): + '''An augmented OrderedDict that formally wraps getting items by index + ''' + + def __init__(self, *args, **kwargs): + super(OffsetIndex, self).__init__(*args, **kwargs) + self._index_sequence = None + + def _invalidate(self): + self._index_sequence = None + + @property + def index_sequence(self): + """Keeps a cached copy of the :meth:`items` sequence + stored as a :class:`tuple` to avoid repeatedly copying + the sequence over many method calls. + + Returns + ------- + :class:`tuple` + """ + if self._index_sequence is None: + self._index_sequence = tuple(self.items()) + return self._index_sequence + + def __setitem__(self, key, value): + self._invalidate() + return super(OffsetIndex, self).__setitem__(key, value) + + def pop(self, *args, **kwargs): + self._invalidate() + return super(OffsetIndex, self).pop(*args, **kwargs) + + def find(self, key, *args, **kwargs): + return self[key] + + def from_index(self, index, include_value=False): + '''Get an entry by its integer index in the ordered sequence + of this mapping. + + Parameters + ---------- + index: int + The index to retrieve. + include_value: bool + Whether to return both the key and the value or just the key. + Defaults to :const:`False`. + + Returns + ------- + object: + If ``include_value`` is :const:`True`, a tuple of (key, value) at ``index`` + else just the key at ``index``. + ''' + items = self.index_sequence + if include_value: + return items[index] + else: + return items[index][0] + + def from_slice(self, spec, include_value=False): + '''Get a slice along index in the ordered sequence + of this mapping. + + Parameters + ---------- + spec: slice + The slice over the range of indices to retrieve + include_value: bool + Whether to return both the key and the value or just the key. 
+ Defaults to :const:`False` + + Returns + ------- + list: + If ``include_value`` is :const:`True`, a tuple of (key, value) at ``index`` + else just the key at ``index`` for each ``index`` in ``spec`` + ''' + items = self.index_sequence + return [(k, v) if include_value else k for k, v in items[spec]] + + def between(self, start, stop, include_value=False): + keys = list(self) + if start is not None: + try: + start_index = keys.index(start) + except ValueError: + raise KeyError(start) + else: + start_index = 0 + if stop is not None: + try: + stop_index = keys.index(stop) + except ValueError: + raise KeyError(stop) + else: + stop_index = len(keys) - 1 + if start is None or stop is None: + pass # won't switch indices + else: + start_index, stop_index = min(start_index, stop_index), max(start_index, stop_index) + + if include_value: + return [(k, self[k]) for k in keys[start_index:stop_index + 1]] + return keys[start_index:stop_index + 1] + + def __repr__(self): + template = "{self.__class__.__name__}({items})" + return template.format(self=self, items=list(self.items())) + + def _integrity_check(self): + indices = list(self.values()) + sorted_indices = sorted(self.values()) + return indices == sorted_indices + + def sort(self): + sorted_pairs = sorted(self.items(), key=lambda x: x[1]) + self.clear() + self._invalidate() + for key, value in sorted_pairs: + self[key] = value + return self + + +class IndexSavingTextReader(IndexSavingMixin, IndexedTextReader): + _index_class = OffsetIndex + + +class HierarchicalOffsetIndex(WritableIndex): + _inner_type = OffsetIndex + + def __init__(self, base=None): + self.mapping = defaultdict(self._inner_type) + for key, value in (base or {}).items(): + self.mapping[key] = self._inner_type(value) + + def _integrity_check(self): + for key, value in self.items(): + if not value._integrity_check(): + return False + return True + + def sort(self): + for key, value in self.items(): + value.sort() + return self + + def __getitem__(self, key): + return self.mapping[key] + + def __setitem__(self, key, value): + self.mapping[key] = value + + def __iter__(self): + return iter(self.mapping) + + def __len__(self): + return sum(len(group) for key, group in self.items()) + + def __contains__(self, key): + return key in self.mapping + + def find(self, key, element_type=None): + if element_type is None: + for element_type in self.keys(): + try: + return self.find(key, element_type) + except KeyError: + continue + raise KeyError(key) + else: + return self[element_type][key] + + def find_no_type(self, key): + """Try to find `key` in each of the lower-level indexes, returning both + value and the element type that match the key.""" + for element_type in self.keys(): + try: + return self.find(key, element_type), element_type + except KeyError: + continue + raise KeyError(key) + + def update(self, *args, **kwargs): + self.mapping.update(*args, **kwargs) + + def pop(self, key, default=None): + return self.mapping.pop(key, default) + + def keys(self): + return self.mapping.keys() + + def values(self): + return self.mapping.values() + + def items(self): + return self.mapping.items() + + def _serializable_container(self): + encoded_index = {} + container = { + 'keys': list(self.keys()) + } + for key, offset in self.items(): + encoded_index[key] = list(offset.items()) + container['index'] = encoded_index + return container + + +def _make_chain(reader, readername, full_output=False): + + def concat_results(*args, **kwargs): + results = [reader(arg, **kwargs) for arg in args] + if pd is 
not None and all(isinstance(a, pd.DataFrame) for a in args):
+            return pd.concat(results)
+        return np.concatenate(results)
+
+    def _iter(files, kwargs):
+        for f in files:
+            with reader(f, **kwargs) as r:
+                for item in r:
+                    yield item
+
+    def chain(*files, **kwargs):
+        return _iter(files, kwargs)
+
+    def from_iterable(files, **kwargs):
+        return _iter(files, kwargs)
+
+    @contextmanager
+    def _chain(*files, **kwargs):
+        yield chain(*files, **kwargs)
+
+    @contextmanager
+    def _from_iterable(files, **kwargs):
+        yield from_iterable(files, **kwargs)
+
+    def dispatch(*args, **kwargs):
+        return dispatch_from_iterable(args, **kwargs)
+
+    def dispatch_from_iterable(args, **kwargs):
+        if kwargs.get('full_output', full_output):
+            return concat_results(*args, **kwargs)
+        return _chain(*args, **kwargs)
+
+    dispatch.__doc__ = """Chain :py:func:`{0}` for several files.
+    Positional arguments should be file names or file objects.
+    Keyword arguments are passed to the :py:func:`{0}` function.
+    """.format(readername)
+    dispatch_from_iterable.__doc__ = """Chain :py:func:`{0}` for several files.
+    Keyword arguments are passed to the :py:func:`{0}` function.
+
+    Parameters
+    ----------
+    files : iterable
+        Iterable of file names or file objects.
+    """.format(readername)
+    dispatch.from_iterable = dispatch_from_iterable
+
+    return dispatch
+
+
+def _check_use_index(source, use_index, default):
+    try:
+        if use_index is not None:
+            use_index = bool(use_index)
+
+        # if a file name is given, do not override anything; short-circuit
+        if isinstance(source, basestring):
+            return use_index if use_index is not None else default
+
+        # collect information on source
+        if hasattr(source, 'seekable'):
+            seekable = source.seekable()
+        else:
+            seekable = None
+
+        if hasattr(source, 'mode'):
+            binary = 'b' in source.mode
+        else:
+            binary = None
+
+        # now check for conflicts
+        if seekable is False:
+            if binary:
+                raise PyteomicsError('Cannot work with non-seekable file in binary mode: {}.'.format(source))
+            if use_index:
+                warnings.warn('Cannot use indexing as {} is not seekable. Setting `use_index` to False.'.format(source))
+                use_index = False
+        elif binary is not None:
+            if use_index is not None and binary != use_index:
+                warnings.warn('use_index is {}, but the file mode is {}. '
+                              'Setting `use_index` to {}'.format(use_index, source.mode, binary))
+            use_index = binary
+        elif use_index is None:
+            warnings.warn('Could not check mode on {}. Specify `use_index` explicitly to avoid errors.'.format(source))
+
+        if use_index is not None:
+            return use_index
+
+        return default
+
+    except PyteomicsError:
+        raise
+    except Exception as e:
+        if use_index is None:
+            warnings.warn('Could not check mode on {}. Reason: {!r}. '
+                          'Specify `use_index` explicitly to avoid errors.'.format(source, e))
+            return default
+        return use_index
+
+
+class FileReadingProcess(mp.Process):
+    """Process that does a share of distributed work on entries read from file.
+    Reconstructs a reader object, parses entries at the given indexes,
+    optionally does additional processing, and sends results back.
+
+    The reader class must support the :py:meth:`__getitem__` dict-like lookup.
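+
+    A sketch of the worker protocol (names are illustrative; the queues and
+    pickled specs shown here are prepared by :py:meth:`TaskMappingMixin.map`)::
+
+        in_queue.put(item_key)     # the feeder thread deals out item keys
+        in_queue.put(None)         # one sentinel per worker signals completion
+        result = out_queue.get()   # results are collected out of order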
+ """ + + def __init__(self, reader_spec, target_spec, qin, qout, args_spec, kwargs_spec): + super(FileReadingProcess, self).__init__(name='pyteomics-map-worker') + self.reader_spec = reader_spec + self.target_spec = target_spec + self.args_spec = args_spec + self.kwargs_spec = kwargs_spec + self._qin = qin + self._qout = qout + # self._in_flag = in_flag + self._done_flag = mp.Event() + self.daemon = True + + def run(self): + reader = serializer.loads(self.reader_spec) + target = serializer.loads(self.target_spec) + args = serializer.loads(self.args_spec) + kwargs = serializer.loads(self.kwargs_spec) + for key in iter(self._qin.get, None): + item = reader[key] + if target is not None: + result = target(item, *args, **kwargs) + else: + result = item + self._qout.put(result) + self._done_flag.set() + + def is_done(self): + return self._done_flag.is_set() + + +try: + _NPROC = mp.cpu_count() +except NotImplementedError: + _NPROC = 4 +_QUEUE_TIMEOUT = 4 +_QUEUE_SIZE = int(1e7) + + +class TaskMappingMixin(NoOpBaseReader): + def __init__(self, *args, **kwargs): + ''' + Instantiate a :py:class:`TaskMappingMixin` object, set default parameters for IPC. + + Parameters + ---------- + + queue_timeout : float, keyword only, optional + The number of seconds to block, waiting for a result before checking to see if + all workers are done. + queue_size : int, keyword only, optional + The length of IPC queue used. + processes : int, keyword only, optional + Number of worker processes to spawn when :py:meth:`map` is called. This can also be + specified in the :py:meth:`map` call. + ''' + self._queue_size = kwargs.pop('queue_size', _QUEUE_SIZE) + self._queue_timeout = kwargs.pop('timeout', _QUEUE_TIMEOUT) + self._nproc = kwargs.pop('processes', _NPROC) + super(TaskMappingMixin, self).__init__(*args, **kwargs) + + def _get_reader_for_worker_spec(self): + return self + + def _build_worker_spec(self, target, args, kwargs): + serialized = [] + for obj, objname in [(self._get_reader_for_worker_spec(), 'reader'), (target, 'target'), (args, 'args'), + (kwargs, 'kwargs')]: + try: + serialized.append(serializer.dumps(obj)) + except serializer.PicklingError: + msg = 'Could not serialize {0} {1} with {2.__name__}.'.format(objname, obj, serializer) + if serializer is not dill: + msg += ' Try installing `dill`.' + raise PyteomicsError(msg) + return serialized + + def _spawn_workers(self, specifications, in_queue, out_queue, processes): + reader_spec, target_spec, args_spec, kwargs_spec = specifications + workers = [] + for _ in range(processes): + worker = FileReadingProcess( + reader_spec, target_spec, in_queue, out_queue, args_spec, kwargs_spec) + workers.append(worker) + return workers + + def _spawn_feeder_thread(self, in_queue, iterator, processes): + def feeder(): + for key in iterator: + in_queue.put(key) + for _ in range(processes): + in_queue.put(None) + + feeder_thread = threading.Thread(target=feeder) + feeder_thread.daemon = True + feeder_thread.start() + return feeder_thread + + def map(self, target=None, processes=-1, args=None, kwargs=None, **_kwargs): + """Execute the ``target`` function over entries of this object across up to ``processes`` + processes. + + Results will be returned out of order. + + Parameters + ---------- + target : :class:`Callable`, optional + The function to execute over each entry. It will be given a single object yielded by + the wrapped iterator as well as all of the values in ``args`` and ``kwargs`` + processes : int, optional + The number of worker processes to use. 
If 0 or negative,
+            defaults to the number of available CPUs.
+            This parameter can also be set at reader creation.
+        args : :class:`Sequence`, optional
+            Additional positional arguments to be passed to the target function
+        kwargs : :class:`Mapping`, optional
+            Additional keyword arguments to be passed to the target function
+        **_kwargs
+            Additional keyword arguments to be passed to the target function
+
+        Yields
+        ------
+        object
+            The work item returned by the target function.
+        """
+        if self._offset_index is None:
+            raise PyteomicsError('The reader needs an index for map() calls. Create the reader with `use_index=True`.')
+
+        if processes < 1:
+            processes = self._nproc
+        iterator = self._task_map_iterator()
+
+        if args is None:
+            args = tuple()
+        else:
+            args = tuple(args)
+        if kwargs is None:
+            kwargs = dict()
+        else:
+            kwargs = dict(kwargs)
+        kwargs.update(_kwargs)
+
+        serialized = self._build_worker_spec(target, args, kwargs)
+
+        in_queue = mp.Queue(self._queue_size)
+        out_queue = mp.Queue(self._queue_size)
+
+        workers = self._spawn_workers(serialized, in_queue, out_queue, processes)
+        feeder_thread = self._spawn_feeder_thread(in_queue, iterator, processes)
+        for worker in workers:
+            worker.start()
+
+        def iterate():
+            while True:
+                try:
+                    result = out_queue.get(True, self._queue_timeout)
+                    yield result
+                except Empty:
+                    if all(w.is_done() for w in workers):
+                        break
+                    else:
+                        continue
+
+            feeder_thread.join()
+            for worker in workers:
+                worker.join()
+        return iterate()
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iterable` to use when feeding work items onto the input IPC
+        queue used by :meth:`map`.
+
+        Returns
+        -------
+        :class:`Iterable`
+        """
+
+        return iter(self._offset_index.keys())
+
+
+class ChainBase(object):
+    """Chain :meth:`sequence_maker` for several sources into a
+    single iterable. Positional arguments should be sources like
+    file names or file objects. Keyword arguments are passed to
+    the :meth:`sequence_maker` function.
+ + Parameters + ---------- + sources : :class:`Iterable` + Sources for creating new sequences from, such as paths or + file-like objects + kwargs : :class:`Mapping` + Additional arguments used to instantiate each sequence + """ + + def __init__(self, *sources, **kwargs): + self.sources = sources + self.kwargs = kwargs + self._iterator = None + + @classmethod + def from_iterable(cls, sources, **kwargs): + return cls(*sources, **kwargs) + + @classmethod + def _make_chain(cls, sequence_maker): + if isinstance(sequence_maker, type): + tp = type('%sChain' % sequence_maker.__class__.__name__, (cls,), { + 'sequence_maker': sequence_maker, + '__doc__': cls.__doc__.replace(':meth:`sequence_maker`', ':class:`{}`'.format(sequence_maker.__name__)) + }) + else: + tp = type('FunctionChain', (cls,), { + 'sequence_maker': staticmethod(sequence_maker), + '__doc__': cls.__doc__.replace(':meth:`sequence_maker`', ':func:`{}`'.format(sequence_maker.__name__)) + }) + return tp + + def sequence_maker(self, file): + raise NotImplementedError() + + def _create_sequence(self, file): + return self.sequence_maker(file, **self.kwargs) + + def _iterate_over_series(self): + for f in self.sources: + with self._create_sequence(f) as r: + for item in r: + yield item + + def __enter__(self): + self._iterator = iter(self._iterate_over_series()) + return self + + def __exit__(self, *args, **kwargs): + self._iterator = None + + def __iter__(self): + return self + + def __next__(self): + if self._iterator is None: + self._iterator = self._iterate_over_series() + return next(self._iterator) + + def next(self): + return self.__next__() + + def map(self, target=None, processes=-1, queue_timeout=_QUEUE_TIMEOUT, args=None, kwargs=None, **_kwargs): + """Execute the ``target`` function over entries of this object across up to ``processes`` + processes. + + Results will be returned out of order. + + Parameters + ---------- + target : :class:`Callable`, optional + The function to execute over each entry. It will be given a single object yielded by + the wrapped iterator as well as all of the values in ``args`` and ``kwargs`` + processes : int, optional + The number of worker processes to use. If negative, the number of processes + will match the number of available CPUs. + queue_timeout : float, optional + The number of seconds to block, waiting for a result before checking to see if + all workers are done. + args : :class:`Sequence`, optional + Additional positional arguments to be passed to the target function + kwargs : :class:`Mapping`, optional + Additional keyword arguments to be passed to the target function + **_kwargs + Additional keyword arguments to be passed to the target function + + Yields + ------ + object + The work item returned by the target function. 
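+
+        A minimal sketch (the file names and the peak-counting helper are
+        illustrative; ``target`` must be picklable, and the underlying readers
+        must be indexed, e.g. created with ``use_index=True``)::
+
+            from pyteomics import mzml
+
+            def peak_count(spectrum):
+                return len(spectrum['m/z array'])
+
+            chain = mzml.chain('run1.mzML', 'run2.mzML', use_index=True)
+            for n in chain.map(peak_count):
+                print(n)  # peak counts arrive in arbitrary order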
+        """
+        for f in self.sources:
+            with self._create_sequence(f) as r:
+                for result in r.map(target, processes, queue_timeout, args, kwargs, **_kwargs):
+                    yield result
+
+
+class TableJoiner(ChainBase):
+    def concatenate(self, results):
+        if pd is not None and all(isinstance(a, pd.DataFrame) for a in results):
+            return pd.concat(results)
+        if isinstance(results[0], np.ndarray):
+            return np.concatenate(results)
+        else:
+            return np.array([b for a in results for b in a])
+
+    def _iterate_over_series(self):
+        results = [self._create_sequence(f) for f in self.sources]
+        return self.concatenate(results)
diff --git a/pyteomics/auxiliary/math.py b/pyteomics/auxiliary/math.py
new file mode 100644
index 0000000..1f1f46a
--- /dev/null
+++ b/pyteomics/auxiliary/math.py
@@ -0,0 +1,97 @@
+from .structures import PyteomicsError
+
+
+def linear_regression_vertical(x, y=None, a=None, b=None):
+    """Calculate coefficients of a linear regression y = a * x + b.
+    The fit minimizes *vertical* distances between the points and the line.
+
+    Requires :py:mod:`numpy`.
+
+    Parameters
+    ----------
+    x, y : array_like of float
+        1-D arrays of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
+    a : float, optional
+        If specified then the slope coefficient is fixed and equals a.
+    b : float, optional
+        If specified then the free term is fixed and equals b.
+
+    Returns
+    -------
+    out : 4-tuple of float
+        The structure is (a, b, r, stderr), where
+        a -- slope coefficient,
+        b -- free term,
+        r -- Pearson correlation coefficient,
+        stderr -- standard deviation.
+    """
+
+    import numpy as np
+    x = np.array(x, copy=False)
+    if y is not None:
+        y = np.array(y, copy=False)
+    else:
+        if len(x.shape) != 2 or x.shape[-1] != 2:
+            raise PyteomicsError(
+                'If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
+        y = x[:, 1]
+        x = x[:, 0]
+    if (a is not None and b is None):
+        b = (y - a * x).mean()
+    elif (a is not None and b is not None):
+        pass
+    else:
+        a, b = np.polyfit(x, y, 1)
+
+    r = np.corrcoef(x, y)[0, 1]
+    stderr = (y - a * x - b).std()
+
+    return a, b, r, stderr
+
+
+def linear_regression(x, y=None, a=None, b=None):
+    """Alias of :py:func:`linear_regression_vertical`."""
+    return linear_regression_vertical(x, y, a, b)
+
+
+def linear_regression_perpendicular(x, y=None):
+    """Calculate coefficients of a linear regression y = a * x + b.
+    The fit minimizes *perpendicular* distances between the points and the line.
+
+    Requires :py:mod:`numpy`.
+
+    Parameters
+    ----------
+    x, y : array_like of float
+        1-D arrays of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
+
+    Returns
+    -------
+    out : 4-tuple of float
+        The structure is (a, b, r, stderr), where
+        a -- slope coefficient,
+        b -- free term,
+        r -- Pearson correlation coefficient,
+        stderr -- standard deviation.
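+
+    A short usage sketch (synthetic, nearly collinear points, for illustration)::
+
+        x = [0.0, 1.0, 2.0, 3.0]
+        y = [0.1, 0.9, 2.1, 2.9]
+        a, b, r, stderr = linear_regression_perpendicular(x, y)
+        # a is close to 1 and b close to 0 for these points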
+ """ + + import numpy as np + x = np.array(x, copy=False) + if y is not None: + y = np.array(y, copy=False) + data = np.hstack((x.reshape((-1, 1)), y.reshape((-1, 1)))) + else: + if len(x.shape) != 2 or x.shape[-1] != 2: + raise PyteomicsError( + 'If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) + data = x + mu = data.mean(axis=0) + eigenvectors, eigenvalues, V = np.linalg.svd((data - mu).T, full_matrices=False) + a = eigenvectors[0][1] / eigenvectors[0][0] + xm, ym = data.mean(axis=0) + b = ym - a * xm + + r = np.corrcoef(data[:, 0], data[:, 1])[0, 1] + stderr = ((data[:, 1] - a * data[:, 0] - b) / np.sqrt(a**2 + 1)).std() + + return a, b, r, stderr diff --git a/pyteomics/auxiliary/patch.py b/pyteomics/auxiliary/patch.py new file mode 100644 index 0000000..e69de29 diff --git a/pyteomics/auxiliary/structures.py b/pyteomics/auxiliary/structures.py new file mode 100644 index 0000000..56e56e1 --- /dev/null +++ b/pyteomics/auxiliary/structures.py @@ -0,0 +1,504 @@ +import re +from collections import defaultdict, Counter +import warnings + +try: + basestring + PY2 = True +except NameError: + basestring = (str, bytes) + PY2 = False + + +_UNIT_CV_INTERN_TABLE = dict() + + +def clear_unit_cv_table(): + """Clear the module-level unit name and + controlled vocabulary accession table. + """ + _UNIT_CV_INTERN_TABLE.clear() + + +def _intern_unit_or_cv(unit_or_cv): + """Intern `unit_or_cv` in :const:`~._UNIT_CV_INTERN_TABLE`, potentially + keeping a reference to the object stored for the duration of the program. + + Parameters + ---------- + unit_or_cv : object + The value to intern + + Returns + ------- + object: + The object which `unit_or_cv` hash-equals in :const:`~._UNIT_CV_INTERN_TABLE`. + """ + if unit_or_cv is None: + return None + try: + return _UNIT_CV_INTERN_TABLE[unit_or_cv] + except KeyError: + _UNIT_CV_INTERN_TABLE[unit_or_cv] = unit_or_cv + return _UNIT_CV_INTERN_TABLE[unit_or_cv] + + +class PyteomicsError(Exception): + """Exception raised for errors in Pyteomics library. + + Attributes + ---------- + message : str + Error message. + """ + + def __init__(self, msg, *values): + self.message = msg + self.values = values + + def __str__(self): + if not self.values: + return "Pyteomics error, message: %s" % (repr(self.message),) + else: + return "Pyteomics error, message: %s %r" % (repr(self.message), self.values) + + +class Charge(int): + """A subclass of :py:class:`int`. Can be constructed from strings in "N+" + or "N-" format, and the string representation of a :py:class:`Charge` is + also in that format. + """ + def __new__(cls, *args, **kwargs): + try: + return super(Charge, cls).__new__(cls, *args) + except ValueError as e: + if isinstance(args[0], basestring): + try: + num, sign = re.match(r'^(\d+)(\+|-)$', args[0]).groups() + return super(Charge, cls).__new__(cls, sign + num, *args[1:], **kwargs) + except Exception: + pass + raise PyteomicsError(*e.args) + + def __str__(self): + return str(abs(self)) + '+-'[self < 0] + + +class Ion(str): + """Represents an Ion, right now just a subclass of String. + """ + _pattern = r'([abcxyz]\d+(\-H2O|\-NH3)?)([\+|-]\d+)' # "y2-H2O+1" + + def __init__(self, *args, **kwargs): + if args and isinstance(args[0], basestring): + try: + self.ion_type, self.neutral_loss, self.charge = re.match(self._pattern, args[0]).groups() + except Exception: + raise PyteomicsError("Malformed ion string, must match the regex {!r}".format(self._pattern)) + + +class ChargeList(list): + """Just a list of :py:class:`Charge`s. 
When printed, looks like an + enumeration of the list contents. Can also be constructed from such + strings (e.g. "2+, 3+ and 4+"). + """ + + def __init__(self, *args, **kwargs): + if args and isinstance(args[0], basestring): + delim = r'(?:,\s*)|(?:\s*and\s*)' + self.extend(map(Charge, re.split(delim, args[0]))) + else: + try: + super(ChargeList, self).__init__( + sorted(set(args[0])), *args[1:], **kwargs) + except Exception: + super(ChargeList, self).__init__(*args, **kwargs) + self[:] = map(Charge, self) + + def __str__(self): + if len(self) > 1: + return ', '.join(map(str, self[:-1])) + ' and {}'.format(self[-1]) + elif self: + return str(self[0]) + return super(ChargeList, self).__str__() + + +def _parse_charge(s, list_only=False): + if not list_only: + try: + return Charge(s) + except PyteomicsError: + pass + return ChargeList(s) + + +def _parse_ion(ion_text): + try: + return Ion(ion_text) + except Exception as e: + warnings.warn('Could not parse ion string: {} ({})'.format(ion_text, e.args[0])) + + +class BasicComposition(defaultdict, Counter): + """A generic dictionary for compositions. + Keys should be strings, values should be integers. + Allows simple arithmetics.""" + + def __init__(self, *args, **kwargs): + defaultdict.__init__(self, int) + Counter.__init__(self, *args, **kwargs) + for k, v in list(self.items()): + if not v: + del self[k] + + def __str__(self): + return '{}({})'.format(type(self).__name__, dict.__repr__(self)) + + def __repr__(self): + return str(self) + + def _repr_pretty_(self, p, cycle): + if cycle: # should never happen + p.text('{} object with a cyclic reference'.format(type(self).__name__)) + p.text(str(self)) + + def __add__(self, other): + result = self.copy() + for elem, cnt in other.items(): + result[elem] += cnt + return result + + def __iadd__(self, other): + for elem, cnt in other.items(): + self[elem] += cnt + return self + + def __radd__(self, other): + return self + other + + def __sub__(self, other): + result = self.copy() + for elem, cnt in other.items(): + result[elem] -= cnt + return result + + def __isub__(self, other): + for elem, cnt in other.items(): + self[elem] -= cnt + return self + + def __rsub__(self, other): + return (self - other) * (-1) + + def __mul__(self, other): + if not isinstance(other, int): + raise PyteomicsError('Cannot multiply Composition by non-integer', + other) + return type(self)({k: v * other for k, v in self.items()}) + + def __imul__(self, other): + if not isinstance(other, int): + raise PyteomicsError('Cannot multiply Composition by non-integer', + other) + for elem in self: + self[elem] *= other + return self + + def __rmul__(self, other): + return self * other + + def __eq__(self, other): + if not isinstance(other, dict): + return False + self_items = {i for i in self.items() if i[1]} + other_items = {i for i in other.items() if i[1]} + return self_items == other_items + + # override default behavior: + # we don't want to add 0's to the dictionary + def __missing__(self, key): + return 0 + + def __setitem__(self, key, value): + if isinstance(value, float): + value = int(round(value)) + elif not isinstance(value, int): + raise PyteomicsError('Only integers allowed as values in ' + 'Composition, got {}.'.format(type(value).__name__)) + if value: # reject 0's + super(BasicComposition, self).__setitem__(key, value) + elif key in self: + del self[key] + + def copy(self): + return type(self)(self) + + def __reduce__(self): + class_, args, state, list_iterator, dict_iterator = super( + BasicComposition, 
self).__reduce__()
+        # Override the reduce of defaultdict so we do not provide the
+        # `int` type as the first argument, which would prevent the
+        # object from being unpickled correctly.
+        args = ()
+        return class_, args, state, list_iterator, dict_iterator
+
+
+class _MappingOverAttributeProxy(object):
+    '''A replacement for __dict__ for unpickling an object which now
+    has __slots__ but previously did not.'''
+
+    def __init__(self, obj):
+        self.obj = obj
+
+    def __getitem__(self, key):
+        return getattr(self.obj, key)
+
+    def __setitem__(self, key, value):
+        setattr(self.obj, key, value)
+
+    def __contains__(self, key):
+        return hasattr(self.obj, key)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.obj})".format(self=self)
+
+
+class unitint(int):
+    '''Represents an integer value with a unit name.
+
+    Behaves identically to a built-in :class:`int` type.
+
+    Attributes
+    ----------
+    unit_info : :class:`str`
+        The name of the unit this value possesses.
+    '''
+    def __new__(cls, value, unit_info=None):
+        inst = int.__new__(cls, value)
+        inst.unit_info = unit_info
+        return inst
+
+    def __reduce__(self):
+        return self.__class__, (int(self), self.unit_info)
+
+    def _repr_pretty_(self, p, cycle):
+        base = super(unitint, self).__repr__()
+        if self.unit_info:
+            string = "%s %s" % (base, self.unit_info)
+        else:
+            string = base
+        p.text(string)
+
+
+class unitfloat(float):
+    '''Represents a float value with a unit name.
+
+    Behaves identically to a built-in :class:`float` type.
+
+    Attributes
+    ----------
+    unit_info : :class:`str`
+        The name of the unit this value possesses.
+    '''
+    __slots__ = ('unit_info', )
+
+    def __new__(cls, value, unit_info=None):
+        inst = float.__new__(cls, value)
+        inst.unit_info = unit_info
+        return inst
+
+    @property
+    def __dict__(self):
+        return _MappingOverAttributeProxy(self)
+
+    def __reduce__(self):
+        return self.__class__, (float(self), self.unit_info)
+
+    def _repr_pretty_(self, p, cycle):
+        base = super(unitfloat, self).__repr__()
+        if self.unit_info:
+            string = "%s %s" % (base, self.unit_info)
+        else:
+            string = base
+        p.text(string)
+
+
+class unitstr(str):
+    '''Represents a string value with a unit name.
+
+    Behaves identically to a built-in :class:`str` type.
+
+    Attributes
+    ----------
+    unit_info : :class:`str`
+        The name of the unit this value possesses.
+    '''
+    if not PY2:
+        __slots__ = ("unit_info", )
+
+    def __new__(cls, value, unit_info=None):
+        if PY2 and isinstance(value, unicode):
+            value = value.encode('utf-8')
+        inst = str.__new__(cls, value)
+        inst.unit_info = unit_info
+        return inst
+
+    @property
+    def __dict__(self):
+        return _MappingOverAttributeProxy(self)
+
+    def __reduce__(self):
+        return self.__class__, (str(self), self.unit_info)
+
+    def _repr_pretty_(self, p, cycle):
+        base = super(unitstr, self).__repr__()
+        if self.unit_info:
+            string = "%s %s" % (base, self.unit_info)
+        else:
+            string = base
+        p.text(string)
+
+
+class cvstr(str):
+    '''A helper class to associate a controlled vocabulary accession
+    number with an otherwise plain :class:`str` object.
+
+    Attributes
+    ----------
+    accession : str
+        The accession number for this parameter, e.g.
MS:1000040 + unit_accession : str + The accession number for the unit of the value, if any + ''' + + if not PY2: + __slots__ = ('accession', 'unit_accession') + + _cache = {} + + def __new__(cls, value, accession=None, unit_accession=None): + try: + inst = cls._cache[value] + if inst.accession == accession and inst.unit_accession == unit_accession: + return inst + except KeyError: + pass + + if PY2 and isinstance(value, unicode): + value = value.encode('utf-8') + inst = str.__new__(cls, value) + inst.accession = _intern_unit_or_cv(accession) + inst.unit_accession = _intern_unit_or_cv(unit_accession) + cls._cache[value] = inst + return inst + + @property + def __dict__(self): + return _MappingOverAttributeProxy(self) + + def __reduce__(self): + return self.__class__, (str(self), self.accession, self.unit_accession) + + +class CVQueryEngine(object): + '''Traverse an arbitrarily nested dictionary looking + for keys which are :class:`cvstr` instances, or objects + with an attribute called ``accession``. + ''' + + def _accession(self, key): + return getattr(key, 'accession', None) + + def _query_dict(self, data, accession): + for key, value in data.items(): + if self._accession(key) == accession: + if not isinstance(value, str) or value != '': + return value + else: + return key + elif isinstance(value, dict): + inner = self._query_dict(value, accession) + if inner is not None: + return inner + elif isinstance(value, (list, tuple)): + inner = self._query_sequence(value, accession) + if inner is not None: + return inner + elif self._accession(value) == accession: + return value + + def _query_sequence(self, data, accession): + for value in data: + if isinstance(value, dict): + inner = self._query_dict(value, accession) + if inner is not None: + return inner + elif isinstance(value, (list, tuple)): + inner = self._query_sequence(value, accession) + if inner is not None: + return inner + elif self._accession(value) == accession: + return value + + def query(self, data, accession): + '''Search ``data`` for a key with the accession + number ``accession``. Returns :const:`None` if + not found. + ''' + if accession is None: + raise TypeError("`accession` cannot be None") + return self._query_dict(data, accession) + + def _is_empty(self, value): + if isinstance(value, basestring): + return value == '' + return False + + def _walk_dict(self, data, index): + for key, value in data.items(): + accession = self._accession(key) + if accession: + if not self._is_empty(value): + index[accession] = value + else: + index[accession] = key + elif isinstance(value, dict): + self._walk_dict(value, index) + elif isinstance(value, (list, tuple)): + self._walk_sequence(value, index) + accession = self._accession(value) + if accession: + index[accession] = value + return index + + def _walk_sequence(self, data, index): + for value in data: + if isinstance(value, dict): + self._walk_dict(value, index) + elif isinstance(value, (list, tuple)): + self._walk_sequence(value, index) + else: + accession = self._accession(value) + if accession: + index[accession] = value + + def index(self, data): + '''Construct a flat :class:`dict` whose keys are the + accession numbers for all qualified keys in ``data`` + and whose values are the mapped values from ``data``. + ''' + index = self._walk_dict(data, {}) + return index + + def __call__(self, data, accession=None): + '''If ``accession`` is :const:`None`, calls + :meth:`index` on ``data``, otherwise calls + :meth:`query` with ``data`` and ``accession``. 
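+
+        A small sketch (``MS:1000016`` is the PSI-MS accession for
+        "scan start time"; the dictionary mimics a parsed param tree)::
+
+            data = {cvstr('scan start time', accession='MS:1000016'): 42.0}
+            cvquery(data, 'MS:1000016')  # -> 42.0
+            cvquery(data)                # -> {'MS:1000016': 42.0}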
+ ''' + if accession is None: + return self.index(data) + else: + return self.query(data, accession) + +'''A ready-to-use instance of :class:`~.CVQueryEngine`''' +cvquery = CVQueryEngine() diff --git a/pyteomics/auxiliary/target_decoy.py b/pyteomics/auxiliary/target_decoy.py new file mode 100644 index 0000000..3c563de --- /dev/null +++ b/pyteomics/auxiliary/target_decoy.py @@ -0,0 +1,997 @@ +from __future__ import absolute_import +import re +import operator as op +import math + +try: + basestring +except NameError: + basestring = (str, bytes) + +try: + from collections.abc import Container, Sized +except ImportError: + from collections import Container, Sized +from bisect import bisect_right +from contextlib import contextmanager +try: + import pandas as pd +except ImportError: + pd = None + +from .structures import PyteomicsError +from .file_helpers import _keepstate, IteratorContextManager, _make_chain, ChainBase, TableJoiner + + +def _fix_docstring(f, **defaults): + for argname, v in defaults.items(): + if v is not None: + f.__doc__ = re.sub('{} : .*'.format(argname), + lambda m: m.group() + ', optional', f.__doc__) + + +def _calculate_qvalues(scores, isdecoy, peps=False, **kwargs): + """Actual q-value calculation. + + Parameters + ---------- + scores : numpy.ndarray + Sorted array of PSMs. + isdecoy : numpy.ndarray + Sorted array of bools (decoy/target) or floats (PEPs). + + Returns + ------- + out : numpy.ndarray + Calculated q-values. + """ + correction = kwargs.pop('correction', 0) + ratio = kwargs.pop('ratio', 1) + if ratio == 0: + raise PyteomicsError('Size ratio cannot be zero!') + remove_decoy = kwargs.get('remove_decoy', False) + formula = kwargs.pop('formula', (2, 1)[bool(remove_decoy)]) + if formula not in {1, 2}: + raise PyteomicsError('`formula` must be either 1 or 2') + + # score_label = kwargs['score_label'] + cumsum = isdecoy.cumsum(dtype=np.float64) + tfalse = cumsum.copy() + ind = np.arange(1., scores.shape[0] + 1., dtype=np.float64) + + if peps: + q = cumsum / ind + else: + if isinstance(correction, int): + if correction == 1: + tfalse += 1 + elif correction == 2: + p = 1. / (1. + ratio) + targ = ind - cumsum + for i in range(tfalse.size): + tfalse[i] = _expectation(cumsum[i], targ[i], p) + elif 0 < correction < 1: + p = 1. / (1. 
+ ratio) + targ = ind - cumsum + for i in range(tfalse.size): + tfalse[i] = _confidence_value( + correction, cumsum[i], targ[i], p) + elif correction: + raise PyteomicsError('Invalid value for `correction`: {}.'.format(correction)) + + if formula == 1: + q = tfalse / (ind - cumsum) / ratio + else: + q = (cumsum + tfalse / ratio) / ind + + # Make sure that q-values are equal for equal scores (conservatively) + # and that q-values are monotonic + for i in range(scores.size - 1, 0, -1): + if (scores[i] == scores[i - 1] or q[i - 1] > q[i]): + q[i - 1] = q[i] + + return q + + +def _qvalues_df(psms, keyf, isdecoy, **kwargs): + full = kwargs.get('full_output', False) + remove_decoy = kwargs.get('remove_decoy', False) + peps = kwargs.get('pep') + decoy_or_pep_label = _decoy_or_pep_label(**kwargs) + q_label = kwargs.setdefault('q_label', 'q') + score_label = kwargs.setdefault('score_label', 'score') + if callable(keyf): + keyf = psms.apply(keyf, axis=1) + if callable(isdecoy): + isdecoy = psms.apply(isdecoy, axis=1) + if not isinstance(keyf, basestring): + if psms.shape[0]: + psms[score_label] = keyf + else: + psms[score_label] = [] + keyf = kwargs['score_label'] + if not isinstance(isdecoy, basestring): + if psms.shape[0]: + psms[decoy_or_pep_label] = isdecoy + else: + psms[decoy_or_pep_label] = [] + isdecoy = decoy_or_pep_label + reverse = kwargs.get('reverse', False) + + if not full: # create fields early + if peps is None: + fields = [(keyf, np.float64), (isdecoy, np.bool_), + (q_label, np.float64)] + else: + fields = [(isdecoy, np.float64), (q_label, np.float64)] + dtype = np.dtype(fields) + + psms.sort_values([keyf, isdecoy], ascending=[ + not reverse, True], inplace=True) + + if not psms.shape[0]: + if full: + psms[q_label] = [] + return psms + else: + return np.array([], dtype=dtype) + + q = _calculate_qvalues(psms[keyf].values, psms[ + isdecoy].values, peps is not None, **kwargs) + if remove_decoy: + q = q[~psms[isdecoy].values] + psms = psms[~psms[isdecoy]].copy() + if not full: + psms_ = np.empty_like(q, dtype=dtype) + if peps is None: + psms_[keyf] = psms[keyf] + psms_[isdecoy] = psms[isdecoy] + psms_[q_label] = q + psms = psms_ + else: + q_label = kwargs['q_label'] + psms[q_label] = q + return psms + + +def _decoy_or_pep_label(**kwargs): + peps = kwargs.get('pep') + return kwargs.get('decoy_label', 'is decoy') if peps is None else kwargs.get( + 'pep_label', peps if isinstance(peps, basestring) else 'PEP') + + +def _construct_dtype(*args, **kwargs): + full = kwargs.pop('full_output', False) + peps = kwargs.get('pep') + q_label = kwargs.setdefault('q_label', 'q') + score_label = kwargs.setdefault('score_label', 'score') + + fields = [(score_label, np.float64), + (_decoy_or_pep_label(**kwargs), + np.bool_ if peps is None else np.float64), + (q_label, np.float64)] + # if all args are NumPy arrays with common dtype, use it in the output + if full: + dtypes = {getattr(arg, 'dtype', None) for arg in args} + if len(dtypes) == 1 and None not in dtypes: + psm_dtype = dtypes.pop() + else: + psm_dtype = np.object_ + dtype = np.dtype(fields + [('psm', psm_dtype)]) + else: + dtype = np.dtype(fields) + return dtype + + +def _make_qvalues(read, is_decoy_prefix, is_decoy_suffix, key): + """Create a function that reads PSMs from a file and calculates q-values + for each value of `key`.""" + + def qvalues(*args, **kwargs): + """Read `args` and return a NumPy array with scores and q-values. + q-values are calculated either using TDA or based on provided values of PEP. 
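+
+        A minimal sketch (toy PSM dicts with illustrative field names)::
+
+            psms = [{'score': 0.1, 'decoy': False}, {'score': 0.8, 'decoy': True}]
+            q = qvalues(psms, key=lambda psm: psm['score'],
+                        is_decoy=lambda psm: psm['decoy'])
+            # q is a record array; q['q'] holds the q-values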
+ + Requires :py:mod:`numpy` (and optionally :py:mod:`pandas`). + + Parameters + ---------- + + positional args : file or str + Files to read PSMs from. All positional arguments are treated as + files. The rest of the arguments must be named. + + key : callable / array-like / iterable / str, keyword only + If callable, a function used for sorting of PSMs. Should accept + exactly one argument (PSM) and return a number (the smaller the better). + If array-like, should contain scores for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`DataFrame`). + + .. warning:: + The default function may not work + with your files, because format flavours are diverse. + + reverse : bool, keyword only, optional + If :py:const:`True`, then PSMs are sorted in descending order, + i.e. the value of the key function is higher for better PSMs. + Default is :py:const:`False`. + + is_decoy : callable / array-like / iterable / str, keyword only + If callable, a function used to determine if the PSM is decoy or not. + Should accept exactly one argument (PSM) and return a truthy value if the + PSM should be considered decoy. + If array-like, should contain boolean values for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`DataFrame`). + + .. warning:: + The default function may not work + with your files, because format flavours are diverse. + + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. + + pep : callable / array-like / iterable / str, keyword only, optional + If callable, a function used to determine the posterior error probability (PEP). + Should accept exactly one argument (PSM) and return a float. + If array-like, should contain float values for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`DataFrame`). + + .. note:: If this parameter is given, then PEP values will be used to calculate + q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with: + `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`. + `key` can still be provided. Without `key`, PSMs will be sorted by PEP. + + remove_decoy : bool, keyword only, optional + Defines whether decoy matches should be removed from the output. + Default is :py:const:`False`. + + .. note:: If set to :py:const:`False`, then by default the decoy + PSMs will be taken into account when estimating FDR. Refer to the + documentation of :py:func:`fdr` for math; basically, if + `remove_decoy` is :py:const:`True`, then formula 1 is used + to control output FDR, otherwise it's formula 2. This can be + changed by overriding the `formula` argument. + + formula : int, keyword only, optional + Can be either 1 or 2, defines which formula should be used for FDR + estimation. Default is 1 if `remove_decoy` is :py:const:`True`, + else 2 (see :py:func:`fdr` for definitions). 
+ + ratio : float, keyword only, optional + The size ratio between the decoy and target databases. Default is + 1. In theory, the "size" of the database is the number of + theoretical peptides eligible for assignment to spectra that are + produced by *in silico* cleavage of that database. + + correction : int or float, keyword only, optional + Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. + + 0 (default): no correction; + + 1: enable "+1" correction. This accounts for the probability that a false + positive scores better than the first excluded decoy PSM; + + 2: this also corrects that probability for finite size of the sample, + so the correction will be slightly less than "+1". + + If a floating point number + is given, then instead of the expectation value for the number of false PSMs, + the confidence value is used. The value of `correction` is then interpreted as + desired confidence level. E.g., if correction=0.95, then the calculated q-values + do not exceed the "real" q-values with 95% probability. + + See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. + + q_label : str, optional + Field name for q-value in the output. Default is ``'q'``. + + score_label : str, optional + Field name for score in the output. Default is ``'score'``. + + decoy_label : str, optional + Field name for the decoy flag in the output. Default is ``'is decoy'``. + + pep_label : str, optional + Field name for PEP in the output. Default is ``'PEP'``. + + full_output : bool, keyword only, optional + If :py:const:`True`, then the returned array has PSM objects along + with scores and q-values. Default is :py:const:`False`. + + **kwargs : passed to the :py:func:`chain` function. + + Returns + ------- + out : numpy.ndarray + A sorted array of records with the following fields: + + - 'score': :py:class:`np.float64` + - 'is decoy': :py:class:`np.bool_` + - 'q': :py:class:`np.float64` + - 'psm': :py:class:`np.object_` (if `full_output` is :py:const:`True`) + """ + import numpy as np + + @_keepstate + def get_scores(*args, **kwargs): + scores = [] + with read(*args, **kwargs) as f: + for i, psm in enumerate(f): + row = [] + for func in (keyf, isdecoy): + if callable(func): + row.append(func(psm)) + elif isinstance(func, basestring): + row.append(psm[func]) + else: + row.append(func[i]) + row.append(None) + if full: + row.append(psm) + scores.append(tuple(row)) + return scores + + peps = kwargs.get('pep', None) + if peps is not None: + x = {'is_decoy', 'remove_decoy', 'formula', + 'ratio', 'correction'}.intersection(kwargs) + if x: + raise PyteomicsError( + "Can't use these parameters with `pep`: " + ', '.join(x)) + keyf = kwargs.pop('key', key) + reverse = kwargs.get('reverse', False) + if keyf is None: + keyf = peps + if reverse: + raise PyteomicsError( + 'reverse = True when using PEPs for sorting') + + if not callable(keyf) and not isinstance(keyf, (Sized, Container)): + keyf = np.array(list(keyf)) + + if peps is None: + if 'is_decoy' not in kwargs: + if 'decoy_suffix' in kwargs: + isdecoy = lambda x: is_decoy_suffix(x, kwargs['decoy_suffix']) + elif 'decoy_prefix' in kwargs: + isdecoy = lambda x: is_decoy_prefix(x, kwargs['decoy_prefix']) + else: + isdecoy = is_decoy_prefix + else: + isdecoy = kwargs['is_decoy'] + else: + isdecoy = peps + + if not callable(isdecoy) and not isinstance(isdecoy, (Sized, Container)): + isdecoy = np.array(list(isdecoy)) + + remove_decoy = kwargs.get('remove_decoy', False) + decoy_or_pep_label = 
_decoy_or_pep_label(**kwargs) + score_label = kwargs.setdefault('score_label', 'score') + q_label = kwargs.setdefault('q_label', 'q') + dtype = _construct_dtype(*args, **kwargs) + full = kwargs.get('full_output', False) + arr_flag = False + psms = None + + # time to check arg type + if pd is not None and all(isinstance(arg, pd.DataFrame) for arg in args): + psms = pd.concat(args) + return _qvalues_df(psms, keyf, isdecoy, **kwargs) + + if not all(isinstance(arg, np.ndarray) for arg in args): + if isinstance(keyf, basestring): + keyf = op.itemgetter(keyf) + if isinstance(isdecoy, basestring): + isdecoy = op.itemgetter(isdecoy) + if isinstance(peps, basestring): + peps = op.itemgetter(peps) + + if callable(keyf) or callable(isdecoy): + kwargs.pop('full_output', None) + scores = np.array(get_scores(*args, **kwargs), dtype=dtype) + else: + if all(isinstance(arg, np.ndarray) for arg in args): + psms = np.concatenate(args) + + if not isinstance(keyf, basestring): + keyf = np.array(keyf) + arr_flag = True + if not isinstance(isdecoy, basestring): + isdecoy = np.array(isdecoy) + arr_flag = True + + if arr_flag: + scores = np.empty(keyf.size if hasattr( + keyf, 'size') else isdecoy.size, dtype=dtype) + for func, label in zip((keyf, isdecoy), (score_label, decoy_or_pep_label)): + if not isinstance(func, basestring): + scores[label] = func + else: + scores[label] = psms[func] + else: + scores = np.empty(psms.shape[0], dtype=dtype) + scores[score_label] = psms[keyf] + scores[decoy_or_pep_label] = psms[isdecoy] + + if not scores.size: + if full and psms is not None: + return psms + return scores + + if not reverse: + keys = scores[decoy_or_pep_label], scores[score_label] + else: + keys = scores[decoy_or_pep_label], -scores[score_label] + lexsort = np.lexsort(keys) + scores = scores[lexsort] + if psms is not None: + psms = psms[lexsort] + + scores[q_label] = _calculate_qvalues(scores[score_label], scores[ + decoy_or_pep_label], peps is not None, **kwargs) + if remove_decoy: + if psms is not None: + psms = psms[~scores[decoy_or_pep_label]] + scores = scores[~scores[decoy_or_pep_label]] + + if full and psms is not None: + if isinstance(psms, np.ndarray): + fields = sorted(psms.dtype.fields, + key=lambda x: psms.dtype.fields[x][1]) + extra = [] + for func, label in zip((keyf, isdecoy), ('score', decoy_or_pep_label)): + if not (isinstance(func, basestring) or label in psms.dtype.fields): + extra.append(label) + elif label in psms.dtype.fields: + psms[label] = scores[label] + newdt = [(name, psms.dtype.fields[name][0]) for name in fields] + [ + (name, np.float64) for name in extra] + [(q_label, np.float64)] + psms_ = psms + psms = np.empty_like(psms_, dtype=newdt) + for f in fields: + psms[f] = psms_[f] + for f in extra: + psms[f] = scores[f] + else: + for func, label in zip((keyf, isdecoy), ('score', decoy_or_pep_label)): + if not isinstance(label, basestring): + psms[label] = scores[label] + psms[q_label] = scores[q_label] + return psms + return scores + + _fix_docstring(qvalues, is_decoy=is_decoy_prefix, key=key) + if read is _iter: + qvalues.__doc__ = qvalues.__doc__.replace("""positional args : file or str + Files to read PSMs from. All positional arguments are treated as + files.""", """positional args : iterables + Iterables to read PSMs from. All positional arguments are chained.""" + ).replace("""\n .. warning:: + The default function may not work + with your files, because format flavours are diverse. 
+ + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "") + + return qvalues + + +def _make_filter(read, is_decoy_prefix, is_decoy_suffix, key, qvalues): + """Create a function that reads PSMs from a file and filters them to + the desired FDR level (estimated by TDA), returning the top PSMs + sorted by `key`. + """ + def filter(*args, **kwargs): + try: + fdr = kwargs.pop('fdr') + except KeyError: + raise PyteomicsError('Keyword argument required: fdr') + + args = [list(arg) if not isinstance( + arg, (Container, Sized)) else arg for arg in args] + peps = kwargs.get('pep') + if peps is None: + remove_decoy = kwargs.pop('remove_decoy', True) + scores = qvalues(*args, remove_decoy=remove_decoy, **kwargs) + else: + scores = qvalues(*args, **kwargs) + keyf = kwargs.pop('key', key) + if keyf is None: + keyf = peps + reverse = kwargs.pop('reverse', False) + better = [op.lt, op.gt][bool(reverse)] + if 'is_decoy' not in kwargs: + if 'decoy_suffix' in kwargs: + isdecoy = lambda x: is_decoy_suffix(x, kwargs['decoy_suffix']) + elif 'decoy_prefix' in kwargs: + isdecoy = lambda x: is_decoy_prefix(x, kwargs['decoy_prefix']) + else: + isdecoy = is_decoy_prefix + else: + isdecoy = kwargs['is_decoy'] + kwargs.pop('formula', None) + decoy_or_pep_label = _decoy_or_pep_label(**kwargs) + score_label = kwargs.setdefault('score_label', 'score') + q_label = kwargs.get('q_label', 'q') + + try: + i = scores[q_label].searchsorted(fdr, side='right') + if isinstance(i, Sized): + i = i[0] + except AttributeError: + i = bisect_right(scores['q'], fdr) + if kwargs.pop('full_output', False): + if pd is not None and isinstance(scores, pd.DataFrame): + return scores.iloc[:i] + elif callable(keyf) or callable(isdecoy): + return scores['psm'][:i] + else: + return scores[:i] + elif not scores.size: + return (_ for _ in ()) + if peps is None: + label = score_label + else: + label = decoy_or_pep_label + cutoff = scores[label][i] if i < scores.size else ( + scores[label][-1] + (1, -1)[bool(reverse)]) + + def out(): + with read(*args, **kwargs) as f: + for p, s in zip(f, scores): + if peps is not None or not remove_decoy or not s[decoy_or_pep_label]: + if better(s[label], cutoff): + yield p + return out() + + def _filter(*args, **kwargs): + """Read `args` and yield only the PSMs that form a set with + estimated false discovery rate (FDR) not exceeding `fdr`. + + Requires :py:mod:`numpy` and, optionally, :py:mod:`pandas`. + + Parameters + ---------- + positional args : file or str + Files to read PSMs from. All positional arguments are treated as + files. The rest of the arguments must be named. + + fdr : float, keyword only, 0 <= fdr <= 1 + Desired FDR level. + + key : callable / array-like / iterable / str, keyword only + A function used for sorting of PSMs. Should accept exactly one + argument (PSM) and return a number (the smaller the better). The + default is a function that tries to extract e-value from the PSM. + + .. 
warning:: + The default function may not work + with your files, because format flavours are diverse. + + reverse : bool, keyword only, optional + If :py:const:`True`, then PSMs are sorted in descending order, + i.e. the value of the key function is higher for better PSMs. + Default is :py:const:`False`. + + is_decoy : callable / array-like / iterable / str, keyword only + A function used to determine if the PSM is decoy or not. Should + accept exactly one argument (PSM) and return a truthy value if the + PSM should be considered decoy. + + .. warning:: + The default function may not work + with your files, because format flavours are diverse. + + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. + + remove_decoy : bool, keyword only, optional + Defines whether decoy matches should be removed from the output. + Default is :py:const:`True`. + + .. note:: If set to :py:const:`False`, then by default the decoy + PSMs will be taken into account when estimating FDR. Refer to the + documentation of :py:func:`fdr` for math; basically, if + `remove_decoy` is :py:const:`True`, then formula 1 is used + to control output FDR, otherwise it's formula 2. This can be + changed by overriding the `formula` argument. + + formula : int, keyword only, optional + Can be either 1 or 2, defines which formula should be used for FDR + estimation. Default is 1 if `remove_decoy` is :py:const:`True`, + else 2 (see :py:func:`fdr` for definitions). + + ratio : float, keyword only, optional + The size ratio between the decoy and target databases. Default is + 1. In theory, the "size" of the database is the number of + theoretical peptides eligible for assignment to spectra that are + produced by *in silico* cleavage of that database. + + correction : int or float, keyword only, optional + Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. + + 0 (default): no correction; + + 1: enable "+1" correction. This accounts for the probability that a false + positive scores better than the first excluded decoy PSM; + + 2: this also corrects that probability for finite size of the sample, + so the correction will be slightly less than "+1". + + If a floating point number + is given, then instead of the expectation value for the number of false PSMs, + the confidence value is used. The value of `correction` is then interpreted as + desired confidence level. E.g., if correction=0.95, then the calculated q-values + do not exceed the "real" q-values with 95% probability. + + See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. + + pep : callable / array-like / iterable / str, keyword only, optional + If callable, a function used to determine the posterior error probability (PEP). + Should accept exactly one argument (PSM) and return a float. + If array-like, should contain float values for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`DataFrame`). + + .. 
note:: If this parameter is given, then PEP values will be used to calculate + q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with: + `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`. + `key` can still be provided. Without `key`, PSMs will be sorted by PEP. + + full_output : bool, keyword only, optional + If :py:const:`True`, then an array of PSM objects is returned. + Otherwise, an iterator / context manager object is returned, and the + files are parsed twice. This saves some RAM, but is ~2x slower. + Default is :py:const:`True`. + + .. note:: The name for the parameter comes from the fact that it is + internally passed to :py:func:`qvalues`. + + q_label : str, optional + Field name for q-value in the output. Default is ``'q'``. + + score_label : str, optional + Field name for score in the output. Default is ``'score'``. + + decoy_label : str, optional + Field name for the decoy flag in the output. Default is ``'is decoy'``. + + pep_label : str, optional + Field name for PEP in the output. Default is ``'PEP'``. + + **kwargs : passed to the :py:func:`chain` function. + + Returns + ------- + out : iterator or :py:class:`numpy.ndarray` or :py:class:`pandas.DataFrame` + """ + if kwargs.pop('full_output', True): + return filter(*args, full_output=True, **kwargs) + return IteratorContextManager(*args, parser_func=filter, **kwargs) + + _fix_docstring(_filter, is_decoy=is_decoy_prefix, key=key) + if read is _iter: + _filter.__doc__ = _filter.__doc__.replace("""positional args : file or str + Files to read PSMs from. All positional arguments are treated as + files.""", """positional args : iterables + Iterables to read PSMs from. All positional arguments are chained.""").replace( + """\n .. warning:: + The default function may not work + with your files, because format flavours are diverse. + + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. 
Mutually exclusive with `decoy_prefix`.\n""", "") + return _filter + + +@contextmanager +def _itercontext(x, **kw): + try: + yield (row for i, row in x.iterrows()) + except AttributeError: + yield x + + +# _iter = _make_chain(_itercontext, 'iter') +_iter = ChainBase._make_chain(_itercontext) +qvalues = _make_qvalues(_iter, None, None, None) + +filter = _make_filter(_iter, None, None, None, qvalues) +filter.chain = _make_chain(filter, 'filter', True) +# filter.chain = TableJoiner._make_chain(filter) + + +try: + import numpy as np + _precalc_fact = np.log([math.factorial(n) for n in range(20)]) + + def log_factorial(x): + x = np.array(x) + pf = _precalc_fact + m = (x >= pf.size) + out = np.empty(x.shape) + out[~m] = pf[x[~m].astype(int)] + x = x[m] + out[m] = x * np.log(x) - x + 0.5 * np.log(2 * np.pi * x) + return out + + def _expectation(d, T, p=0.5): + if T is None: + return d + 1 + T = np.array(T, dtype=int) + m = np.arange(T.max() + 1, dtype=int) + pi = np.exp(_log_pi(d, m, p)) + return ((m * pi).cumsum() / pi.cumsum())[T] + + def _confidence_value(conf, d, T, p=0.5): + if T is not None: + T = np.array(T, dtype=int) + m = np.arange(T.max() + 1, dtype=int) + else: + m = np.arange(max(50 * d, 10000)) + log_pi = _log_pi(d, m, p) + pics = np.exp(log_pi).cumsum() + return np.searchsorted(pics, conf * (pics[T] if T is not None else 1)) + +except ImportError: + def log_factorial(n): + if n > 10: + return n * math.log(n) - n + 0.5 * math.log(2 * math.pi * n) + else: + return math.log(math.factorial(n)) + + def _expectation(*a, **k): + raise NotImplementedError('NumPy required') + + def _confidence_value(*a, **k): + raise NotImplementedError('NumPy required') + + +def _log_pi_r(d, k, p=0.5): + return k * math.log(p) + log_factorial(k + d) - log_factorial(k) - log_factorial(d) + + +def _log_pi(d, k, p=0.5): + return _log_pi_r(d, k, p) + (d + 1) * math.log(1 - p) + + +def _count_psms(psms, is_decoy, pep, decoy_prefix, decoy_suffix, is_decoy_prefix, is_decoy_suffix): + total, decoy = 0, 0 + if pep is not None: + is_decoy = pep + elif is_decoy is None: + if decoy_suffix is not None: + is_decoy = lambda x: is_decoy_suffix(x, decoy_suffix) + else: + is_decoy = lambda x: is_decoy_prefix(x, decoy_prefix) + if isinstance(is_decoy, basestring): + decoy = psms[is_decoy].sum() + total = psms.shape[0] + elif callable(is_decoy): + for psm in psms: + total += 1 + d = is_decoy(psm) + decoy += d if pep is not None else bool(d) + else: + if not isinstance(is_decoy, (Sized, Container)): + is_decoy = list(is_decoy) + if pep is not None: + decoy = sum(is_decoy) + else: + decoy = sum(map(bool, is_decoy)) + total = len(is_decoy) + return decoy, total + + +def _make_fdr(is_decoy_prefix, is_decoy_suffix): + def fdr(psms=None, formula=1, is_decoy=None, ratio=1, correction=0, pep=None, decoy_prefix='DECOY_', decoy_suffix=None): + """Estimate FDR of a data set using TDA or given PEP values. + Two formulas can be used. The first one (default) is: + + .. math:: + + FDR = \\frac{N_{decoy}}{N_{target} * ratio} + + The second formula is: + + .. math:: + + FDR = \\frac{N_{decoy} * (1 + \\frac{1}{ratio})}{N_{total}} + + .. note:: + This function is less versatile than :py:func:`qvalues`. To obtain FDR, + you can call :py:func:`qvalues` and take the last q-value. This function + can be used (with `correction = 0` or `1`) when :py:mod:`numpy` is not available. + + Parameters + ---------- + psms : iterable, optional + An iterable of PSMs, e.g. as returned by :py:func:`read`. + Not needed if `is_decoy` is an iterable. 
+ + formula : int, optional + Can be either 1 or 2, defines which formula should be used for FDR + estimation. Default is 1. + + is_decoy : callable, iterable, or str + If callable, should accept exactly one argument (PSM) and return a truthy value + if the PSM is considered decoy. Default is :py:func:`is_decoy`. + If array-like, should contain float values for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`pandas.DataFrame`). + + .. warning:: + The default function may not work + with your files, because format flavours are diverse. + + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. + + pep : callable, iterable, or str, optional + If callable, a function used to determine the posterior error probability (PEP). + Should accept exactly one argument (PSM) and return a float. + If array-like, should contain float values for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`pandas.DataFrame`). + + .. note:: If this parameter is given, then PEP values will be used to calculate FDR. + Otherwise, decoy PSMs will be used instead. This option conflicts with: + `is_decoy`, `formula`, `ratio`, `correction`. + + ratio : float, optional + The size ratio between the decoy and target databases. Default is 1. + In theory, the "size" of the database is the number of + theoretical peptides eligible for assignment to spectra that are + produced by *in silico* cleavage of that database. + + correction : int or float, optional + Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. + + 0 (default): no correction; + + 1: enable "+1" correction. This accounts for the probability that a false + positive scores better than the first excluded decoy PSM; + + 2: this also corrects that probability for finite size of the sample, + so the correction will be slightly less than "+1". + + If a floating point number + is given, then instead of the expectation value for the number of false PSMs, + the confidence value is used. The value of `correction` is then interpreted as + desired confidence level. E.g., if correction=0.95, then the calculated q-values + do not exceed the "real" q-values with 95% probability. + + See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. + + .. note:: + Requires :py:mod:`numpy`, if `correction` is a float or 2. + + .. note:: + Correction is only needed if the PSM set at hand was obtained using TDA + filtering based on decoy counting (as done by using :py:func:`!filter` without + `correction`). + + Returns + ------- + out : float + The estimation of FDR, (roughly) between 0 and 1. 
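+
+        Examples
+        --------
+        A minimal sketch (not from the original documentation): decoy status
+        can be passed directly as an iterable of flags, here for a toy set of
+        four PSMs with one decoy.
+
+        >>> fdr(is_decoy=[0, 0, 0, 1])
+        0.3333333333333333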
+ """ + if formula not in {1, 2}: + raise PyteomicsError('`formula` must be either 1 or 2.') + if ratio == 0: + raise PyteomicsError('Size ratio cannot be zero!') + + decoy, total = _count_psms(psms, is_decoy, pep, decoy_prefix, decoy_suffix, is_decoy_prefix, is_decoy_suffix) + if pep is not None: + return float(decoy) / total + tfalse = decoy + if correction == 1 or (correction == 2 and total / decoy > 10): + tfalse += 1 + elif correction == 2: + p = 1. / (1. + ratio) + tfalse = _expectation(decoy, total - decoy, p) + elif 0 < correction < 1: + p = 1. / (1. + ratio) + tfalse = _confidence_value(correction, decoy, total - decoy, p) + if formula == 1: + if total == decoy: + raise PyteomicsError('Cannot compute FDR using formula 1: no target IDs found.') + return float(tfalse) / (total - decoy) / ratio + return (decoy + tfalse / ratio) / total + + _fix_docstring(fdr, is_decoy=is_decoy_prefix) + if is_decoy_prefix is None: + fdr.__doc__ = fdr.__doc__.replace( + """\n .. warning:: + The default function may not work + with your files, because format flavours are diverse. + + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "") + return fdr + + +fdr = _make_fdr(None, None) + +def _sigma_T(decoy, ratio): + return math.sqrt((decoy + 1) * (ratio + 1) / (ratio * ratio)) + +def sigma_T(psms, is_decoy, ratio=1): + """Calculates the standard error for the number of false positive target PSMs. + + The formula is:: + + .. math :: + + \\sigma(T) = \\sqrt{\\frac{(d + 1) \\cdot {p}}{(1 - p)^{2}}} = \\sqrt{\\frac{d+1}{r^{2}} \\cdot (r+1)} + + This estimation is accurate for low FDRs. + See the `article <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for more details. + """ + decoy, total = _count_psms(psms, is_decoy, None, None, None, None, None) + return _sigma_T(decoy, ratio) + +def sigma_fdr(psms=None, formula=1, is_decoy=None, ratio=1): + """Calculates the standard error of FDR using the formula for negative binomial distribution. + See :py:func:`sigma_T` for math. This estimation is accurate for low FDRs. + See also the `article <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for more details. 
+ """ + + if formula not in {1, 2}: + raise PyteomicsError('`formula` must be either 1 or 2.') + decoy, total = _count_psms(psms, is_decoy, None, None, None, None, None) + sigmaT = _sigma_T(decoy, ratio) + if formula == 1: + return sigmaT / (total - decoy) / ratio + return sigmaT / total / ratio diff --git a/pyteomics/auxiliary/utils.py b/pyteomics/auxiliary/utils.py new file mode 100644 index 0000000..728b385 --- /dev/null +++ b/pyteomics/auxiliary/utils.py @@ -0,0 +1,317 @@ +from __future__ import print_function + +import base64 +import zlib +from functools import wraps +from collections import namedtuple + + +try: + basestring +except NameError: + basestring = (str, bytes) + +try: + import numpy as np +except ImportError: + np = None + +try: + import pynumpress +except ImportError: + pynumpress = None + +from .structures import PyteomicsError + +def print_tree(d, indent_str=' -> ', indent_count=1): + """Read a nested dict (with strings as keys) and print its structure. + """ + def structure(d): + out = {} + for k, v in d.items(): + if isinstance(v, dict): + out[k] = structure(v) + elif isinstance(v, list) and v and isinstance(v[0], dict): + out['{} [list]'.format(k)] = structure(v[0]) + else: + out[k] = None + return out + + def _print(d, level=0): + for k, v in d.items(): + print('{}{}'.format(indent_str * indent_count * level, k)) + if v is not None: + _print(v, level + 1) + _print(structure(d)) + + +def memoize(maxsize=1000): + """Make a memoization decorator. A negative value of `maxsize` means + no size limit.""" + def deco(f): + """Memoization decorator. Items of `kwargs` must be hashable.""" + memo = {} + + @wraps(f) + def func(*args, **kwargs): + key = (args, frozenset(kwargs.items())) + if key not in memo: + if len(memo) == maxsize: + memo.popitem() + memo[key] = f(*args, **kwargs) + return memo[key] + return func + return deco + + +def _decode_base64_data_array(source, dtype, is_compressed): + """Read a base64-encoded binary array. + + Parameters + ---------- + source : str + A binary array encoded with base64. + dtype : dtype + The type of the array in numpy dtype notation. + is_compressed : bool + If True then the array will be decompressed with zlib. 
+ + Returns + ------- + out : numpy.array + """ + + decoded_source = base64.b64decode(source.encode('ascii')) + if is_compressed: + decoded_source = zlib.decompress(decoded_source) + output = np.frombuffer(bytearray(decoded_source), dtype=dtype) + return output + + +_default_compression_map = { + 'no compression': lambda x: x, + 'zlib compression': zlib.decompress, +} + + +def _pynumpressDecompress(decoder): + def decode(data): + return decoder(np.frombuffer(data, dtype=np.uint8)) + return decode + + +def _zlibNumpress(decoder): + def decode(data): + return decoder(np.frombuffer(zlib.decompress(data), dtype=np.uint8)) + return decode + + +if pynumpress: + _default_compression_map.update( + { + 'MS-Numpress short logged float compression': _pynumpressDecompress(pynumpress.decode_slof), + 'MS-Numpress positive integer compression': _pynumpressDecompress(pynumpress.decode_pic), + 'MS-Numpress linear prediction compression': _pynumpressDecompress(pynumpress.decode_linear), + 'MS-Numpress short logged float compression followed by zlib compression': _zlibNumpress(pynumpress.decode_slof), + 'MS-Numpress positive integer compression followed by zlib compression': _zlibNumpress(pynumpress.decode_pic), + 'MS-Numpress linear prediction compression followed by zlib compression': _zlibNumpress(pynumpress.decode_linear), + }) + + +class ArrayConversionMixin(object): + _dtype_dict = {} + _array_keys = ['m/z array', 'intensity array'] + + def __init__(self, *args, **kwargs): + self._dtype_dict = {None: None} + dtype = kwargs.pop('dtype', None) + if isinstance(dtype, dict): + self._dtype_dict.update(dtype) + elif dtype: + self._dtype_dict = {k: dtype for k in self._array_keys} + self._dtype_dict[None] = dtype + self._convert_arrays = kwargs.pop('convert_arrays', 1) + if self._convert_arrays and np is None: + raise PyteomicsError('numpy is required for array conversion') + super(ArrayConversionMixin, self).__init__(*args, **kwargs) + + def __getstate__(self): + state = super(ArrayConversionMixin, self).__getstate__() + state['_dtype_dict'] = self._dtype_dict + state['_convert_arrays'] = self._convert_arrays + state['_array_keys'] = self._array_keys + return state + + def __setstate__(self, state): + super(ArrayConversionMixin, self).__setstate__(state) + self._dtype_dict = state['_dtype_dict'] + self._convert_arrays = state['_convert_arrays'] + self._array_keys = state['_array_keys'] + + def _build_array(self, k, data): + dtype = self._dtype_dict.get(k) + return np.array(data, dtype=dtype) + + def _convert_array(self, k, array): + dtype = self._dtype_dict.get(k) + if dtype is not None: + return array.astype(dtype) + return array + + def _build_all_arrays(self, info): + if self._convert_arrays: + for k in self._array_keys: + if k in info: + info[k] = self._build_array(k, info[k]) + + +class MaskedArrayConversionMixin(ArrayConversionMixin): + _masked_array_keys = ['charge array'] + _mask_value = 0 + + def __init__(self, *args, **kwargs): + self._convert_arrays = kwargs.pop('convert_arrays', 2) + kwargs['convert_arrays'] = self._convert_arrays + super(MaskedArrayConversionMixin, self).__init__(*args, **kwargs) + + def __getstate__(self): + state = super(MaskedArrayConversionMixin, self).__getstate__() + state['_masked_array_keys'] = self._masked_array_keys + state['_mask_value'] = self._mask_value + return state + + def __setstate__(self, state): + super(MaskedArrayConversionMixin, self).__setstate__(state) + self._masked_array_keys = state['_masked_array_keys'] + self._mask_value = state['_mask_value'] + + 
def _build_masked_array(self, k, data):
+        array = self._build_array(k, data)
+        return self._convert_masked_array(k, array)
+
+    def _convert_masked_array(self, k, array):
+        return np.ma.masked_equal(array, self._mask_value)
+
+    def _ensure_masked_array(self, k, data):
+        if isinstance(data, np.ndarray):
+            return self._convert_masked_array(k, data)
+        return self._build_masked_array(k, data)
+
+    def _build_all_arrays(self, info):
+        super(MaskedArrayConversionMixin, self)._build_all_arrays(info)
+        if self._convert_arrays == 2:
+            for k in self._masked_array_keys:
+                if k in info:
+                    info[k] = self._ensure_masked_array(k, info[k])
+
+
+if np is not None:
+    class BinaryDataArrayTransformer(object):
+        """A base class that provides methods for reading
+        base64-encoded binary arrays.
+
+        Attributes
+        ----------
+        compression_type_map : dict
+            Maps compressor type name to decompression function
+        """
+
+        compression_type_map = _default_compression_map
+
+        class binary_array_record(namedtuple(
+                "binary_array_record", ("data", "compression", "dtype", "source", "key"))):
+            """Holds all of the information about a base64-encoded array needed to
+            decode the array.
+            """
+
+            def decode(self):
+                """Decode :attr:`data` into a numerical array.
+
+                Returns
+                -------
+                np.ndarray
+                """
+                return self.source._decode_record(self)
+
+        def _make_record(self, data, compression, dtype, key=None):
+            return self.binary_array_record(data, compression, dtype, self, key)
+
+        def _decode_record(self, record):
+            array = self.decode_data_array(
+                record.data, record.compression, record.dtype)
+            return self._finalize_record_conversion(array, record)
+
+        def _finalize_record_conversion(self, array, record):
+            return array
+
+        def _base64_decode(self, source):
+            decoded_source = base64.b64decode(source.encode('ascii'))
+            return decoded_source
+
+        def _decompress(self, source, compression_type=None):
+            if compression_type is None:
+                return source
+            decompressor = self.compression_type_map.get(compression_type)
+            decompressed_source = decompressor(source)
+            return decompressed_source
+
+        def _transform_buffer(self, binary, dtype):
+            if isinstance(binary, np.ndarray):
+                return binary.astype(dtype, copy=False)
+            return np.frombuffer(binary, dtype=dtype)
+
+        def decode_data_array(self, source, compression_type=None, dtype=np.float64):
+            """Decode a base64-encoded, compressed bytestring into a numerical
+            array.
+
+            Parameters
+            ----------
+            source : bytes
+                A base64 string encoding a potentially compressed numerical
+                array.
+            compression_type : str, optional
+                The name of the compression method used before encoding the
+                array into base64.
+            dtype : type, optional
+                The data type to use to decode the binary array from the
+                decompressed bytes.
+ + Returns + ------- + np.ndarray + """ + binary = self._base64_decode(source) + binary = self._decompress(binary, compression_type) + if isinstance(binary, bytes): + binary = bytearray(binary) + array = self._transform_buffer(binary, dtype) + return array + + + class BinaryArrayConversionMixin(ArrayConversionMixin, BinaryDataArrayTransformer): + def _finalize_record_conversion(self, array, record): + key = record.key + return self._convert_array(key, array) + + +else: + BinaryDataArrayTransformer = None + BinaryArrayConversionMixin = None + + +def add_metaclass(metaclass): + """Class decorator for creating a class with a metaclass.""" + def wrapper(cls): + orig_vars = cls.__dict__.copy() + slots = orig_vars.get('__slots__') + if slots is not None: + if isinstance(slots, str): + slots = [slots] + for slots_var in slots: + orig_vars.pop(slots_var) + orig_vars.pop('__dict__', None) + orig_vars.pop('__weakref__', None) + if hasattr(cls, '__qualname__'): + orig_vars['__qualname__'] = cls.__qualname__ + return metaclass(cls.__name__, cls.__bases__, orig_vars) + return wrapper diff --git a/pyteomics/electrochem.py b/pyteomics/electrochem.py new file mode 100644 index 0000000..4fa2937 --- /dev/null +++ b/pyteomics/electrochem.py @@ -0,0 +1,499 @@ +""" +electrochem - electrochemical properties of polypeptides +======================================================== + +Summary +------- + +This module is used to calculate the +electrochemical properties of polypeptide molecules. + +The theory behind most of this module is based on the Henderson-Hasselbalch +equation and was thoroughly described in a number of sources [#Aronson]_, +[#Moore]_. + +Briefly, the formula for the charge of a polypeptide in given pH is the following: + +.. math:: + + Q_{peptide} = \sum{\\frac{Q_i}{1+10^{Q_i(pH-pK_i)}}}, + +where the sum is taken over all ionizable groups of the polypeptide, and +:math:`Q_i` is -1 and +1 for acidic and basic functional groups, +respectively. + +Charge and pI functions +----------------------- + + :py:func:`charge` - calculate the charge of a polypeptide + + :py:func:`pI` - calculate the isoelectric point of a polypeptide + + +GRand AVerage of hYdropathicity (GRAVY) +--------------------------------------- + + :py:func:`gravy` - calculate the GRAVY index of a polypeptide + + +Data +---- + + :py:data:`pK_lehninger` - a set of pK from [#Lehninger]_. + + :py:data:`pK_sillero` - a set of pK from [#Sillero]_. + + :py:data:`pK_dawson` - a set of pK from [#Dawson]_, the pK values for NH2- + and -OH are taken from [#Sillero]_. + + :py:data:`pK_rodwell` - a set of pK from [#Rodwell]_. + + :py:data:`pK_bjellqvist` - a set of pK from [#Bjellqvist]_. + + :py:data:`pK_nterm_bjellqvist` - a set of N-terminal pK from [#Bjellqvist]_. + + :py:data:`pK_cterm_bjellqvist` - a set of C-terminal pK from [#Bjellqvist]_. + + :py:data:`hydropathicity_KD` - a set of hydropathicity indexes from [#Kyte]_. + + +References +---------- + +.. [#Aronson] Aronson, J. N. The Henderson-Hasselbalch equation + revisited. Biochemical Education, 1983, 11 (2), 68. + `Link. <http://dx.doi.org/10.1016/0307-4412(83)90046-8>`_ + +.. [#Moore] Moore, D. S.. Amino acid and peptide net charges: A + simple calculational procedure. Biochemical Education, 1986, 13 (1), 10-12. + `Link. <http://dx.doi.org/10.1016/0307-4412(85)90114-1>`_ + +.. [#Lehninger] Nelson, D. L.; Cox, M. M. Lehninger Principles of + Biochemistry, Fourth Edition; W. H. Freeman, 2004; p. 1100. + +.. [#Sillero] Sillero, A.; Ribeiro, J. 
Isoelectric points of proteins:
+   Theoretical determination. Analytical Biochemistry, 1989, 179 (2), 319-325.
+   `Link. <http://dx.doi.org/10.1016/0003-2697(89)90136-X>`_
+
+.. [#Dawson] Dawson, R. M. C.; Elliot, D. C.; Elliot, W. H.; Jones, K. M.
+   Data for biochemical research. Oxford University Press, 1989; p. 592.
+
+.. [#Rodwell] Rodwell, J. Heterogeneity of component bands in isoelectric
+   focusing patterns. Analytical Biochemistry, 1982, 119 (2), 440-449.
+   `Link. <http://dx.doi.org/10.1016/0003-2697(82)90611-X>`_
+
+.. [#Bjellqvist] Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+   Reference points for comparisons of two-dimensional maps of proteins from
+   different human cell types defined in a pH scale where isoelectric points
+   correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+   `Link. <http://dx.doi.org/10.1002/elps.1150150171>`_
+
+.. [#Kyte] Kyte, J.; Doolittle, R. F.
+   A simple method for displaying the hydropathic character of a protein.
+   Journal of Molecular Biology 1982, 157 (1), 105-32.
+   `Link. <https://doi.org/10.1016/0022-2836(82)90515-0>`_
+
+-------------------------------------------------------------------------------
+
+"""
+
+# Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+from . import parser
+from .auxiliary import PyteomicsError
+from collections import Counter
+try:
+    from collections.abc import Iterable
+except ImportError:
+    from collections import Iterable
+
+
+def charge(sequence, pH, **kwargs):
+    """Calculate the charge of a polypeptide at a given pH, or at each of a
+    list of pH values, using a given set of amino acid electrochemical properties.
+
+    .. warning::
+
+        Be careful when supplying a list with a parsed sequence or a dict with
+        amino acid composition as `sequence`. Such values must be obtained
+        with the `show_unmodified_termini` option enabled.
+
+    .. warning::
+
+        If you provide `pK_nterm` or `pK_cterm` and provide `sequence` as a dict,
+        it is assumed that it was obtained with ``term_aa=True`` (see
+        :py:func:`pyteomics.parser.amino_acid_composition` for details).
+
+    Parameters
+    ----------
+    sequence : str or list or dict
+        A string with a polypeptide sequence, a list with a parsed
+        sequence or a dict of amino acid composition.
+    pH : float or iterable of floats
+        pH or iterable of pHs for which the charge is calculated.
+    pK : dict {str: [(float, int), ...]}, optional
+        A set of pK of amino acids' ionizable groups. It is a dict, where keys
+        are amino acid labels and the values are lists of tuples (pK,
+        charge_in_ionized_state), a tuple per ionizable group. The default
+        value is `pK_lehninger`.
+
+    pK_nterm : dict {str: [(float, int),]}, optional
+    pK_cterm : dict {str: [(float, int),]}, optional
+        Sets of pK of N-terminal and C-terminal (respectively) amino acids'
+        ionizable groups. Dicts with the same structure as ``pK``. These
+        values (if present) are used for N-terminal and C-terminal residues,
+        respectively.
If given, `sequence` must be a :py:class:`str` or a
+        :py:class:`list`. The default value is an empty dict.
+
+    Returns
+    -------
+    out : float or list of floats
+        A single value of charge or a list of charges.
+    """
+
+    peptide_dict, pK = _prepare_charge_dict(sequence, **kwargs)
+
+    # Process the case when pH is a single float.
+    pH_list = pH if isinstance(pH, Iterable) else [pH, ]
+
+    charge_list = _charge_for_dict(peptide_dict, pH_list, pK)
+    return charge_list[0] if not isinstance(pH, Iterable) else charge_list
+
+
+def _prepare_charge_dict(sequence, **kwargs):
+    nterm = cterm = n_aa = c_aa = None
+    pK = kwargs.get('pK', pK_lehninger).copy()
+    pK_nterm = kwargs.get('pK_nterm', {})
+    pK_cterm = kwargs.get('pK_cterm', {})
+
+    if isinstance(sequence, dict):
+        peptide_dict = sequence.copy()
+        for k, v in sequence.items():
+            if k[-1] == '-':
+                if v > 1 or nterm:
+                    raise PyteomicsError(
+                        'More than one N-terminal group in {}'.format(
+                            sequence))
+                nterm = k
+            if k[0] == '-':
+                if v > 1 or cterm:
+                    raise PyteomicsError(
+                        'More than one C-terminal group in {}'.format(
+                            sequence))
+                cterm = k
+            if k[:5] == 'nterm':
+                if v > 1 or n_aa:
+                    raise PyteomicsError(
+                        'More than one N-terminal residue in {}'.format(
+                            sequence))
+                n_aa = k[5:]
+                peptide_dict[n_aa] = peptide_dict.get(n_aa, 0) + 1
+            if k[:5] == 'cterm':
+                if v > 1 or c_aa:
+                    raise PyteomicsError(
+                        'More than one C-terminal residue in {}'.format(
+                            sequence))
+                c_aa = k[5:]
+                peptide_dict[c_aa] = peptide_dict.get(c_aa, 0) + 1
+
+        if nterm is None or cterm is None:
+            raise PyteomicsError('Peptide must have two explicit terminal groups')
+        if (n_aa is None or c_aa is None) and (pK_nterm or pK_cterm):
+            raise PyteomicsError('Two terminal residues must be present in '
+                    'peptide (designated as "ntermX" and "ctermX", where "X" is '
+                    'the one-letter residue label). Use '
+                    '``term_aa=True`` when calling '
+                    '`parser.amino_acid_composition`.')
+
+    elif isinstance(sequence, (str, list)):
+        if isinstance(sequence, str):
+            if sequence.isupper() and sequence.isalpha():
+                parsed_sequence = [parser.std_nterm] + list(sequence) + [parser.std_cterm]
+            else:
+                parsed_sequence = parser.parse(sequence, show_unmodified_termini=True)
+        elif isinstance(sequence, list):
+            if sequence[0][-1] != '-' or sequence[-1][0] != '-':
+                raise PyteomicsError('Parsed sequences must contain terminal '
+                                     'groups at 0-th and last positions.')
+            parsed_sequence = sequence
+
+        n_aa = parsed_sequence[1]
+        c_aa = parsed_sequence[-2]
+        nterm = parsed_sequence[0]
+        cterm = parsed_sequence[-1]
+        peptide_dict = Counter(parsed_sequence)
+
+    else:
+        raise PyteomicsError('Unsupported type of sequence: %s' % type(sequence))
+
+    if nterm in pK_nterm:
+        if n_aa in pK_nterm[nterm]:
+            pK[nterm] = pK_nterm[nterm][n_aa]
+    if cterm in pK_cterm:
+        if c_aa in pK_cterm[cterm]:
+            pK[cterm] = pK_cterm[cterm][c_aa]
+
+    return peptide_dict, pK
+
+
+def _charge_for_dict(peptide_dict, pH_list, pK):
+    # Calculate the charge for each value of pH.
+    charge_list = []
+    for pH_value in pH_list:
+        charge = 0
+        for aa in peptide_dict:
+            for ionizable_group in pK.get(aa, []):
+                charge += peptide_dict[aa] * ionizable_group[1] * (
+                    1. / (1. + 10 ** (ionizable_group[1] * (pH_value - ionizable_group[0]))))
+        charge_list.append(charge)
+
+    return charge_list
+
+
+def pI(sequence, pI_range=(0.0, 14.0), precision_pI=0.01, **kwargs):
+    """Calculate the isoelectric point of a polypeptide using a given set
+    of amino acids' electrochemical properties.
+
+    .. warning::
+
+        Be careful when supplying a list with a parsed sequence or a dict with
+        amino acid composition as `sequence`. Such values must be obtained
+        with the `show_unmodified_termini` option enabled.
+
+    Parameters
+    ----------
+    sequence : str or list or dict
+        A string with a polypeptide sequence, a list with a parsed
+        sequence or a dict of amino acid composition.
+    pI_range : tuple (float, float)
+        The range of allowable pI values. Default is (0.0, 14.0).
+    precision_pI : float
+        The precision of the calculated pI. Default is 0.01.
+    pK : dict {str: [(float, int), ...]}, optional
+        A set of pK of amino acids' ionizable groups. It is a dict, where keys
+        are amino acid labels and the values are lists of tuples (pK,
+        charge_in_ionized_state), a tuple per ionizable group. The default
+        value is `pK_lehninger`.
+    pK_nterm : dict {str: [(float, int),]}, optional
+    pK_cterm : dict {str: [(float, int),]}, optional
+        Sets of pK of N-terminal and C-terminal (respectively) amino acids'
+        ionizable groups. Dicts with the same structure as ``pK``. These
+        values (if present) are used for N-terminal and C-terminal residues,
+        respectively. If given, `sequence` must be a :py:class:`str` or a
+        :py:class:`list`. The default value is an empty dict.
+
+    Returns
+    -------
+    out : float
+    """
+
+    pK = kwargs.get('pK', pK_lehninger.copy())
+    pK_nterm = {}
+    pK_cterm = {}
+    if isinstance(sequence, str) or isinstance(sequence, list):
+        pK_nterm = kwargs.get('pK_nterm', {})
+        pK_cterm = kwargs.get('pK_cterm', {})
+    elif isinstance(sequence, dict) and (('pK_nterm' in kwargs) or ('pK_cterm' in kwargs)):
+        raise PyteomicsError('Cannot use terminal features for %s' % type(sequence))
+
+    peptide_dict, pK = _prepare_charge_dict(sequence, pK=pK, pK_cterm=pK_cterm, pK_nterm=pK_nterm)
+    # The algorithm is based on the fact that charge(pH) is a monotonic function.
+    left_x, right_x = pI_range
+    left_y = _charge_for_dict(peptide_dict, [left_x], pK)[0]
+    right_y = _charge_for_dict(peptide_dict, [right_x], pK)[0]
+    while (right_x - left_x) > precision_pI:
+        if left_y * right_y > 0:
+            return left_x if abs(left_y) < abs(right_y) else right_x
+        middle_x = (left_x + right_x) / 2.0
+        middle_y = _charge_for_dict(peptide_dict, [middle_x], pK)[0]
+        if middle_y * left_y < 0:
+            right_x = middle_x
+            right_y = middle_y
+        else:
+            left_x = middle_x
+            left_y = middle_y
+    return (left_x + right_x) / 2.0
+
+
+pK_lehninger = {
+    'E': [(4.25, -1)],
+    'R': [(12.48, 1)],
+    'Y': [(10.07, -1)],
+    'D': [(3.65, -1)],
+    'H': [(6.00, +1)],
+    'K': [(10.53, +1)],
+    'C': [(8.18, -1)],
+    'H-': [(9.69, +1)],
+    '-OH': [(2.34, -1)],
+    }
+"""A set of pK from Nelson, D. L.; Cox, M. M. Lehninger Principles of
+Biochemistry, Fourth Edition; W. H. Freeman, 2004; p. 1100.
+"""
+
+pK_sillero = {
+    'E': [(4.5, -1)],
+    'R': [(12.0, +1)],
+    'Y': [(10.0, -1)],
+    'D': [(4.0, -1)],
+    'H': [(6.4, +1)],
+    'K': [(10.4, +1)],
+    'C': [(9.0, -1)],
+    'H-': [(8.2, +1)],
+    '-OH': [(3.2, -1)],
+    }
+"""A set of pK from Sillero, A.; Ribeiro, J. Isoelectric points of proteins:
+Theoretical determination. Analytical Biochemistry, vol. 179 (2), pp. 319-325,
+1989.
+"""
+
+pK_dawson = {
+    'E': [(4.3, -1)],
+    'R': [(12.0, +1)],
+    'Y': [(10.1, -1)],
+    'D': [(3.9, -1)],
+    'H': [(6.0, +1)],
+    'K': [(10.5, +1)],
+    'C': [(8.3, -1)],
+    'H-': [(8.2, +1)],
+    '-OH': [(3.2, -1)],
+    }
+"""A set of pK from Dawson, R. M. C.; Elliot, D. C.; Elliot, W. H.; Jones,
+K. M. Data for biochemical research. Oxford University Press, 1989; p. 592.
+pKs for NH2- and -OH are taken from `pK_sillero`.
+"""
+
+pK_rodwell = {
+    'E': [(4.25, -1)],
+    'R': [(11.5, +1)],
+    'Y': [(10.7, -1)],
+    'D': [(3.86, -1)],
+    'H': [(6.0, +1)],
+    'K': [(11.5, +1)],
+    'C': [(8.33, -1)],
+    'H-': [(8.0, +1)],
+    '-OH': [(3.1, -1)],
+}
+"""A set of pK from Rodwell, J. Heterogeneity of component bands in
+isoelectric focusing patterns. Analytical Biochemistry, vol. 119 (2),
+pp. 440-449, 1982.
+"""
+
+pK_bjellqvist = {
+    'E': [(4.45, -1)],
+    'R': [(12.0, +1)],
+    'Y': [(10.0, -1)],
+    'D': [(4.05, -1)],
+    'H': [(5.98, +1)],
+    'K': [(10.0, +1)],
+    'C': [(9.0, -1)],
+    'H-': [(7.5, +1)],
+    '-OH': [(3.55, -1)],
+}
+"""
+A set of pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points
+correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+"""
+
+pK_nterm_bjellqvist = {
+    'H-': {
+        'A': [(7.59, +1)],
+        'M': [(7.0, +1)],
+        'S': [(6.93, +1)],
+        'P': [(8.36, +1)],
+        'T': [(6.82, +1)],
+        'V': [(7.44, +1)],
+        'E': [(7.7, +1)]
+    }
+}
+"""
+A set of N-terminal pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points
+correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+"""
+
+pK_cterm_bjellqvist = {
+    '-OH': {
+        'D': [(4.55, -1)],
+        'E': [(4.75, -1)]
+    }
+}
+"""
+A set of C-terminal pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points
+correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+"""
+
+hydropathicity_KD = {
+    "A": 1.800,
+    "R": -4.500,
+    "N": -3.500,
+    "D": -3.500,
+    "C": 2.500,
+    "Q": -3.500,
+    "E": -3.500,
+    "G": -0.400,
+    "H": -3.200,
+    "I": 4.500,
+    "L": 3.800,
+    "K": -3.900,
+    "M": 1.900,
+    "F": 2.800,
+    "P": -1.600,
+    "S": -0.800,
+    "T": -0.700,
+    "W": -0.900,
+    "Y": -1.300,
+    "V": 4.200,
+}
+"""
+A set of hydropathicity indexes obtained from Kyte, J.; Doolittle, R. F.
+J. Mol. Biol. 157:105-132 (1982).
+"""
+
+
+def gravy(sequence, hydropathicity=hydropathicity_KD):
+    """
+    Calculate the GRand AVerage of hYdropathicity (GRAVY) index for an amino acid sequence.
+
+    Parameters
+    ----------
+    sequence : str
+        Polypeptide sequence in one-letter format.
+    hydropathicity : dict, optional
+        Hydropathicity indexes of amino acids. Default is :py:data:`hydropathicity_KD`.
+
+    Returns
+    -------
+    out : float
+        GRand AVerage of hYdropathicity (GRAVY) index.
+
+    Examples
+    --------
+    >>> round(gravy('PEPTIDE'), 4)
+    -1.4143
+    """
+    try:
+        return sum(hydropathicity[aa] for aa in sequence) / len(sequence)
+    except KeyError as e:
+        raise PyteomicsError("Hydropathicity for amino acid {} not provided.".format(e.args[0]))
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
diff --git a/pyteomics/fasta.py b/pyteomics/fasta.py
new file mode 100644
index 0000000..61d38c3
--- /dev/null
+++ b/pyteomics/fasta.py
@@ -0,0 +1,1072 @@
+"""
+fasta - manipulations with FASTA databases
+==========================================
+
+FASTA is a simple file format for protein sequence databases. Please refer to
+`the NCBI website <http://www.ncbi.nlm.nih.gov/blast/fasta.shtml>`_
+for the most detailed information on the format.
+
+Data manipulation
+-----------------
+
+Classes
+.......
+
+Several classes of FASTA parsers are available. All of them have common features:
+
+ - context manager support;
+
+ - header parsing;
+
+ - direct iteration.
+
+Available classes:
+
+  :py:class:`FASTABase` - common ancestor, suitable for type checking.
+  Abstract class.
+
+  :py:class:`FASTA` - text-mode, sequential parser.
+  Good for iteration over database entries.
+
+  :py:class:`IndexedFASTA` - binary-mode, indexing parser.
+  Supports direct indexing by header string.
+
+  :py:class:`TwoLayerIndexedFASTA` - additionally supports
+  indexing by extracted header fields.
+
+  :py:class:`UniProt` and :py:class:`IndexedUniProt`,
+  :py:class:`UniParc` and :py:class:`IndexedUniParc`,
+  :py:class:`UniMes` and :py:class:`IndexedUniMes`,
+  :py:class:`UniRef` and :py:class:`IndexedUniRef`,
+  :py:class:`SPD` and :py:class:`IndexedSPD`,
+  :py:class:`NCBI` and :py:class:`IndexedNCBI`,
+  :py:class:`RefSeq` and :py:class:`IndexedRefSeq` - format-specific parsers.
+
+Functions
+.........
+
+  :py:func:`read` - returns an instance of the appropriate reader class,
+  for sequential iteration or random access.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`write` - write entries to a FASTA database.
+
+  :py:func:`parse` - parse a FASTA header.
+
+Decoy sequence generation
+-------------------------
+
+:py:func:`decoy_sequence` - generate a decoy sequence from a given sequence, using
+one of the other functions listed in this section or any other callable.
+
+:py:func:`reverse` - generate a reversed decoy sequence.
+
+:py:func:`shuffle` - generate a shuffled decoy sequence.
+
+:py:func:`fused_decoy` - generate a "fused" decoy sequence.
+
+
+Decoy database generation
+-------------------------
+
+  :py:func:`write_decoy_db` - generate a decoy database and write it to a file.
+
+  :py:func:`decoy_db` - generate entries for a decoy database from a given FASTA
+  database.
+
+  :py:func:`decoy_entries` - generate decoy entries for an iterator.
+
+  :py:func:`decoy_chain` - a version of :py:func:`decoy_db` for multiple files.
+
+  :py:func:`decoy_chain.from_iterable` - like :py:func:`decoy_chain`, but with
+  an iterable of files.
+
+Auxiliary
+---------
+
+  :py:data:`std_parsers` - a dictionary with parsers for known FASTA header
+  formats.
+
+-------------------------------------------------------------------------------
+"""
+
+# Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import random
+from collections import namedtuple
+import re
+import abc
+from . import auxiliary as aux
+from .auxiliary.utils import add_metaclass
+
+
+Protein = namedtuple('Protein', ('description', 'sequence'))
+DECOY_PREFIX = 'DECOY_'
+RAW_HEADER_KEY = '__raw__'
+
+
+def _add_raw_field(parser):
+    """
+    Add the :py:const:`RAW_HEADER_KEY` field to the parsed dictionary.
+
+    Parameters
+    ----------
+    parser : func
+        parser function.
+
+    Returns
+    -------
+    func
+        The wrapped parser function, which stores the raw header string
+        under :py:const:`RAW_HEADER_KEY` in the parsed dictionary.
+
+    """
+    def _new_parser(instance, descr):
+        parsed = parser(instance, descr)
+        if RAW_HEADER_KEY not in parsed:
+            parsed[RAW_HEADER_KEY] = descr
+        elif parsed[RAW_HEADER_KEY] != descr:
+            raise aux.PyteomicsError('Cannot save raw protein header, since the corresponding '
+                                     'key ({}) already exists.'.format(RAW_HEADER_KEY))
+        return parsed
+
+    return _new_parser
+
+
+class FASTABase(object):
+    """Abstract base class for FASTA file parsers.
+    Can be used for type checking.
+    """
+    parser = None
+    _ignore_comments = False
+    _comments = set('>;')
+
+    def __init__(self, source, **kwargs):
+        self._ignore_comments = kwargs.pop('ignore_comments', False)
+        parser = kwargs.pop('parser', None)
+        if parser is not None:
+            self.parser = parser
+        super(FASTABase, self).__init__(source, **kwargs)
+
+    def _is_comment(self, line):
+        return line[0] in self._comments
+
+    def get_entry(self, key):
+        raise NotImplementedError
+
+
+class FASTA(FASTABase, aux.FileReader):
+    """Text-mode, sequential FASTA parser.
+    Suitable for iteration over the file to obtain all entries in order.
+    """
+    def __init__(self, source, ignore_comments=False, parser=None, encoding=None):
+        """Create a new FASTA parser object. Supports iteration,
+        yields `(description, sequence)` tuples. Supports `with` syntax.
+
+        Parameters
+        ----------
+
+        source : str or file-like
+            File to read. If file object, it must be opened in *text* mode.
+        ignore_comments : bool, optional
+            If :py:const:`True` then ignore the second and subsequent lines of description.
+            Default is :py:const:`False`, which concatenates multi-line descriptions into
+            a single string.
+        parser : function or None, optional
+            Defines whether the FASTA descriptions should be parsed. If it is a
+            function, that function will be given the description string, and
+            the returned value will be yielded together with the sequence.
+            The :py:data:`std_parsers` dict has parsers for several formats.
+            Hint: specify :py:func:`parse` as the parser to apply automatic
+            format recognition.
+            Default is :py:const:`None`, which means return the header "as is".
+        encoding : str or None, optional
+            File encoding (if it is given by name).
+        """
+        super(FASTA, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={},
+                                    encoding=encoding, ignore_comments=ignore_comments, parser=parser)
+
+    def _read(self):
+        accumulated_strings = []
+
+        # Chain one extra '>' after the source so that the last entry is
+        # flushed when the file ends.
+        for string in itertools.chain(self._source, '>'):
+            stripped_string = string.strip()
+
+            # Skip empty lines.
+            if not stripped_string:
+                continue
+
+            is_comment = self._is_comment(stripped_string)
+            if is_comment:
+                # If it is a continuing comment
+                if len(accumulated_strings) == 1:
+                    if not self._ignore_comments:
+                        accumulated_strings[0] += (' ' + stripped_string[1:])
+                    else:
+                        continue
+
+                elif accumulated_strings:
+                    description = accumulated_strings[0]
+                    sequence = ''.join(accumulated_strings[1:])
+
+                    # Drop the translation stop sign.
+                    if sequence and sequence[-1] == '*':
+                        sequence = sequence[:-1]
+                    if self.parser is not None:
+                        description = self.parser(description)
+                    yield Protein(description, sequence)
+                    accumulated_strings = [stripped_string[1:]]
+                else:
+                    # accumulated_strings is empty; we're probably reading
+                    # the very first line of the file
+                    accumulated_strings.append(stripped_string[1:])
+            else:
+                accumulated_strings.append(stripped_string)
+
+    def get_entry(self, key):
+        raise aux.PyteomicsError('Direct indexing is not supported. 
' + 'Use IndexedFASTA and its subclasses') + + +def _reconstruct(cls, args, kwargs): + kwargs['_skip_index'] = True + return cls(*args, **kwargs) + + +class IndexedFASTA(FASTABase, aux.TaskMappingMixin, aux.IndexedTextReader): + """Indexed FASTA parser. Supports direct indexing by matched labels.""" + delimiter = '\n>' + label = r'^[\n]?>(.*)\s*' + + def __init__(self, source, ignore_comments=False, parser=None, **kwargs): + """Create an indexed FASTA parser object. + + Parameters + ---------- + source : str or file-like + File to read. If file object, it must be opened in *binary* mode. + ignore_comments : bool, optional + If :py:const:`True` then ignore the second and subsequent lines of description. + Default is :py:const:`False`, which concatenates multi-line descriptions into + a single string. + parser : function or None, optional + Defines whether the FASTA descriptions should be parsed. If it is a + function, that function will be given the description string, and + the returned value will be yielded together with the sequence. + The :py:data:`std_parsers` dict has parsers for several formats. + Hint: specify :py:func:`parse` as the parser to apply automatic + format recognition. + Default is :py:const:`None`, which means return the header "as is". + encoding : str or None, optional, keyword only + File encoding. Default is UTF-8. + block_size : int or None, optional, keyword only + Number of bytes to consume at once. + delimiter : str or None, optional, keyword only + Overrides the FASTA record delimiter (default is ``'\\n>'``). + label : str or None, optional, keyword only + Overrides the FASTA record label pattern. Default is ``'^[\\n]?>(.*)'``. + label_group : int or str, optional, keyword only + Overrides the matched group used as key in the byte offset index. + This in combination with `label` can be used to extract fields from headers. + However, consider using :py:class:`TwoLayerIndexedFASTA` for this purpose. + """ + super(IndexedFASTA, self).__init__(source, ignore_comments=ignore_comments, parser=parser, + parser_func=self._read, pass_file=False, args=(), kwargs={}, **kwargs) + self._init_args = (source, ignore_comments, parser) + self._init_kwargs = kwargs + + def __reduce_ex__(self, protocol): + return (_reconstruct, + (self.__class__, self._init_args, self._init_kwargs), + self.__getstate__()) + + def _read_protein_lines(self, lines): + description = [] + sequence = [] + + for string in lines: + stripped_string = string.strip() + if not stripped_string: + continue + + is_comment = self._is_comment(stripped_string) + if is_comment: + if not description or not self._ignore_comments: + description.append(stripped_string[1:]) + else: + sequence.append(stripped_string) + + description = ' '.join(description) + sequence = ''.join(sequence) + # Drop the translation stop sign. + if sequence and sequence[-1] == '*': + sequence = sequence[:-1] + if self.parser is not None: + description = self.parser(description) + return Protein(description, sequence) + + def _item_from_offsets(self, offsets): + start, end = offsets + lines = self._read_lines_from_offsets(start, end) + return self._read_protein_lines(lines) + + def _read(self, **kwargs): + for key, offsets in self._offset_index.items(): + yield self._item_from_offsets(offsets) + + def get_entry(self, key): + return self.get_by_id(key) + + +class TwoLayerIndexedFASTA(IndexedFASTA): + """Parser with two-layer index. Extracted groups are mapped to full headers (where possible), + full headers are mapped to byte offsets. 
+ + When indexed, the key is looked up in both indexes, allowing access by meaningful IDs + (like UniProt accession) and by full header string. + """ + header_group = 1 + header_pattern = None + def __init__(self, source, header_pattern=None, header_group=None, + ignore_comments=False, parser=None, **kwargs): + """Open `source` and create a two-layer index for convenient random access + both by full header strings and extracted fields. + + Parameters + ---------- + source : str or file-like + File to read. If file object, it must be opened in *binary* mode. + header_pattern : str or RE or None, optional + Pattern to match the header string. Must capture the group used + for the second index. If :py:const:`None` (default), second-level index is not created. + header_group : int or str or None, optional + Defines which group is used as key in the second-level index. + Default is 1. + ignore_comments : bool, optional + If :py:const:`True` then ignore the second and subsequent lines of description. + Default is :py:const:`False`, which concatenates multi-line descriptions into + a single string. + parser : function or None, optional + Defines whether the FASTA descriptions should be parsed. If it is a + function, that function will be given the description string, and + the returned value will be yielded together with the sequence. + The :py:data:`std_parsers` dict has parsers for several formats. + Hint: specify :py:func:`parse` as the parser to apply automatic + format recognition. + Default is :py:const:`None`, which means return the header "as is". + + Other arguments : the same as for :py:class:`IndexedFASTA`. + """ + super(TwoLayerIndexedFASTA, self).__init__(source, ignore_comments, parser, **kwargs) + if header_group is not None: + self.header_group = header_group + if header_pattern is not None: + self.header_pattern = header_pattern + if not kwargs.get('_skip_index', False): + self.build_second_index() + self._init_args = (source, header_pattern, header_group, ignore_comments, parser) + self._init_kwargs = kwargs + + def build_second_index(self): + """Create the mapping from extracted field to whole header string.""" + if self.header_pattern is None: + self._id2header = None + else: + index = {} + for key in self._offset_index: + match = re.match(self.header_pattern, key) + if match: + index[match.group(self.header_group)] = key + self._id2header = index + + def __getstate__(self): + state = super(TwoLayerIndexedFASTA, self).__getstate__() + state['id2header'] = self._id2header + return state + + def __setstate__(self, state): + super(TwoLayerIndexedFASTA, self).__setstate__(state) + self._id2header = state['id2header'] + + def get_by_id(self, key): + """Get the entry by value of header string or extracted field.""" + try: + return super(TwoLayerIndexedFASTA, self).get_by_id(key) + except KeyError: + if self._id2header: + header = self._id2header.get(key) + if header is not None: + return super(TwoLayerIndexedFASTA, self).get_entry(header) + raise KeyError(key) + + def get_header(self, key): + if key in self._id2header: + return self._id2header[key] + raise KeyError(key) + + def __contains__(self, key): + return super(TwoLayerIndexedFASTA, self).__contains__(key) or key in self._id2header + + +class _FastaParserFlavorMeta(abc.ABCMeta): + def __new__(mcs, name, bases, namespace): + if "parser" in namespace: + namespace["parser"] = _add_raw_field(namespace["parser"]) + if name != 'FlavoredMixin': + reader_type = None + for t in (FASTA, IndexedFASTA, TwoLayerIndexedFASTA): + if t in 
bases:
+                        reader_type = t
+
+            if reader_type is not None:
+                # this is a "concrete" reader class:
+                # add a unified __init__ method for it
+                for c in bases:
+                    if issubclass(c, FlavoredMixin):
+                        flavor = c
+                        break
+                else:
+                    raise aux.PyteomicsError(
+                        'Could not detect flavor of {}, not a subclass of `FlavoredMixin`.'.format(name))
+
+                def __init__(self, source, parse=True, **kwargs):
+                    reader_type.__init__(self, source, **kwargs)
+                    flavor.__init__(self, parse)
+                    self._init_args = (source, parse)
+                    self._init_kwargs = kwargs
+
+                flavor_name = name[:-5]
+                type_name = "Text-mode" if reader_type is FASTA else "Indexed"
+                __init__.__doc__ = """Creates a :py:class:`{}` object.
+
+        Parameters
+        ----------
+        source : str or file
+            The file to read. If a file object, it needs to be in *{}* mode.
+        parse : bool, optional
+            Defines whether the descriptions should be parsed in the produced tuples.
+            Default is :py:const:`True`.
+        kwargs : passed to the :py:class:`{}` constructor.
+        """.format(name, 'text' if reader_type is FASTA else 'binary', reader_type.__name__)
+                namespace['__init__'] = __init__
+                namespace['__doc__'] = """{} parser for {} FASTA files.""".format(type_name, flavor_name)
+
+        return super(_FastaParserFlavorMeta, mcs).__new__(mcs, name, bases, namespace)
+
+
+@add_metaclass(_FastaParserFlavorMeta)
+class FlavoredMixin():
+    """Parser aimed at a specific FASTA flavor.
+    Subclasses should define `parser` and `header_pattern`.
+    The `parse` argument in :py:meth:`__init__` defines whether description is
+    parsed in output.
+    """
+    def __init__(self, parse=True):
+        if not parse:
+            self.parser = None
+
+
+class UniProtMixin(FlavoredMixin):
+    header_pattern = r'^(?P<db>\w+)\|(?P<id>[-\w]+)\|(?P<entry>\w+)\s+(?P<name>.*?)(?:(\s+OS=(?P<OS>[^=]+))|(\s+OX=(?P<OX>\d+))|(\s+GN=(?P<GN>\S+))|(\s+PE=(?P<PE>\d))|(\s+SV=(?P<SV>\d+)))*\s*$'
+    header_group = 'id'
+
+    def parser(self, header):
+        info = re.match(self.header_pattern, header).groupdict()
+        for key in ['OS', 'OX', 'GN', 'PE', 'SV']:
+            if info[key] is None:
+                del info[key]
+        info['gene_id'], info['taxon'] = info['entry'].split('_')
+        _intify(info, ('PE', 'SV', 'OX'))
+        return info
+
+
+class UniProt(UniProtMixin, FASTA):
+    pass
+
+
+class IndexedUniProt(UniProtMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class UniRefMixin(FlavoredMixin):
+    header_pattern = r'^(?P<id>\S+)\s+(?P<cluster>.*?)(?:(\s+n=(?P<n>\d+))|(\s+Tax=(?P<Tax>.+?))|(\s+TaxID=(?P<TaxID>\S+))|(\s+RepID=(?P<RepID>\S+)))*\s*$'
+    header_group = 'id'
+
+    def parser(self, header):
+        assert 'Tax' in header
+        info = re.match(self.header_pattern, header).groupdict()
+        for key in ['TaxID', 'Tax', 'RepID', 'n']:
+            if info[key] is None:
+                del info[key]
+        _intify(info, ('n',))
+        return info
+
+
+class UniRef(UniRefMixin, FASTA):
+    pass
+
+
+class IndexedUniRef(UniRefMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class UniParcMixin(FlavoredMixin):
+    header_pattern = r'(\S+)\s+status=(\w+)\s*$'
+
+    def parser(self, header):
+        ID, status = re.match(self.header_pattern, header).groups()
+        return {'id': ID, 'status': status}
+
+
+class UniParc(UniParcMixin, FASTA):
+    pass
+
+
+class IndexedUniParc(UniParcMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class UniMesMixin(FlavoredMixin):
+    header_pattern = r'^(\S+)\s+([^=]*\S)((\s+\w+=[^=]+(?!\w*=))+)\s*$'
+
+    def parser(self, header):
+        assert 'OS=' in header and 'SV=' in header and 'PE=' not in header
+        ID, name, pairs, _ = re.match(self.header_pattern, header).groups()
+        info = {'id': ID, 'name': name}
+        info.update(_split_pairs(pairs))
+        _intify(info, ('SV',))
+        return info
+
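+
+# A minimal usage sketch (editorial illustration; the header below is a typical
+# SwissProt-style example string and is not taken from this module):
+#
+#     import re
+#     m = re.match(UniProtMixin.header_pattern,
+#                  'sp|P01308|INS_HUMAN Insulin OS=Homo sapiens OX=9606 GN=INS PE=1 SV=1')
+#     m.group('id'), m.group('GN'), m.group('OS')  # ('P01308', 'INS', 'Homo sapiens')
+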
+class UniMes(UniMesMixin, FASTA): + pass + + +class IndexedUniMes(UniMesMixin, TwoLayerIndexedFASTA): + pass + + +class SPDMixin(FlavoredMixin): + header_pattern = r'^([^|]+?)\s*\|\s*(([^|]+?)_([^|]+?))\s*\|\s*([^|]+?)\s*$' + + def parser(self, header): + assert '=' not in header + ID, gene, gid, taxon, d = re.match(self.header_pattern, header).groups() + return {'id': ID, 'gene': gene, 'description': d, + 'taxon': taxon, 'gene_id': gid} + + +class SPD(SPDMixin, FASTA): + pass + + +class IndexedSPD(SPDMixin, TwoLayerIndexedFASTA): + pass + + +class NCBIMixin(FlavoredMixin): + header_pattern = r'^(\S+)\s+(.*\S)\s+\[(.*)\]' + + def parser(self, header): + ID, description, organism = re.match(self.header_pattern, header).groups() + return {'id': ID, 'description': description, 'taxon': organism} + + +class NCBI(NCBIMixin, FASTA): + pass + + +class IndexedNCBI(NCBIMixin, TwoLayerIndexedFASTA): + pass + + +class RefSeqMixin(FlavoredMixin): + header_pattern = r'^ref\|([^|]+)\|\s*([^\[]*\S)\s*\[(.*)\]' + + def parser(self, header): + ID, description, organism = re.match(self.header_pattern, header).groups() + return {'id': ID, 'description': description, 'taxon': organism} + + +class RefSeq(RefSeqMixin, FASTA): + pass + + +class IndexedRefSeq(RefSeqMixin, TwoLayerIndexedFASTA): + pass + + +def read(source=None, use_index=None, flavor=None, **kwargs): + """Parse a FASTA file. This function serves as a dispatcher between + different parsers available in this module. + + Parameters + ---------- + source : str or file or None, optional + A file object (or file name) with a FASTA database. Default is + :py:const:`None`, which means read standard input. + use_index : bool, optional + If :py:const:`True`, the created parser object will be an instance of + :py:class:`IndexedFASTA`. If :py:const:`False` (default), it will be + an instance of :py:class:`FASTA`. + flavor : str or None, optional + A supported FASTA header format. If specified, a format-specific + parser instance is returned. + + .. note:: See :py:data:`std_parsers` for supported flavors. + + Returns + ------- + out : iterator of tuples + A named 2-tuple with FASTA header (str or dict) and sequence (str). + Attributes 'description' and 'sequence' are also provided. + """ + try: + parser = std_parsers[flavor and flavor.lower()] + except KeyError: + raise aux.PyteomicsError('No parser for flavor: {}. Supported flavors: {}'.format( + flavor, ', '.join(map(str, std_parsers)))) + use_index = aux._check_use_index(source, use_index, False) + return parser[use_index](source, **kwargs) + + +@aux._file_writer() +def write(entries, output=None): + """ + Create a FASTA file with `entries`. + + Parameters + ---------- + entries : iterable of (str/dict, str) tuples + An iterable of 2-tuples in the form (description, sequence). + If description is a dictionary, it must have a special key, whose value + will be written as protein description. The special key is defined by the variable + :py:const:`RAW_HEADER_KEY`. + output : file-like or str, optional + A file open for writing or a path to write to. If the file exists, + it will be opened for writing. Default is :py:const:`None`, which + means write to standard output. + + .. note:: + The default mode for output files specified by name has been changed + from `a` to `w` in *pyteomics 4.6*. See `file_mode` to override the mode. + + file_mode : str, keyword only, optional + If `output` is a file name, defines the mode the file will be opened in. + Otherwise will be ignored. Default is `'w'`. + + .. 
note ::
+            The default changed from `'a'` in *pyteomics 4.6*.
+
+    Returns
+    -------
+    output_file : file object
+        The file where the FASTA is written.
+    """
+    for descr, seq in entries:
+        if isinstance(descr, str):
+            output.write('>' + descr.replace('\n', '\n;') + '\n')
+        elif isinstance(descr, dict) and RAW_HEADER_KEY in descr:
+            output.write('>' + descr[RAW_HEADER_KEY].replace('\n', '\n;') + '\n')
+        else:
+            raise aux.PyteomicsError('Cannot use provided description: ' + repr(descr))
+        output.write(''.join([('%s\n' % seq[i:i+70])
+                              for i in range(0, len(seq), 70)]) + '\n')
+
+    return output.file
+
+
+def reverse(sequence, keep_nterm=False, keep_cterm=False):
+    """
+    Create a decoy sequence by reversing the original one.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    keep_nterm : bool, optional
+        If :py:const:`True`, then the N-terminal residue will be kept.
+        Default is :py:const:`False`.
+    keep_cterm : bool, optional
+        If :py:const:`True`, then the C-terminal residue will be kept.
+        Default is :py:const:`False`.
+
+    Returns
+    -------
+    decoy_sequence : str
+        The decoy sequence.
+    """
+    start = 1 if keep_nterm else 0
+    end = len(sequence)-1 if keep_cterm else len(sequence)
+    if start == end:
+        return sequence
+    return sequence[:start] + sequence[start:end][::-1] + sequence[end:]
+
+
+def shuffle(sequence, keep_nterm=False, keep_cterm=False, keep_nterm_M=False, fix_aa=''):
+    """
+    Create a decoy sequence by shuffling the original one.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    keep_nterm : bool, optional
+        If :py:const:`True`, then the N-terminal residue will be kept.
+        Default is :py:const:`False`.
+    keep_cterm : bool, optional
+        If :py:const:`True`, then the C-terminal residue will be kept.
+        Default is :py:const:`False`.
+    keep_nterm_M : bool, optional
+        If :py:const:`True`, then the N-terminal methionine will be kept.
+        Default is :py:const:`False`.
+    fix_aa : iterable, optional
+        Single-letter codes for amino acids that should preserve their position
+        during shuffling.
+        Default is ''.
+
+    Returns
+    -------
+    decoy_sequence : str
+        The decoy sequence.
+    """
+
+    # empty sequence
+    if len(sequence) == 0:
+        return ''
+
+    # preserve the first position
+    if (keep_nterm_M and sequence[0] == 'M') or keep_nterm:
+        return sequence[0] + shuffle(sequence[1:], keep_cterm=keep_cterm,
+                                     fix_aa=fix_aa)
+
+    # preserve the last position
+    if keep_cterm:
+        return shuffle(sequence[:-1], fix_aa=fix_aa) + sequence[-1]
+
+    if not isinstance(fix_aa, str):
+        fix_aa = ''.join(fix_aa)
+
+    fixed = []
+    position = 0
+    if len(fix_aa) > 0:  # non-empty fixed list
+        shuffled = []
+        for match in re.finditer(r'[{}]'.format(fix_aa), sequence):
+            fixed.append((match.start(), sequence[match.start()]))
+            shuffled.extend(sequence[position:match.start()])
+            position = match.end()
+        shuffled.extend(sequence[position:])
+
+    else:  # shuffle everything
+        shuffled = list(sequence)
+
+    random.shuffle(shuffled)
+
+    for fix in fixed:
+        shuffled.insert(fix[0], fix[1])
+
+    return ''.join(shuffled)
+
+
+def fused_decoy(sequence, decoy_mode='reverse', sep='R', **kwargs):
+    """
+    Create a "fused" decoy sequence by concatenating a decoy sequence with the original one.
+    The method and its use cases are described in:
+
+    Ivanov, M. V., Levitsky, L. I., & Gorshkov, M. V. (2016).
+    `Adaptation of Decoy Fusion Strategy for Existing Multi-Stage Search Workflows.
+    <http://doi.org/10.1007/s13361-016-1436-7>`_
+    Journal of The American Society for Mass Spectrometry, 27(9), 1579-1582.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    decoy_mode : str or callable, optional
+        Type of decoy sequence to use. Should be one of the standard modes or any callable.
+        Standard modes are:
+
+        - 'reverse' for :py:func:`reverse`;
+        - 'shuffle' for :py:func:`shuffle`;
+        - 'fused' for :py:func:`fused_decoy` (if you love recursion).
+
+        Default is 'reverse'.
+    sep : str, optional
+        Amino acid motif that separates the decoy sequence from the target one.
+        This setting should reflect the enzyme specificity used in the search against the
+        database being generated. Default is 'R', which is suitable for trypsin searches.
+    **kwargs : given to the decoy generation function.
+
+    Examples
+    --------
+    >>> fused_decoy('PEPT')
+    'TPEPRPEPT'
+    >>> fused_decoy('MPEPT', 'shuffle', 'K', keep_nterm=True)
+    'MPPTEKMPEPT'
+    """
+    decoy = decoy_sequence(sequence, decoy_mode, **kwargs)
+    return decoy + sep + sequence
+
+
+_decoy_functions = {'reverse': reverse, 'shuffle': shuffle, 'fused': fused_decoy}
+
+
+def decoy_sequence(sequence, mode='reverse', **kwargs):
+    """
+    Create a decoy sequence out of a given sequence string.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    mode : str or callable, optional
+        Type of decoy sequence. Should be one of the standard modes or any callable.
+        Standard modes are:
+
+        - 'reverse' for :py:func:`reverse`;
+        - 'shuffle' for :py:func:`shuffle`;
+        - 'fused' for :py:func:`fused_decoy`.
+
+        Default is 'reverse'.
+    **kwargs : given to the decoy function.
+
+    Returns
+    -------
+    decoy_sequence : str
+        The decoy sequence.
+    """
+    fmode = mode
+    if isinstance(mode, str):
+        fmode = _decoy_functions.get(mode)
+        if fmode is None:
+            raise aux.PyteomicsError('Unsupported decoy mode: {}'.format(mode))
+    return fmode(sequence, **kwargs)
+
+
+def decoy_entries(entries, mode='reverse', prefix=DECOY_PREFIX, decoy_only=True, **kwargs):
+    """Iterate over protein `entries` (tuples) and produce decoy entries.
+    The `entries` are only iterated once.
+
+    Parameters
+    ----------
+    entries : iterable of tuples
+        Any iterable of (description, sequence) pairs.
+    mode : str or callable, optional
+        Algorithm of decoy sequence generation. 'reverse' by default.
+        See :py:func:`decoy_sequence` for more information.
+    prefix : str, optional
+        A prefix to the protein descriptions of decoy entries. The default
+        value is `'DECOY_'`.
+    decoy_only : bool, optional
+        If set to :py:const:`True`, only the decoy entries will be yielded.
+        If :py:const:`False`, each consumed entry is yielded unchanged,
+        followed by its decoy counterpart.
+        :py:const:`True` by default.
+    **kwargs : given to :py:func:`decoy_sequence`.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over new entries.
+    """
+    for item in entries:
+        if not decoy_only:
+            yield item
+        yield Protein(prefix + item[0], decoy_sequence(item[1], mode, **kwargs))
+
+
+@aux._file_reader()
+def decoy_db(source=None, mode='reverse', prefix=DECOY_PREFIX, decoy_only=False,
+             ignore_comments=False, parser=None, **kwargs):
+    """Iterate over sequences for a decoy database out of a given ``source``.
+
+    Parameters
+    ----------
+    source : file-like object or str or None, optional
+        A path to a FASTA database or a file object itself. Default is
+        :py:const:`None`, which means read standard input.
+    mode : str or callable, optional
+        Algorithm of decoy sequence generation. 
'reverse' by default. + See :py:func:`decoy_sequence` for more information. + prefix : str, optional + A prefix to the protein descriptions of decoy entries. The default + value is `'DECOY_'`. + decoy_only : bool, optional + If set to :py:const:`True`, only the decoy entries will be written to + `output`. If :py:const:`False`, the entries from `source` will be + written first. + :py:const:`False` by default. + ignore_comments : bool, optional + If True then ignore the second and subsequent lines of description. + Default is :py:const:`False`. + parser : function or None, optional + Defines whether the fasta descriptions should be parsed. If it is a + function, that function will be given the description string, and + the returned value will be yielded together with the sequence. + The :py:data:`std_parsers` dict has parsers for several formats. + Hint: specify :py:func:`parse` as the parser to apply automatic + format guessing. + Default is :py:const:`None`, which means return the header "as is". + **kwargs : given to :py:func:`decoy_sequence`. + + Returns + ------- + out : iterator + An iterator over entries of the new database. + """ + + # store the initial position + pos = source.tell() + if not decoy_only: + with read(source, ignore_comments, parser) as f: + for x in f: + yield x + # return to the initial position in the source file to read again + source.seek(pos) + + parser = parser or (lambda x: x) + with read(source, ignore_comments) as f: + for descr, seq in f: + yield Protein(parser(prefix + descr), decoy_sequence(seq, mode, **kwargs)) + + +@aux._file_writer() +def write_decoy_db(source=None, output=None, mode='reverse', prefix=DECOY_PREFIX, + decoy_only=False, **kwargs): + """Generate a decoy database out of a given ``source`` and write to file. + + If `output` is a path, the file will be open for appending, so no information + will be lost if the file exists. Although, the user should be careful when + providing open file streams as `source` and `output`. The reading and writing + will start from the current position in the files, which is where the last I/O + operation finished. One can use the :py:func:`file.seek` method to change it. + + Parameters + ---------- + source : file-like object or str or None, optional + A path to a FASTA database or a file object itself. Default is + :py:const:`None`, which means read standard input. + output : file-like object or str, optional + A path to the output database or a file open for writing. + Defaults to :py:const:`None`, the results go to the standard output. + mode : str or callable, optional + Algorithm of decoy sequence generation. 'reverse' by default. + See :py:func:`decoy_sequence` for more details. + prefix : str, optional + A prefix to the protein descriptions of decoy entries. The default + value is `'DECOY_'` + decoy_only : bool, optional + If set to :py:const:`True`, only the decoy entries will be written to + `output`. If :py:const:`False`, the entries from `source` will be + written as well. + :py:const:`False` by default. + file_mode : str, keyword only, optional + If `output` is a file name, defines the mode the file will be opened in. + Otherwise will be ignored. Default is 'a'. + **kwargs : given to :py:func:`decoy_sequence`. + + Returns + ------- + output : file + A (closed) file object for the created file. 
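
A usage sketch from the editor (the file names here are hypothetical; ``mode='shuffle'`` is one of the standard modes of :py:func:`decoy_sequence`):

    >>> from pyteomics import fasta
    >>> fasta.write_decoy_db('yeast.fasta', 'yeast_decoy.fasta',
    ...                      mode='shuffle', prefix='DECOY_')  # doctest: +SKIP

Because `decoy_only` defaults to :py:const:`False`, the original entries are written first, followed by their shuffled decoys; the returned value is the (closed) output file object.
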
+ """ + with decoy_db(source, mode, prefix, decoy_only, **kwargs) as entries: + write(entries, output) + return output.file + + +# auxiliary functions for parsing of FASTA headers +def _split_pairs(s): + return dict(map(lambda x: x.strip(), x.split('=')) + for x in re.split(r' (?=\w+=)', s.strip())) + + +def _intify(d, keys): + for k in keys: + if k in d: + d[k] = int(d[k]) + + +std_parsers = {'uniprot': (UniProt, IndexedUniProt), 'uniref': (UniRef, IndexedUniRef), + 'uniparc': (UniParc, IndexedUniParc), 'unimes': (UniMes, IndexedUniMes), + 'spd': (SPD, IndexedSPD), 'ncbi': (NCBI, IndexedNCBI), + 'refseq': (RefSeq, IndexedRefSeq), + None: (FASTA, IndexedFASTA)} +"""A dictionary with parsers for known FASTA header formats. For now, supported +formats are those described at +`UniProt help page <http://www.uniprot.org/help/fasta-headers>`_.""" + + +_std_mixins = {'uniprot': UniProtMixin, 'uniref': UniRefMixin, + 'uniparc': UniParcMixin, 'unimes': UniMesMixin, 'spd': SPDMixin, + 'ncbi': NCBIMixin, 'refseq': RefSeqMixin} + + +def parse(header, flavor='auto', parsers=None): + """Parse the FASTA header and return a nice dictionary. + + Parameters + ---------- + + header : str + FASTA header to parse + flavor : str, optional + Short name of the header format (case-insensitive). Valid values are + :py:const:`'auto'` and keys of the `parsers` dict. Default is + :py:const:`'auto'`, which means try all formats in turn and return the + first result that can be obtained without an exception. + parsers : dict, optional + A dict where keys are format names (lowercased) and values are functions + that take a header string and return the parsed header. + + Returns + ------- + + out : dict + A dictionary with the info from the header. The format depends on the + flavor. + """ + parser_function = lambda cls: cls().parser + flavor = flavor.lower() + # accept strings with and without leading '>' + if header and header[0] == '>': + header = header[1:] + + # choose the format + known = parsers or _std_mixins + + if flavor == 'auto': + for parser in known.values(): + try: + return parser_function(parser)(header) + except Exception: + pass + raise aux.PyteomicsError('Unknown FASTA header format: ' + header) + elif flavor in known: + try: + return parser_function(known[flavor])(header) + except Exception as e: + raise aux.PyteomicsError('Could not parse header as "{}". ' + 'The error message was: {}: {}. Header: "{}"'.format( + flavor, type(e).__name__, e.args[0], header)) + raise aux.PyteomicsError('Unknown flavor: {}'.format(flavor)) + + +chain = aux._make_chain(read, 'read') +decoy_chain = aux._make_chain(decoy_db, 'decoy_db') diff --git a/pyteomics/mass/__init__.py b/pyteomics/mass/__init__.py new file mode 100644 index 0000000..a5b9981 --- /dev/null +++ b/pyteomics/mass/__init__.py @@ -0,0 +1,6 @@ +from .mass import * +try: + from . import unimod +except ImportError: + # SQLAlchemy is not available + pass \ No newline at end of file diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py new file mode 100644 index 0000000..03afee2 --- /dev/null +++ b/pyteomics/mass/mass.py @@ -0,0 +1,1231 @@ +""" +mass - molecular masses and isotope distributions +================================================= + +Summary +------- + +This module defines general functions for mass and isotope abundance +calculations. 
For most of the functions, the user can define a given +substance in various formats, but all of them would be reduced to the +:py:func:`Composition <Composition.__init__>` object describing its +chemical composition. + + +Classes +------- + + :py:func:`Composition <Composition.__init__>` - a class storing chemical + composition of a substance. + + :py:class:`Unimod` - a class representing a Python interface to the + `Unimod database <http://unimod.org/>`_ + (see :py:mod:`pyteomics.mass.unimod` for a much more powerful alternative). + +Mass calculations +----------------- + + :py:func:`calculate_mass` - a general routine for mass / m/z + calculation. Can calculate mass for a polypeptide sequence, chemical + formula or elemental composition. Supplied with an ion type and + charge, the function would calculate m/z. + + :py:func:`fast_mass` - a less powerful but much faster function for + polypeptide mass calculation. + + :py:func:`fast_mass2` - a version of `fast_mass` that supports *modX* notation. + +Isotopic abundances +------------------- + + :py:func:`isotopic_composition_abundance` - calculate the relative + abundance of a given isotopic composition. + + :py:func:`most_probable_isotopic_composition` - finds the most + abundant isotopic composition for a molecule defined by a + polypeptide sequence, chemical formula or elemental composition. + + :py:func:`isotopologues` - iterate over possible isotopic conposition of a molecule, + possibly filtered by abundance. + +Data +---- + + :py:data:`nist_mass` - a dict with exact masses of the most abundant + isotopes. + + :py:data:`std_aa_comp` - a dict with the elemental compositions + of the standard twenty amino acid residues, selenocysteine and pyrrolysine. + + :py:data:`std_ion_comp` - a dict with the relative elemental + compositions of the standard peptide fragment ions. + + :py:data:`std_aa_mass` - a dict with the monoisotopic masses + of the standard twenty amino acid residues, selenocysteine and pyrrolysine. + +----------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +import math +from .. import parser +from ..auxiliary import PyteomicsError, _nist_mass, BasicComposition +from itertools import chain, product, combinations_with_replacement +from collections import defaultdict +try: + from urllib import urlopen +except ImportError: + from urllib.request import urlopen +from datetime import datetime +import re +import operator +import warnings + +nist_mass = _nist_mass +""" +A dict with the exact element masses downloaded from the NIST website: +http://www.nist.gov/pml/data/comp.cfm . There are entries for each +element containing the masses and relative abundances of several +abundant isotopes and a separate entry for undefined isotope with zero +key, mass of the most abundant isotope and 1.0 abundance. 
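
For instance (editor's note, illustrating the layout described above, where the zero key maps to the most abundant isotope with 1.0 abundance):

    >>> from pyteomics import mass
    >>> mass.nist_mass['C'][0]   # most abundant isotope of carbon, unit abundance
    (12.0, 1.0)
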
+""" + +PROTON = 'H+' + +def _make_isotope_string(element_name, isotope_num): + """Form a string label for an isotope.""" + if isotope_num == 0: + return element_name + else: + return '{}[{}]'.format(element_name, isotope_num) + + +def _parse_isotope_string(label): + """Parse an string with an isotope label and return the element name and + the isotope number. + + >>> _parse_isotope_string('C') + ('C', 0) + >>> _parse_isotope_string('C[12]') + ('C', 12) + """ + element_name, num = re.match(_isotope_string, label).groups() + isotope_num = int(num) if num else 0 + return element_name, isotope_num + + +# Initialize std_aa_comp and std_ion_comp before the Composition class +# description, fill it later. +std_aa_comp = {} +"""A dictionary with elemental compositions of the twenty standard +amino acid residues, selenocysteine, pyrrolysine, +and standard H- and -OH terminal groups. +""" + +std_ion_comp = {} +"""A dict with relative elemental compositions of the standard peptide +fragment ions. An elemental composition of a fragment ion is calculated as a +difference between the total elemental composition of an ion +and the sum of elemental compositions of its constituting amino acid residues. +""" + +_isotope_string = r'^([A-Z][a-z+]*)(?:\[(\d+)\])?$' +_atom = r'([A-Z][a-z+]*)(?:\[(\d+)\])?([+-]?\d+)?' +_formula = r'^({})*$'.format(_atom) + + +class Composition(BasicComposition): + """ + A Composition object stores a chemical composition of a + substance. Basically, it is a dict object, with the names + of chemical elements as keys and values equal to an integer number of + atoms of the corresponding element in a substance. + + The main improvement over dict is that Composition objects allow + adding and subtraction. + """ + _kw_sources = {'formula', 'sequence', 'parsed_sequence', 'split_sequence', 'composition'} + _carrier_spec = r"^(?P<formula>\S+?)(?:(?P<sign>[+-])(?P<charge>\d+)?)?$" + + def _from_parsed_sequence(self, parsed_sequence, aa_comp): + self.clear() + comp = defaultdict(int) + for label in parsed_sequence: + if label in aa_comp: + for elem, cnt in aa_comp[label].items(): + comp[elem] += cnt + else: + try: + mod, aa = parser._split_label(label) + for elem, cnt in chain( + aa_comp[mod].items(), aa_comp[aa].items()): + comp[elem] += cnt + + except (PyteomicsError, KeyError): + raise PyteomicsError('No information for %s in `aa_comp`' % label) + self._from_composition(comp) + + def _from_split_sequence(self, split_sequence, aa_comp): + self.clear() + comp = defaultdict(int) + for group in split_sequence: + i = 0 + while i < len(group): + for j in range(len(group) + 1, -1, -1): + try: + label = ''.join(group[i:j]) + for elem, cnt in aa_comp[label].items(): + comp[elem] += cnt + except KeyError: + continue + else: + i = j + break + if j == 0: + raise PyteomicsError("Invalid group starting from position %d: %s" % (i + 1, group)) + self._from_composition(comp) + + def _from_sequence(self, sequence, aa_comp): + parsed_sequence = parser.parse( + sequence, + labels=aa_comp, + show_unmodified_termini=True) + self._from_parsed_sequence(parsed_sequence, aa_comp) + + def _from_formula(self, formula): + if not re.match(_formula, formula): + raise PyteomicsError('Invalid formula: ' + formula) + for elem, isotope, number in re.findall(_atom, formula): + self[_make_isotope_string(elem, int(isotope) if isotope else 0)] += int(number) if number else 1 + + def _from_composition(self, comp): + for isotope_string, num_atoms in comp.items(): + element_name, isotope_num = _parse_isotope_string( + 
isotope_string)
+
+            # Remove explicitly undefined isotopes (e.g. X[0]).
+            self[_make_isotope_string(element_name, isotope_num)] = num_atoms
+
+    def __init__(self, *args, **kwargs):
+        """
+        A Composition object stores a chemical composition of a
+        substance. Basically it is a dict object, in which keys are the names
+        of chemical elements and values contain integer numbers of
+        corresponding atoms in a substance.
+
+        The main improvement over dict is that Composition objects allow
+        addition and subtraction.
+
+        A Composition object can be initialized with one of the
+        following arguments: formula, sequence, parsed_sequence or
+        split_sequence.
+
+        If none of these are specified, the constructor will look at the first
+        positional argument and try to build the object from it. Without
+        positional arguments, a Composition will be constructed directly from
+        keyword arguments.
+
+        If there's an ambiguity, i.e. the argument is both a valid sequence
+        and a formula (such as 'HCN'), it will be treated as a sequence. You
+        need to provide the 'formula' keyword to override this.
+
+        .. warning::
+
+            Be careful when supplying a list with a parsed sequence or a split
+            sequence as a keyword argument. It must be
+            obtained with enabled `show_unmodified_termini` option.
+            When supplying it as a positional argument, the option doesn't
+            matter, because the positional argument is always converted to
+            a sequence prior to any processing.
+
+        Parameters
+        ----------
+        formula : str, optional
+            A string with a chemical formula. All elements must be present in
+            `mass_data`.
+        sequence : str, optional
+            A polypeptide sequence string in modX notation.
+        parsed_sequence : list of str, optional
+            A polypeptide sequence parsed into a list of amino acids.
+        split_sequence : list of tuples of str, optional
+            A polypeptide sequence parsed into a list of tuples
+            (as returned by :py:func:`pyteomics.parser.parse` with
+            ``split=True``).
+        aa_comp : dict, optional
+            A dict with the elemental composition of the amino acids (the
+            default value is std_aa_comp).
+        ion_comp : dict, optional
+            A dict with the relative elemental compositions of peptide ion
+            fragments (default is :py:data:`std_ion_comp`).
+        ion_type : str, optional
+            If specified, then the polypeptide is considered to be in the form
+            of the corresponding ion.
+        """
+        defaultdict.__init__(self, int)
+
+        aa_comp = kwargs.get('aa_comp', std_aa_comp)
+
+        kw_given = self._kw_sources.intersection(kwargs)
+        if len(kw_given) > 1:
+            raise PyteomicsError('Only one of {} can be specified!\n'
+                                 'Given: {}'.format(', '.join(self._kw_sources),
+                                                    ', '.join(kw_given)))
+        elif kw_given:
+            kwa = kw_given.pop()
+            if kwa == 'formula':
+                self._from_formula(kwargs['formula'])
+            else:
+                getattr(self, '_from_' + kwa)(kwargs[kwa], aa_comp)
+
+        # can't build from kwargs
+        elif args:
+            if isinstance(args[0], dict):
+                self._from_composition(args[0])
+            elif isinstance(args[0], str):
+                try:
+                    self._from_sequence(args[0], aa_comp)
+                except PyteomicsError:
+                    try:
+                        self._from_formula(args[0])
+                    except PyteomicsError:
+                        raise PyteomicsError(
+                            'Could not create a Composition object from '
+                            'string: "{}": not a valid sequence or '
+                            'formula'.format(args[0]))
+            else:
+                try:
+                    self._from_sequence(parser.tostring(args[0], True), aa_comp)
+                except Exception:
+                    raise PyteomicsError('Could not create a Composition object'
+                                         ' from `{}`.
A Composition object must be ' + 'specified by sequence, parsed or split sequence,' + ' formula or dict.'.format(args[0])) + else: + self._from_composition(kwargs) + + ion_comp = kwargs.get('ion_comp', std_ion_comp) + if 'ion_type' in kwargs: + self += ion_comp[kwargs['ion_type']] + + # Charge is not supported in kwargs + charge = self['H+'] + if 'charge' in kwargs: + if charge: + raise PyteomicsError('Charge is specified both by the number of protons and `charge` in kwargs') + else: + warnings.warn('charge and charge carrier should be specified when calling mass(). ' + 'Support for charge in Composition.__init__ will be removed in a future version.', + FutureWarning) + self['H+'] = kwargs['charge'] + + @classmethod + def _parse_carrier(cls, spec): + """Parse a charge carrier spec. + The spec syntax is: <formula>[+-][N] + <formula> is a chemical formula as supported by :py:meth:`_from_formula`. + [+-] is one of "+" or "-", N is a natural number (1 is assumed if omitted). + If both the sign and the charge are missing, the charge of this group can be + specified as the number of protons in `<formula>`. Otherwise, having protons + in `<formula>` is an error. + + Returns + ------- + out : tuple + Parsed :py:class:`Composition` and charge of the charge carrier. + """ + if spec is None: + return cls({PROTON: 1}), 1 + try: + formula, sign, charge = re.match(cls._carrier_spec, spec).groups() + except AttributeError: + raise PyteomicsError('Invalid charge carrier specification: ' + spec) + comp = cls(formula=formula) + if sign is not None and PROTON in comp: + raise PyteomicsError('Carrier contains protons and also has a charge specified.') + if sign is None: + # only formula is given + if PROTON not in comp: + charge = None + charge = comp[PROTON] + elif charge is None: + charge = (-1, 1)[sign == '+'] + else: + charge = int(charge) * (-1, 1)[sign == '+'] + return comp, charge + + @staticmethod + def _mass_to_mz(mass, composition=None, **kwargs): + mass_data = kwargs.get('mass_data', nist_mass) + absolute = kwargs.get('absolute', True) + average = kwargs.get('average', False) + + # Calculate m/z if required + charge = kwargs.get('charge') + if charge: + # get charge carrier mass and charge + charge_carrier = kwargs.get('charge_carrier') + ccharge = kwargs.get('carrier_charge') + if isinstance(charge_carrier, dict): + carrier_comp = Composition(charge_carrier) + if ccharge and PROTON in carrier_comp: + raise PyteomicsError('`carrier_charge` specified but the charge carrier contains protons.') + carrier_charge = ccharge or carrier_comp[PROTON] + if not carrier_charge: + raise PyteomicsError('Charge carrier charge not specified.') + else: + carrier_comp, carrier_charge = (composition or Composition)._parse_carrier(charge_carrier) + if carrier_charge and ccharge: + raise PyteomicsError('Both `carrier_charge` and charge in carrier spec are given.') + carrier_charge = ccharge or carrier_charge + if not carrier_charge: + raise PyteomicsError('Charge of the charge carrier group not specified.') + if charge % carrier_charge: + raise PyteomicsError('The `charge` must be a multiple of the carrier charge. 
Given: {} and {}'.format(
+                    charge, carrier_charge))
+            num = charge // carrier_charge
+            carrier_mass = carrier_comp.mass(mass_data=mass_data, average=average, charge=0)
+
+        if charge and (composition is None or not composition['H+']):
+            mass += carrier_mass * num
+        if charge and composition and composition['H+']:
+            raise PyteomicsError('Composition contains protons and charge is explicitly specified.')
+        if charge is None and composition and composition['H+']:
+            warnings.warn('Charge is not specified, but the Composition contains protons. Assuming m/z calculation.')
+            charge = composition['H+']
+        if charge:
+            mass /= charge
+        if charge and charge < 0 and absolute:
+            mass = abs(mass)
+        return mass
+
+    def mass(self, **kwargs):
+        """Calculate the mass or *m/z* of a :py:class:`Composition`.
+
+        Parameters
+        ----------
+        average : bool, optional
+            If :py:const:`True` then the average mass is calculated.
+            Note that mass is not averaged for elements with specified isotopes.
+            Default is :py:const:`False`.
+        charge : int, optional
+            If not 0 then m/z is calculated. See also: `charge_carrier`.
+        charge_carrier : str or dict, optional
+            Chemical group carrying the charge. Defaults to a proton, "H+".
+            If string, must be a chemical formula, as supported by the
+            :class:`Composition` `formula` argument,
+            except it must end with a charge formatted as "[+-][N]".
+            If N is omitted, single charge is assumed.
+            Examples of `charge_carrier`: "H+", "NH3+"
+            (here, 3 is part of the composition, and + is a single charge),
+            "Fe+2" ("Fe" is the formula and "+2" is the charge).
+
+            .. note ::
+                `charge` must be a multiple of `charge_carrier` charge.
+
+            If dict, it is the atomic composition of the group.
+            In this case, the charge can be passed separately as `carrier_charge`
+            or it will be deduced from the number of protons in `charge_carrier`.
+        carrier_charge : int, optional
+            Charge of the charge carrier group (if `charge_carrier` is specified
+            as a composition dict).
+
+            .. note ::
+                `charge` must be a multiple of `carrier_charge`.
+
+        mass_data : dict, optional
+            A dict with the masses of the chemical elements (the default
+            value is :py:data:`nist_mass`).
+        ion_comp : dict, optional
+            A dict with the relative elemental compositions of peptide ion
+            fragments (default is :py:data:`std_ion_comp`).
+        ion_type : str, optional
+            If specified, then the polypeptide is considered to be in the form
+            of the corresponding ion. Do not forget to specify the charge state!
+        absolute : bool, optional
+            If :py:const:`True` (default), the m/z value returned will always be positive,
+            even for negatively charged ions.
+
+            .. note ::
+                `absolute` only applies when `charge` is negative.
+                The mass can still be negative for negative compositions.
+
+        Returns
+        -------
+        mass : float
+        """
+        composition = self
+        mass_data = kwargs.get('mass_data', nist_mass)
+
+        # Calculate mass
+        mass = 0.0
+        average = kwargs.get('average', False)
+
+        for isotope_string, amount in composition.items():
+            element_name, isotope_num = _parse_isotope_string(isotope_string)
+            # Calculate average mass if required and the isotope number is
+            # not specified.
+ if (not isotope_num) and average: + for isotope, data in mass_data[element_name].items(): + if isotope: + mass += (amount * data[0] * data[1]) + else: + mass += (amount * mass_data[element_name][isotope_num][0]) + + return self._mass_to_mz(mass, self, **kwargs) + + +std_aa_comp.update({ + 'A': Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1}), + 'C': Composition({'H': 5, 'C': 3, 'S': 1, 'O': 1, 'N': 1}), + 'D': Composition({'H': 5, 'C': 4, 'O': 3, 'N': 1}), + 'E': Composition({'H': 7, 'C': 5, 'O': 3, 'N': 1}), + 'F': Composition({'H': 9, 'C': 9, 'O': 1, 'N': 1}), + 'G': Composition({'H': 3, 'C': 2, 'O': 1, 'N': 1}), + 'H': Composition({'H': 7, 'C': 6, 'N': 3, 'O': 1}), + 'I': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}), + 'J': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}), + 'K': Composition({'H': 12, 'C': 6, 'N': 2, 'O': 1}), + 'L': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}), + 'M': Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1}), + 'N': Composition({'H': 6, 'C': 4, 'O': 2, 'N': 2}), + 'P': Composition({'H': 7, 'C': 5, 'O': 1, 'N': 1}), + 'Q': Composition({'H': 8, 'C': 5, 'O': 2, 'N': 2}), + 'R': Composition({'H': 12, 'C': 6, 'N': 4, 'O': 1}), + 'S': Composition({'H': 5, 'C': 3, 'O': 2, 'N': 1}), + 'T': Composition({'H': 7, 'C': 4, 'O': 2, 'N': 1}), + 'V': Composition({'H': 9, 'C': 5, 'O': 1, 'N': 1}), + 'W': Composition({'C': 11, 'H': 10, 'N': 2, 'O': 1}), + 'Y': Composition({'H': 9, 'C': 9, 'O': 2, 'N': 1}), + 'U': Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1, 'Se' : 1}), + 'O': Composition({'H': 19, 'C': 12, 'O': 2, 'N': 3}), + 'H-': Composition({'H': 1}), + '-OH': Composition({'O': 1, 'H': 1}), + }) + + +std_ion_comp.update({ + 'M': Composition(formula=''), + 'M-H2O': Composition(formula='H-2O-1'), + 'M-NH3': Composition(formula='N-1H-3'), + 'a': Composition(formula='H-2O-1' + 'C-1O-1'), + 'a-H2O': Composition(formula='H-2O-1' + 'C-1O-1' + 'H-2O-1'), + 'a-NH3': Composition(formula='H-2O-1' + 'C-1O-1' + 'N-1H-3'), + 'b': Composition(formula='H-2O-1'), + 'b-H2O': Composition(formula='H-2O-1' + 'H-2O-1'), + 'b-NH3': Composition(formula='H-2O-1' + 'N-1H-3'), + 'c': Composition(formula='H-2O-1' + 'NH3'), + 'c-1': Composition(formula='H-2O-1' + 'NH3' + 'H-1'), + 'c-dot': Composition(formula='H-2O-1' + 'NH3' + 'H1'), + 'c+1': Composition(formula='H-2O-1' + 'NH3' + 'H1'), + 'c+2': Composition(formula='H-2O-1' + 'NH3' + 'H2'), + 'c-H2O': Composition(formula='H-2O-1' + 'NH3' + 'H-2O-1'), + 'c-NH3': Composition(formula='H-2O-1'), + 'x': Composition(formula='H-2O-1' + 'CO2'), + 'x-H2O': Composition(formula='H-2O-1' + 'CO2' + 'H-2O-1'), + 'x-NH3': Composition(formula='H-2O-1' + 'CO2' + 'N-1H-3'), + 'y': Composition(formula=''), + 'y-H2O': Composition(formula='H-2O-1'), + 'y-NH3': Composition(formula='N-1H-3'), + 'z': Composition(formula='H-2O-1' + 'ON-1H-1'), + 'z-dot': Composition(formula='H-2O-1' + 'ON-1'), + 'z+1': Composition(formula='H-2O-1' + 'ON-1H1'), + 'z+2': Composition(formula='H-2O-1' + 'ON-1H2'), + 'z+3': Composition(formula='H-2O-1' + 'ON-1H3'), + 'z-H2O': Composition(formula='H-2O-1' + 'ON-1H-1' + 'H-2O-1'), + 'z-NH3': Composition(formula='H-2O-1' + 'ON-1H-1' + 'N-1H-3'), + }) + + +def calculate_mass(*args, **kwargs): + """Calculates the monoisotopic mass of a polypeptide defined by a + sequence string, parsed sequence, chemical formula or + Composition object. + + One or none of the following keyword arguments is required: + **formula**, **sequence**, **parsed_sequence**, **split_sequence** + or **composition**. 
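
A brief aside from the editor showing the ion and charge machinery documented below (the b ion of 'PE' is its two residues less water, plus one proton at ``charge=1``; the value is rounded):

    >>> from pyteomics import mass
    >>> round(mass.calculate_mass(sequence='PE', ion_type='b', charge=1), 2)
    227.1
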
+    All arguments given are used to create a :py:class:`Composition` object,
+    unless an existing one is passed as a keyword argument.
+
+    Note that if a sequence string is supplied and terminal groups are not
+    explicitly shown, then the mass is calculated for a polypeptide with
+    standard terminal groups (NH2- and -OH).
+
+    .. warning::
+
+        Be careful when supplying a list with a parsed sequence. It must be
+        obtained with enabled `show_unmodified_termini` option.
+
+    Parameters
+    ----------
+    formula : str, optional
+        A string with a chemical formula.
+    sequence : str, optional
+        A polypeptide sequence string in modX notation.
+    proforma : str, optional
+        A polypeptide sequence string in `ProForma notation <https://www.psidev.info/proforma>`_,
+        or a :py:class:`pyteomics.proforma.ProForma` object.
+    parsed_sequence : list of str, optional
+        A polypeptide sequence parsed into a list of amino acids.
+    composition : Composition, optional
+        A Composition object with the elemental composition of a substance.
+    aa_comp : dict, optional
+        A dict with the elemental composition of the amino acids (the
+        default value is std_aa_comp).
+    average : bool, optional
+        If :py:const:`True` then the average mass is calculated. Note that mass
+        is not averaged for elements with specified isotopes. Default is
+        :py:const:`False`.
+    charge : int, optional
+        If not 0 then m/z is calculated: the mass is increased
+        by the corresponding number of proton masses and divided
+        by `charge`.
+    charge_carrier : str or dict, optional
+        Chemical group carrying the charge. Defaults to a proton, "H+".
+        If string, must be a chemical formula, as supported by the
+        :class:`Composition` `formula` argument,
+        except it must end with a charge formatted as "[+-][N]".
+        If N is omitted, single charge is assumed.
+        Examples of `charge_carrier`: "H+", "NH3+"
+        (here, 3 is part of the composition, and + is a single charge),
+        "Fe+2" ("Fe" is the formula and "+2" is the charge).
+
+        .. note ::
+            `charge` must be a multiple of `charge_carrier` charge.
+
+        If dict, it is the atomic composition of the group.
+        In this case, the charge can be passed separately as `carrier_charge`
+        or it will be deduced from the number of protons in `charge_carrier`.
+    carrier_charge : int, optional
+        Charge of the charge carrier group (if `charge_carrier` is specified
+        as a composition dict).
+
+        .. note ::
+            `charge` must be a multiple of `carrier_charge`.
+
+    mass_data : dict, optional
+        A dict with the masses of the chemical elements (the default
+        value is :py:data:`nist_mass`).
+    ion_comp : dict, optional
+        A dict with the relative elemental compositions of peptide ion
+        fragments (default is :py:data:`std_ion_comp`).
+    ion_type : str, optional
+        If specified, then the polypeptide is considered to be in the form
+        of the corresponding ion. Do not forget to specify the charge state!
+    absolute : bool, optional
+        If :py:const:`True` (default), the m/z value returned will always be positive,
+        even for negatively charged ions.
+
+        .. note ::
+            `absolute` only applies when `charge` is negative.
+            The mass can still be negative for negative compositions.
+
+    Returns
+    -------
+    mass : float
+    """
+    if 'proforma' in kwargs:
+        # do not try to create a composition
+        from ..
import proforma + proteoform = kwargs.pop('proforma') + if isinstance(proteoform, str): + proteoform = proforma.ProForma.parse(proteoform) + return Composition._mass_to_mz(proteoform.mass, **kwargs) + + # These parameters must be passed to mass(), not __init__ + mass_kw = {} + for k in ['charge', 'charge_carrier', 'carrier_charge', 'absolute']: + if k in kwargs: + mass_kw[k] = kwargs.pop(k) + # Make a copy of `composition` keyword argument. + composition = (Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs)) + kwargs.update(mass_kw) + return composition.mass(**kwargs) + + +def most_probable_isotopic_composition(*args, **kwargs): + """Calculate the most probable isotopic composition of a peptide + molecule/ion defined by a sequence string, parsed sequence, + chemical formula or :py:class:`Composition` object. + + Note that if a sequence string without terminal groups is supplied then the + isotopic composition is calculated for a polypeptide with standard + terminal groups (H- and -OH). + + For each element, only two most abundant isotopes are considered. + + Parameters + ---------- + formula : str, optional + A string with a chemical formula. + sequence : str, optional + A polypeptide sequence string in modX notation. + parsed_sequence : list of str, optional + A polypeptide sequence parsed into a list of amino acids. + composition : :py:class:`Composition`, optional + A :py:class:`Composition` object with the elemental composition of a + substance. + elements_with_isotopes : list of str + A list of elements to be considered in isotopic distribution + (by default, every element has a isotopic distribution). + aa_comp : dict, optional + A dict with the elemental composition of the amino acids (the + default value is :py:data:`std_aa_comp`). + mass_data : dict, optional + A dict with the masses of chemical elements (the default + value is :py:data:`nist_mass`). + ion_comp : dict, optional + A dict with the relative elemental compositions of peptide ion + fragments (default is :py:data:`std_ion_comp`). + + Returns + ------- + out: tuple (Composition, float) + A tuple with the most probable isotopic composition and its + relative abundance. + """ + + composition = (dict(kwargs['composition']) if 'composition' in kwargs + else Composition(*args, **kwargs)) + + # Removing isotopes from the composition. + for isotope_string in composition: + element_name, isotope_num = _parse_isotope_string(isotope_string) + if isotope_num: + composition[element_name] += composition.pop(isotope_string) + + mass_data = kwargs.get('mass_data', nist_mass) + elements_with_isotopes = kwargs.get('elements_with_isotopes') + isotopic_composition = Composition() + + for element_name in composition: + if not elements_with_isotopes or (element_name in elements_with_isotopes): + # Take the two most abundant isotopes. + first_iso, second_iso = sorted([(i[0], i[1][1]) for i in mass_data[element_name].items() if i[0]], + key=lambda x: -x[1])[:2] + + # Write the number of isotopes of the most abundant type. + first_iso_str = _make_isotope_string(element_name, first_iso[0]) + isotopic_composition[first_iso_str] = int(math.ceil( + composition[element_name])) * first_iso[1] + + # Write the number of the second isotopes. 
+ second_iso_str = _make_isotope_string(element_name, second_iso[0]) + isotopic_composition[second_iso_str] = composition[element_name] - isotopic_composition[first_iso_str] + else: + isotopic_composition[element_name] = composition[element_name] + + return (isotopic_composition, + isotopic_composition_abundance(composition=isotopic_composition, mass_data=mass_data)) + + +def isotopic_composition_abundance(*args, **kwargs): + """Calculate the relative abundance of a given isotopic composition + of a molecule. + + Parameters + ---------- + formula : str, optional + A string with a chemical formula. + composition : Composition, optional + A Composition object with the isotopic composition of a substance. + mass_data : dict, optional + A dict with the masses of chemical elements (the default + value is :py:data:`nist_mass`). + + Returns + ------- + relative_abundance : float + The relative abundance of a given isotopic composition. + """ + + composition = (Composition(kwargs['composition']) + if 'composition' in kwargs + else Composition(*args, **kwargs)) + + isotopic_composition = defaultdict(dict) + + # Check if there are default and non-default isotopes of the same + # element and rearrange the elements. + for element in composition: + element_name, isotope_num = _parse_isotope_string(element) + + # If there is already an entry for this element and either it + # contains a default isotope or newly added isotope is default + # then raise an exception. + if (element_name in isotopic_composition) and (isotope_num == 0 or 0 in isotopic_composition[element_name]): + raise PyteomicsError( + 'Please specify the isotopic states of all atoms of %s or do not specify them at all.' % element_name) + else: + isotopic_composition[element_name][isotope_num] = composition[element] + + # Calculate relative abundance. + mass_data = kwargs.get('mass_data', nist_mass) + num1, num2, denom = 1, 1, 1 + for element_name, isotope_dict in isotopic_composition.items(): + num1 *= math.factorial(sum(isotope_dict.values())) + for isotope_num, isotope_content in isotope_dict.items(): + denom *= math.factorial(isotope_content) + if isotope_num: + num2 *= mass_data[element_name][isotope_num][1] ** isotope_content + + return num2 * (num1 / denom) + + +def isotopologues(*args, **kwargs): + """Iterate over possible isotopic states of a molecule. + The molecule can be defined by formula, sequence, parsed sequence, or composition. + The space of possible isotopic compositions is restrained by parameters + ``elements_with_isotopes``, ``isotope_threshold``, ``overall_threshold``. + + Parameters + ---------- + formula : str, optional + A string with a chemical formula. + sequence : str, optional + A polypeptide sequence string in modX notation. + parsed_sequence : list of str, optional + A polypeptide sequence parsed into a list of amino acids. + composition : :py:class:`Composition`, optional + A :py:class:`Composition` object with the elemental composition of a + substance. + report_abundance : bool, optional + If :py:const:`True`, the output will contain 2-tuples: `(composition, abundance)`. + Otherwise, only compositions are yielded. Default is :py:const:`False`. + elements_with_isotopes : container of str, optional + A set of elements to be considered in isotopic distribution + (by default, every element has an isotopic distribution). + isotope_threshold : float, optional + The threshold abundance of a specific isotope to be considered. + Default is :py:const:`5e-4`. 
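
A short sketch from the editor (the 1% cutoff here is arbitrary, chosen so that only two isotopologues of methane survive filtering):

    >>> from pyteomics import mass
    >>> isos = list(mass.isotopologues(formula='CH4', report_abundance=True,
    ...                                overall_threshold=1e-2))
    >>> len(isos)   # roughly 98.9% C[12]H4 and 1.1% C[13]H4
    2
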
+ overall_threshold : float, optional + The threshold abundance of the calculateed isotopic composition. + Default is :py:const:`0`. + aa_comp : dict, optional + A dict with the elemental composition of the amino acids (the + default value is :py:data:`std_aa_comp`). + mass_data : dict, optional + A dict with the masses of chemical elements (the default + value is :py:data:`nist_mass`). + + Returns + ------- + out : iterator + Iterator over possible isotopic compositions. + """ + iso_threshold = kwargs.pop('isotope_threshold', 5e-4) + overall_threshold = kwargs.pop('overall_threshold', 0.0) + mass_data = kwargs.get('mass_data', nist_mass) + elements_with_isotopes = kwargs.get('elements_with_isotopes') + report_abundance = kwargs.get('report_abundance', False) + composition = Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs) + other_kw = kwargs.copy() + for k in Composition._kw_sources: + other_kw.pop(k, None) + + dict_elem_isotopes = {} + for element in composition: + if elements_with_isotopes is None or element in elements_with_isotopes: + element_name, isotope_num = _parse_isotope_string(element) + isotopes = {k: v for k, v in mass_data[element_name].items() if k != 0 and v[1] >= iso_threshold} + list_isotopes = [_make_isotope_string(element_name, k) for k in isotopes] + dict_elem_isotopes[element] = list_isotopes + else: + dict_elem_isotopes[element] = [element] + all_isotoplogues = [] + for element, list_isotopes in dict_elem_isotopes.items(): + n = composition[element] + list_comb_element_n = [] + for elementXn in combinations_with_replacement(list_isotopes, n): + list_comb_element_n.append(elementXn) + all_isotoplogues.append(list_comb_element_n) + + for isotopologue in product(*all_isotoplogues): + ic = Composition(formula=''.join(atom for el in isotopologue for atom in el), **other_kw) + if report_abundance or overall_threshold > 0.0: + abundance = isotopic_composition_abundance(composition=ic, **other_kw) + if abundance > overall_threshold: + if report_abundance: + yield (ic, abundance) + else: + yield ic + else: + yield ic + + +std_aa_mass = { + 'G': 57.02146372057, + 'A': 71.03711378471, + 'S': 87.03202840427001, + 'P': 97.05276384885, + 'V': 99.06841391299, + 'T': 101.04767846841, + 'C': 103.00918478471, + 'L': 113.08406397713001, + 'I': 113.08406397713001, + 'J': 113.08406397713001, + 'N': 114.04292744114001, + 'D': 115.02694302383001, + 'Q': 128.05857750527997, + 'K': 128.09496301399997, + 'E': 129.04259308796998, + 'M': 131.04048491299, + 'H': 137.05891185845002, + 'F': 147.06841391298997, + 'U': 150.95363508471, + 'R': 156.10111102359997, + 'Y': 163.06332853254997, + 'W': 186.07931294985997, + 'O': 237.14772686284996} +"""A dictionary with monoisotopic masses of the twenty standard +amino acid residues, selenocysteine and pyrrolysine. +""" + + +def fast_mass(sequence, ion_type=None, charge=None, **kwargs): + """Calculate monoisotopic mass of an ion using the fast + algorithm. May be used only if amino acid residues are presented in + one-letter code. + + Parameters + ---------- + sequence : str + A polypeptide sequence string. + ion_type : str, optional + If specified, then the polypeptide is considered to be + in a form of corresponding ion. Do not forget to + specify the charge state! + charge : int, optional + If not 0 then m/z is calculated: the mass is increased + by the corresponding number of proton masses and divided + by z. 
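
Editor's example; for a plain one-letter sequence the fast path agrees with :py:func:`calculate_mass`:

    >>> from pyteomics import mass
    >>> round(mass.fast_mass('PEPTIDE'), 2)
    799.36
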
+ mass_data : dict, optional + A dict with the masses of chemical elements (the default + value is :py:data:`nist_mass`). + aa_mass : dict, optional + A dict with the monoisotopic mass of amino acid residues + (default is std_aa_mass); + ion_comp : dict, optional + A dict with the relative elemental compositions of peptide ion + fragments (default is :py:data:`std_ion_comp`). + + Returns + ------- + mass : float + Monoisotopic mass or m/z of a peptide molecule/ion. + """ + aa_mass = kwargs.get('aa_mass', std_aa_mass) + try: + mass = sum(aa_mass[i] for i in sequence) + except KeyError as e: + raise PyteomicsError('No mass data for residue: ' + e.args[0]) + + mass_data = kwargs.get('mass_data', nist_mass) + mass += mass_data['H'][0][0] * 2 + mass_data['O'][0][0] + + if ion_type: + try: + icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type] + except KeyError: + raise PyteomicsError('Unknown ion type: {}'.format(ion_type)) + + mass += sum(mass_data[element][0][0] * num for element, num in icomp.items()) + + if charge: + mass = (mass + mass_data['H+'][0][0] * charge) / charge + + return mass + + +def fast_mass2(sequence, ion_type=None, charge=None, **kwargs): + """Calculate monoisotopic mass of an ion using the fast + algorithm. *modX* notation is fully supported. + + Parameters + ---------- + sequence : str + A polypeptide sequence string. + ion_type : str, optional + If specified, then the polypeptide is considered to be + in a form of corresponding ion. Do not forget to + specify the charge state! + charge : int, optional + If not 0 then m/z is calculated: the mass is increased + by the corresponding number of proton masses and divided + by z. + mass_data : dict, optional + A dict with the masses of chemical elements (the default + value is :py:data:`nist_mass`). + aa_mass : dict, optional + A dict with the monoisotopic mass of amino acid residues + (default is std_aa_mass); + ion_comp : dict, optional + A dict with the relative elemental compositions of peptide ion + fragments (default is :py:data:`std_ion_comp`). + + Returns + ------- + mass : float + Monoisotopic mass or m/z of a peptide molecule/ion. + """ + aa_mass = kwargs.get('aa_mass', std_aa_mass) + mass_data = kwargs.get('mass_data', nist_mass) + try: + comp = parser.amino_acid_composition(sequence, + show_unmodified_termini=True, + allow_unknown_modifications=True, + labels=aa_mass) + except PyteomicsError: + raise PyteomicsError('Mass not specified for label(s): {}'.format( + ', '.join(set(parser.parse(sequence)).difference(aa_mass)))) + + try: + mass = 0 + for aa, num in comp.items(): + if aa in aa_mass: + mass += aa_mass[aa] * num + elif parser.is_term_mod(aa): + assert num == 1 + mass += calculate_mass(formula=aa.strip('-'), mass_data=mass_data) + else: + mod, X = parser._split_label(aa) + mass += (aa_mass[mod] + aa_mass[X]) * num + except KeyError as e: + raise PyteomicsError('Unspecified mass for modification: "{}"'.format(e.args[0])) + + if ion_type: + try: + icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type] + except KeyError: + raise PyteomicsError('Unknown ion type: {}'.format(ion_type)) + + mass += sum(mass_data[element][0][0] * num + for element, num in icomp.items()) + + if charge: + mass = (mass + mass_data['H+'][0][0] * charge) / charge + + return mass + + +class Unimod(): + """A class for Unimod database of modifications. + The list of all modifications can be retrieved via `mods` attribute. + Methods for convenient searching are `by_title` and `by_name`. 
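
An editor's sketch (instantiating :py:class:`Unimod` with no arguments downloads unimod.xml, hence the skip markers; 'Phospho' is a real Unimod title):

    >>> from pyteomics import mass
    >>> db = mass.Unimod()                    # doctest: +SKIP
    >>> db.by_title('Phospho')['mono_mass']   # doctest: +SKIP
    79.966331
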
+ For more elaborate filtering, iterate manually over the list. + + .. note:: + See :py:mod:`pyteomics.mass.unimod` for a new alternative class with + more features. + """ + + def __init__(self, source='http://www.unimod.org/xml/unimod.xml'): + """Create a database and fill it from XML file retrieved from `source`. + + Parameters + ---------- + + source : str or file, optional + A file-like object or a URL to read from. Don't forget the ``'file://'`` + prefix when pointing to local files. + """ + from lxml import etree + from ..xml import _local_name + + def process_mod(mod): + d = mod.attrib + new_d = {} + for key in ('date_time_modified', 'date_time_posted'): + new_d[key] = datetime.strptime(d.pop(key), '%Y-%m-%d %H:%M:%S') + comp = Composition() + for delta in self._xpath('delta', mod): # executed 1 time + for key in ('avge_mass', 'mono_mass'): + new_d[key] = float(delta.attrib.pop(key)) + for elem in self._xpath('element', delta): + e_d = elem.attrib + amount = int(e_d.pop('number')) + label = e_d.pop('symbol') + isotope, symbol = re.match(r'^(\d*)(\D+)$', label).groups() + if not isotope: + isotope = 0 + else: + isotope = int(isotope) + comp += Composition(formula=_make_isotope_string(symbol, isotope), mass_data=self._massdata) * amount + new_d['composition'] = comp + new_d['record_id'] = int(d.pop('record_id')) + new_d['approved'] = d.pop('approved') == '1' + new_d.update(d) + spec = [] + for sp in self._xpath('specificity', mod): + sp_d = sp.attrib + sp_new_d = {} + sp_new_d['hidden'] = (sp_d.pop('hidden') == '1') + sp_new_d['spec_group'] = int(sp_d.pop('spec_group')) + sp_new_d.update(sp_d) + notes = [] + for note in self._xpath('*', sp): + if note.text and note.text.strip(): + notes.append(note.text.strip()) + if notes: + sp_new_d['note'] = '\n'.join(notes) + spec.append(sp_new_d) + new_d['specificity'] = spec + + alt_names = [] + for alt_name in self._xpath('alt_name', mod): + alt_names.append(alt_name.text) + if alt_names: + new_d['alt_names'] = alt_names + + refs = [] + for ref in self._xpath('xref', mod): + ref_d = {} + for sub in ref.iterchildren(): + ref_d[_local_name(sub)] = sub.text + for key in ('text', 'source', 'url'): + if key not in ref_d: + ref_d[key] = None + refs.append(ref_d) + new_d['refs'] = refs + return new_d + + if isinstance(source, str): + self._tree = etree.parse(urlopen(source)) + else: + self._tree = etree.parse(source) + self._massdata = self._mass_data() + self._mods = [] + self._id = {} + for i, mod in enumerate(self._xpath('/unimod/modifications/mod')): + mod_dict = process_mod(mod) + self._mods.append(mod_dict) + self._id[mod_dict['record_id']] = i + + def _xpath(self, path, element=None): + from ..xml import xpath + if element is None: + return xpath(self._tree, path, 'umod') + return xpath(element, path, 'umod') + + def _mass_data(self): + massdata = defaultdict(dict) + elements = [x.attrib for x in self._xpath('/unimod/elements/elem')] + avg = {} + for elem in elements: + i, label = re.match(r'^(\d*)(\D+)$', elem['title']).groups() + if not i: + iso = 0 + else: + iso = int(i) + massdata[label][iso] = (float(elem['mono_mass']), float(iso == 0)) + if not iso: + avg[label] = float(elem['avge_mass']) + for elem, isotopes in massdata.items(): + isotopes[int(round(isotopes[0][0]))] = isotopes[0] + if len(isotopes) == 3: + m1, m2 = (x[1][0] for x in sorted(isotopes.items())[1:]) + m_avg = avg[elem] + a = (m2 - m_avg) / (m2 - m1) + b = (m_avg - m1) / (m2 - m1) + for state, abundance in zip(sorted(isotopes)[1:], (a, b)): + isotopes[state] = 
(isotopes[state][0], abundance) + return massdata + + @property + def mods(self): + """Get the list of Unimod modifications""" + return self._mods + + @property + def mass_data(self): + """Get element mass data extracted from the database""" + return self._massdata + + def by_title(self, title, strict=True): + """Search modifications by title. If a single modification is found, + it is returned. Otherwise, a list will be returned. + + Parameters + ---------- + title : str + The modification title. + strict : bool, optional + If :py:const:`False`, the search will return all modifications + whose title **contains** `title`, otherwise equality is required. + :py:const:`True` by default. + + Returns + ------- + out : dict or list + A single modification or a list of modifications. + """ + f = {True: operator.eq, False: operator.contains} + func = f[strict] + result = [m for m in self._mods if func(m['title'], title)] + if len(result) == 1: + return result[0] + return result + + def by_name(self, name, strict=True): + """Search modifications by name. If a single modification is found, + it is returned. Otherwise, a list will be returned. + + Parameters + ---------- + name : str + The full name of the modification(s). + strict : bool, optional + If :py:const:`False`, the search will return all modifications + whose full name **contains** `title`, otherwise equality is + required. :py:const:`True` by default. + + Returns + ------- + out : dict or list + A single modification or a list of modifications. + """ + f = {True: operator.eq, False: operator.contains} + func = f[strict] + result = [m for m in self._mods if func(m['full_name'], name)] + if len(result) == 1: + return result[0] + return result + + def by_id(self, i): + """Search modifications by record ID. If a modification is found, + it is returned. Otherwise, :py:exc:`KeyError` is raised. + + Parameters + ---------- + i : int or str + The Unimod record ID. + + Returns + ------- + out : dict + A single modification dict. + """ + if isinstance(i, str): + i = int(i) + return self._mods[self._id[i]] + + __getitem__ = by_id + + +def neutral_mass(mz, z, charge_carrier=_nist_mass[PROTON][0][0]): + return (mz * abs(z)) - (z * charge_carrier) + + +def mass_charge_ratio(neutral_mass, z, charge_carrier=_nist_mass[PROTON][0][0]): + return (neutral_mass + (z * charge_carrier)) / abs(z) diff --git a/pyteomics/mass/unimod.py b/pyteomics/mass/unimod.py new file mode 100644 index 0000000..471d00e --- /dev/null +++ b/pyteomics/mass/unimod.py @@ -0,0 +1,798 @@ +""" +unimod - interface to the Unimod database +========================================= + +This module provides an interface to the relational Unimod database. +The main class is :py:class:`Unimod`. + +Dependencies +------------ + +This module requires :py:mod:`lxml` and :py:mod:`sqlalchemy`. +""" + +# Copyright 2015 Joshua Klein, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
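
# Editor's note: the two helper functions defined just above, at the end of mass.py
# (neutral_mass and mass_charge_ratio), invert each other; a quick round-trip check
# (799.36 is an arbitrary neutral mass):
#
#     >>> from pyteomics.mass import mass_charge_ratio, neutral_mass
#     >>> mz = mass_charge_ratio(799.36, 2)   # (M + 2 * proton mass) / 2
#     >>> round(neutral_mass(mz, 2), 2)
#     799.36
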
+ +import re + +from lxml import etree +from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta +from sqlalchemy.orm import relationship, backref, object_session +from sqlalchemy.ext.associationproxy import association_proxy +from sqlalchemy import (Numeric, Unicode, + Column, Integer, ForeignKey, + UnicodeText, Boolean, event) +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from . import mass + +model_registry = set() + + +class SubclassRegisteringDeclarativeMeta(DeclarativeMeta): + def __new__(cls, name, parents, attrs): + new_type = super(SubclassRegisteringDeclarativeMeta, + cls).__new__(cls, name, parents, attrs) + model_registry.add(new_type) + return new_type + + +Base = declarative_base(metaclass=SubclassRegisteringDeclarativeMeta) + +_unimod_xml_download_url = 'http://www.unimod.org/xml/unimod_tables.xml' + +try: + basestring +except: + basestring = (str, bytes) + + +CompositionType = mass.Composition + + +def simple_repr(self): # pragma: no cover + template = '{self.__class__.__name__}({d})' + d = {'%s=%r' % (k, v) for k, v in self.__dict__.items() if not k.startswith('_')} + return template.format(self=self, d=', '.join(d)) + +Base.__repr__ = simple_repr + + +def remove_namespace(doc, namespace): + """Remove namespace in the passed document in place.""" + ns = u'{%s}' % namespace + nsl = len(ns) + for elem in doc.getiterator(): + if elem.tag.startswith(ns): + elem.tag = elem.tag[nsl:] + + +def preprocess_xml(doc_path): + """ + Parse and drop namespaces from an XML document. + + Parameters + ---------- + doc_path : str + + Returns + ------- + out : etree.ElementTree + """ + tree = etree.parse(doc_path) + root = tree.getroot() + for ns in root.nsmap.values(): + remove_namespace(tree, ns) + return tree + + +def _formula_parser(formula, session): + """ + Parse a unimod formula composed of elements, + isotopes, and other bricks. + + In order to look up a Brick's composition, this + function must have access to a session. + + Parameters + ---------- + formula : str + A Unimod formula of the form `A(n) B(m)...` + where A, B, ... are element names or bricks and + (n), (m)... are parenthesized possibly signed integers or + omitted in which case they are interpreted as 1 + session : Session + An active SQLAlchemy session for looking up bricks in the database + + Returns + ------- + out : CompositionType + """ + composition = CompositionType() + for token in formula.split(' '): + match = re.search(r'(?P<isotope>\d+)?(?P<elemet>[^\(]+)(?:\((?P<count>-?\d+)\))?', token) + if match: + isotope, element, count = match.groups() + if count is not None: + count = int(count) + else: + count = 1 + if isotope is not None: + name = mass._make_isotope_string(element, isotope) + else: + name = element + is_brick = session.query(Brick).filter(Brick.brick == name).first() + if is_brick is None: + composition[name] += count + else: + composition += is_brick.composition * count + return composition + + +def _composition_listener(attr): + """ + Attach event listeners to an InstrumentedAttribute + to trigger formula parsing on load and on change. + """ + @event.listens_for(attr, 'set') + def _update_composition_from_formula(target, value, oldvalue, initiator): + session = object_session(target) + if value == '' or value is None: + return + # If the object hasn't been associated with a session, + # we can't look up bricks. 
+ if session is None: + return + target.composition = _formula_parser(value, session) + + @event.listens_for(attr.class_, 'load') + def _update_composition_on_load(target, context): + value = getattr(target, attr.prop.key) + if value == '' or value is None: + return + session = object_session(target) + target.composition = _formula_parser(value, session) + + +def has_composition(attr_name): + """ + A decorator to simplify flagging a Model with a column + to be treated as a formula for parsing. Calls :func:`_composition_listener` + internally. + """ + def decorator(model): + _composition_listener(getattr(model, attr_name)) + return model + return decorator + + +class HasFullNameMixin(object): + """ + A simple mixin to standardize equality operators + for models with a :attr:`full_name` attribute. + """ + def __eq__(self, other): + try: + return self.full_name == other.full_name + except AttributeError: + return False + + def __ne__(self, other): + return not self == other + + +class AlternativeName(Base): + __tablename__ = 'AlternativeName' + + _tag_name = 'alt_names_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + alt_name=attrib['alt_name'], + modification_id=int(attrib['mod_key']) + ) + return inst + + id = Column(Integer, primary_key=True) + alt_name = Column(Unicode(256), index=True) + modification_id = Column(Integer, ForeignKey('Modification.id'), index=True) + + +class AminoAcid(Base, HasFullNameMixin): + __tablename__ = 'AminoAcid' + + _tag_name = 'amino_acids_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + full_name=attrib['full_name'], + one_letter=attrib['one_letter'], + three_letter=attrib['three_letter'], + num_H=int(attrib['num_H']), + num_O=int(attrib['num_O']), + num_C=int(attrib['num_C']), + num_N=int(attrib['num_N']), + num_S=int(attrib['num_S']), + ) + return inst + + id = Column(Integer, primary_key=True) + num_H = Column(Integer) + num_O = Column(Integer) + num_C = Column(Integer) + num_N = Column(Integer) + num_S = Column(Integer) + full_name = Column(Unicode(25), index=True) + one_letter = Column(Unicode(10), index=True) + three_letter = Column(Unicode(10), index=True) + + +class Classification(Base): + __tablename__ = 'Classification' + + _tag_name = 'classifications_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + classification=attrib['classification'] + ) + return inst + + id = Column(Integer, primary_key=True) + classification = Column(Unicode(30), index=True) + + +class Position(Base): + __tablename__ = 'Position' + + _tag_name = 'positions_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + position=attrib['position'] + ) + return inst + + id = Column(Integer, primary_key=True) + position = Column(Unicode(20), index=True) + + +class Brick(Base, HasFullNameMixin): + __tablename__ = 'Brick' + + _tag_name = 'bricks_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + brick=attrib['brick'], + full_name=attrib['full_name'] + ) + return inst + + id = Column(Integer, primary_key=True) + brick = Column(Unicode(64), index=True) + full_name = Column(Unicode(128), index=True) + + elements = relationship('BrickToElement') + + @property + def composition(self): + composition = CompositionType() + for element_relation in self.elements: + symbol = 
element_relation.element + isotope, element = re.search(r'(?P<isotope>\d+)?(?P<element>\S+)', symbol).groups() + if isotope: + isotope = int(isotope) + iso_str = mass._make_isotope_string(element, isotope) + else: + iso_str = element + count = element_relation.count + composition[iso_str] = count + return composition + + +class Fragment(Base): + __tablename__ = 'Fragment' + + _tag_name = 'fragments_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + modification_id=int(attrib['mod_key']) + ) + return inst + + id = Column(Integer, primary_key=True) + modification_id = Column(Integer, ForeignKey('Modification.id'), index=True) + + _fragment_composition = relationship('FragmentComposition') + + @property + def composition(self): + composition = CompositionType() + session = object_session(self) + for fragment_composition_relation in self._fragment_composition: + symbol = fragment_composition_relation.brick_string + isotope, element = re.search(r'(?P<isotope>\d+)?(?P<element>\S+)', symbol).groups() + count = fragment_composition_relation.count + if count is not None: + count = int(count) + else: + count = 1 + if isotope: + name = mass._make_isotope_string(element, isotope) + else: + name = element + is_brick = session.query(Brick).filter(Brick.brick == name).first() + if is_brick is None: + composition[name] += count + else: + composition += is_brick.composition * count + return composition + + +class FragmentComposition(Base): + __tablename__ = 'FragmentComposition' + + _tag_name = 'fragment_comp_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + brick_string=attrib['brick'], + fragment_id=int(attrib['fragments_key']), + count=int(attrib['num_brick']) + ) + return inst + + id = Column(Integer, primary_key=True) + brick_string = Column(Unicode(64), ForeignKey(Brick.brick), index=True) + fragment_id = Column(Integer, ForeignKey('Fragment.id'), index=True) + count = Column(Integer) + + +class ModificationToBrick(Base): + __tablename__ = 'ModificationToBrick' + + _tag_name = 'mod2brick_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + brick_string=(attrib['brick']), + modification_id=int(attrib['mod_key']), + count=int(attrib['num_brick']) + ) + return inst + + id = Column(Integer, primary_key=True) + brick_string = Column(Unicode(64), ForeignKey(Brick.brick), index=True) + modification_id = Column(Integer, ForeignKey('Modification.id'), index=True) + count = Column(Integer) + + +class BrickToElement(Base): + __tablename__ = 'BrickToElement' + + _tag_name = 'brick2element_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + brick_id=int(attrib['brick_key']), + count=int(attrib['num_element']), + element=attrib['element'] + ) + return inst + + id = Column(Integer, primary_key=True) + brick_id = Column(Integer, ForeignKey(Brick.id), index=True) + element = Column(Unicode(16), ForeignKey('Element.element'), index=True) + element_obj = relationship('Element', uselist=False) + count = Column(Integer) + + +class Element(Base, HasFullNameMixin): + __tablename__ = 'Element' + + _tag_name = 'elements_row' + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + average_mass=float(attrib['avge_mass']), + monoisotopic_mass=float(attrib['mono_mass']), + full_name=attrib['full_name'], + 
element=attrib['element'] + + ) + return inst + + id = Column(Integer, primary_key=True) + average_mass = Column(Numeric(12, 6, asdecimal=False)) + monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False)) + full_name = Column(Unicode(64), index=True) + element = Column(Unicode(16), index=True) + + +@has_composition('_composition') +class Modification(Base, HasFullNameMixin): + __tablename__ = 'Modification' + + _tag_name = 'modifications_row' + + id = Column(Integer, primary_key=True) + username_of_poster = Column(Unicode(128)) + average_mass = Column(Numeric(12, 6, asdecimal=False), index=True) + ex_code_name = Column(Unicode(64), index=True) + monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False), index=True) + full_name = Column(Unicode(128), index=True) + code_name = Column(Unicode(128), index=True) + _composition = Column(Unicode(128), index=True) + approved = Column(Boolean, index=True) + + notes = relationship('MiscNotesModifications') + specificities = relationship('Specificity') + bricks = relationship(ModificationToBrick) + _fragments = relationship(Fragment) + + _alt_names = relationship(AlternativeName, backref=backref('modification')) + # Maps the list of AlternativeName instances loaded dynamically from _alt_names + # into a list of plain strings, since the AlternativeName type contains no + # additional information. + alternative_names = association_proxy('_alt_names', 'alt_name') + fragments = association_proxy('_fragments', 'composition') + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + username_of_poster=attrib['username_of_poster'], + average_mass=float(attrib['avge_mass']), + monoisotopic_mass=float(attrib['mono_mass']), + ex_code_name=attrib['ex_code_name'], + code_name=attrib['code_name'], + full_name=attrib['full_name'], + approved=bool(int(attrib['approved'])), + _composition=attrib['composition'] + ) + for note in tag: + if note.tag == MiscNotesModifications._tag_name: + model_note = MiscNotesModifications._from_tag(note, inst.id) + if model_note is not None: + inst.notes.append(model_note) + return inst + + +class MiscNotesModifications(Base): + __tablename__ = 'MiscNotesModifications' + _tag_name = 'misc_notes' + + id = Column(Integer, primary_key=True) + modification_id = Column(Integer, ForeignKey(Modification.id), index=True) + text = Column(UnicodeText) + + @classmethod + def _from_tag(cls, tag, modification_id): + if tag.text is None: + return + return cls(text=tag.text, modification_id=modification_id) + + +class Specificity(Base): + __tablename__ = 'Specificity' + + _tag_name = 'specificity_row' + + id = Column(Integer, primary_key=True) + position_id = Column(Integer, ForeignKey(Position.id), index=True) + classification_id = Column(Integer, ForeignKey(Classification.id), index=True) + classification = relationship('Classification', uselist=False) + # Map through one_letter + amino_acid = Column(Unicode(10), ForeignKey(AminoAcid.one_letter), index=True) + modification_id = Column(Integer, ForeignKey(Modification.id), index=True) + hidden = Column(Boolean, index=True) + group = Column(Integer, index=True) + neutral_losses = relationship('SpecificityToNeutralLoss') + + @classmethod + def from_tag(cls, tag): + attrib = tag.attrib + inst = cls( + id=int(attrib['record_id']), + position_id=int(attrib['position_key']), + classification_id=int(attrib['classifications_key']), + hidden=bool(int(attrib['hidden'])), + amino_acid=attrib['one_letter'], + modification_id=int(attrib['mod_key']), + ) 
+        return inst
+
+
+class NeutralLoss(Base):
+    __tablename__ = 'NeutralLoss'
+
+    _tag_name = 'neutral_losses_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            brick_string=(attrib['brick']),
+            count=int(attrib['num_brick']),
+            specificity_id=int(attrib['spec_key'])
+        )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    brick_string = Column(Unicode(64), index=True)
+    specificity_id = Column(Integer, ForeignKey(Specificity.id), index=True)
+    count = Column(Integer)
+
+
+@has_composition('_composition')
+class SpecificityToNeutralLoss(Base):
+    __tablename__ = 'SpecificityToNeutralLoss'
+
+    _tag_name = 'spec2nl_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            specificity_id=int(attrib['spec_key']),
+            monoisotopic_mass=float(attrib['nl_mono_mass']),
+            average_mass=float(attrib['nl_avge_mass']),
+            is_required_peptide_neutral_loss=bool(int(attrib['is_req_pep_nl'])),
+            is_peptide_neutral_loss=bool(int(attrib['is_pep_nl'])),
+            is_slave=bool(int(attrib['is_slave_nl'])),
+            _composition=attrib['nl_composition']
+        )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    specificity_id = Column(Integer, ForeignKey(Specificity.id), index=True)
+    specificity = relationship(Specificity, uselist=False)
+    monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False), index=True)
+    average_mass = Column(Numeric(12, 6, asdecimal=False), index=True)
+    _composition = Column(Unicode(128))
+    is_slave = Column(Boolean, index=True)
+    is_peptide_neutral_loss = Column(Boolean, index=True)
+    is_required_peptide_neutral_loss = Column(Boolean, index=True)
+
+
+class CrossreferenceSource(Base):
+    __tablename__ = 'CrossreferenceSource'
+    _tag_name = 'xref_sources_row'
+
+    id = Column(Integer, primary_key=True)
+    source = Column(Unicode(64), index=True)
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls()
+        inst.id = int(attrib['record_id'])
+        inst.source = attrib['xref_source']
+        return inst
+
+
+class Crossreference(Base):
+    __tablename__ = 'Crossreference'
+
+    _tag_name = 'xrefs_row'
+
+    id = Column(Integer, primary_key=True)
+    source_id = Column(Integer, ForeignKey(CrossreferenceSource.id), index=True)
+    source = relationship(CrossreferenceSource, uselist=False)
+    url = Column(Unicode(128))
+    modification_id = Column(Integer, ForeignKey(Modification.id), index=True)
+    text = Column(UnicodeText)
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls()
+        inst.id = int(attrib['record_id'])
+        inst.url = attrib['xref_url']
+        inst.source_id = int(attrib['xref_source_key'])
+        inst.modification_id = int(attrib['mod_key'])
+        text = []
+        # iterate over the element directly: Element.getchildren() is
+        # deprecated and was removed in Python 3.9
+        for node in tag:
+            if node.tag == 'xref_text':
+                if node.text is not None:
+                    text.append(node.text)
+        inst.text = '\n'.join(text)
+        return inst
+
+
+def load(doc_path, output_path='sqlite://'):
+    """
+    Parse the relational table-like XML file provided by http://www.unimod.org/downloads.html
+    and convert each <tag>_row into an equivalent database entry.
+
+    By default the table will be held in memory.
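+
+    A minimal usage sketch (the file paths here are hypothetical):
+
+    >>> session = load('unimod_tables.xml', 'sqlite:///unimod.db')
+    >>> session.query(Modification).count()  # number of modifications loaded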
+    """
+    tree = preprocess_xml(doc_path)
+    engine = create_engine(output_path)
+    Base.metadata.create_all(engine)
+    session = sessionmaker(bind=engine, autoflush=False)()
+    for model in model_registry:
+        if hasattr(model, '_tag_name') and hasattr(model, 'from_tag'):
+            for tag in tree.iterfind('.//' + model._tag_name):
+                session.add(model.from_tag(tag))
+    session.commit()
+    return session
+
+
+def session(path='sqlite:///unimod.db'):
+    engine = create_engine(path)
+    Base.metadata.create_all(engine)
+    session = sessionmaker(bind=engine, autoflush=False)()
+    return session
+
+
+class Unimod(object):
+    """
+    Main class representing the relational Unimod database.
+
+    Examples
+    --------
+
+    If you just wish to get a new copy of the data and store it in a temporary
+    in-memory database, invoking the type without parameters works without issue.
+
+    >>> new_db = Unimod()
+
+    If you want to persist a snapshot of the Unimod database to disk and query it
+    from there, or to re-use a previously downloaded database copy, pass a
+    database-driver-prefixed path:
+
+    >>> reused_db = Unimod("sqlite:///path/to/unimod.db")
+
+    If the path did not previously exist, a new copy of Unimod will be downloaded
+    and stored there on the first use, and will be immediately available on
+    subsequent uses.
+    """
+    def __init__(self, path=None):
+        """
+        Initialize the object from a database file.
+
+        Parameters
+        ----------
+        path : str or None, optional
+            If :py:class:`str`, should point to a database.
+            Use a dialect-specific prefix, like ``'sqlite://'``.
+            If :py:const:`None` (default), a relational
+            XML file will be downloaded from the default location.
+        """
+        if path is None:
+            self.path = None
+            self.session = load(_unimod_xml_download_url)
+        else:
+            self.path = path
+            try:
+                self.session = session(path)
+                if self.session.query(Modification).first() is None:
+                    raise Exception()
+            except Exception:
+                # The database may not yet exist at that location
+                self.session = load(_unimod_xml_download_url, path)
+                self.session.query(Modification).first()
+
+    def get(self, identifier, strict=True):
+        """
+        Get a modification matching `identifier`.
+        Replaces both :py:meth:`by_name` and :py:meth:`by_title` methods
+        of the old class.
+
+        Parameters
+        ----------
+        identifier : str
+
+        strict : bool, optional
+            Defaults to :py:const:`True`.
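+            If :py:const:`True`, `identifier` must exactly match
+            :attr:`full_name`, :attr:`code_name`, :attr:`ex_code_name` or one of
+            the alternative names. If :py:const:`False`, a substring
+            (SQL ``LIKE``) match against the same fields is performed instead.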
+
+        Returns
+        -------
+        out : Modification
+        """
+        if isinstance(identifier, int):
+            mod = self.session.query(Modification).get(identifier)
+            if mod is None:
+                raise KeyError(identifier)
+            return mod
+        elif isinstance(identifier, basestring):
+            if strict:
+                mod = self.session.query(Modification).filter(
+                    (Modification.full_name == identifier) |
+                    (Modification.code_name == identifier) |
+                    (Modification.ex_code_name == identifier)).first()
+                if mod is None:
+                    alt_name = self.session.query(AlternativeName).filter(
+                        AlternativeName.alt_name == identifier).first()
+                    if alt_name is None:
+                        raise KeyError(identifier)
+                    mod = alt_name.modification
+                return mod
+            else:
+                qname = '%%%s%%' % identifier
+                mod = self.session.query(Modification).filter(
+                    (Modification.full_name.like(qname)) |
+                    (Modification.code_name.like(qname)) |
+                    (Modification.ex_code_name.like(qname))).first()
+                if mod is None:
+                    alt_name = self.session.query(AlternativeName).filter(
+                        AlternativeName.alt_name.like(qname)).first()
+                    if alt_name is None:
+                        raise KeyError(identifier)
+                    mod = alt_name.modification
+                return mod
+
+    by_title = by_name = get
+
+    __getitem__ = get
+
+    @property
+    def mods(self):
+        return self.session.query(Modification).all()
+
+    def __iter__(self):
+        return iter(self.session.query(Modification).yield_per(1000))
+
+    def query(self, *args):
+        '''Compose an SQL query using SQLAlchemy's ORM interface.
+
+        See :mod:`sqlalchemy`'s Session documentation for more details.
+        '''
+        return self.session.query(*args)
+
+    def execute(self, *args, **kwargs):
+        '''Execute an SQLAlchemy statement or a SQL string against the database,
+        returning the resulting database cursor.
+
+        See :mod:`sqlalchemy`'s Session documentation for more details.
+        '''
+        return self.session.execute(*args, **kwargs)
diff --git a/pyteomics/mgf.py b/pyteomics/mgf.py
new file mode 100644
index 0000000..811e43e
--- /dev/null
+++ b/pyteomics/mgf.py
@@ -0,0 +1,830 @@
+"""
+mgf - read and write MS/MS data in Mascot Generic Format
+========================================================
+
+Summary
+-------
+
+`MGF <http://www.matrixscience.com/help/data_file_help.html>`_ is a simple
+human-readable format for MS/MS data. It allows storing MS/MS peak lists and
+experimental parameters.
+
+This module provides classes and functions for access to data stored in
+MGF files.
+Parsing is done using :py:class:`MGF` and :py:class:`IndexedMGF` classes.
+The :py:func:`read` function can be used as an entry point.
+MGF spectra are converted to dictionaries. MS/MS data points are
+(optionally) represented as :py:mod:`numpy` arrays.
+Also, common parameters can be read from the MGF file header with
+the :py:func:`read_header` function.
+:py:func:`write` allows creation of MGF files.
+
+Classes
+-------
+
+  :py:class:`MGF` - a text-mode MGF parser. Suitable to read spectra from a file consecutively.
+  Needs a file opened in text mode (or will open it if given a file name).
+
+  :py:class:`IndexedMGF` - a binary-mode MGF parser. When created, builds a byte offset index
+  for fast random access by spectrum titles. Sequential iteration is also supported.
+  Needs a seekable file opened in binary mode (if created from existing file object).
+
+  :py:class:`MGFBase` - abstract class, the common ancestor of the two classes above.
+  Can be used for type checking.
+
+Functions
+---------
+
+  :py:func:`read` - an alias for :py:class:`MGF` or :py:class:`IndexedMGF`.
+
+  :py:func:`get_spectrum` - read a single spectrum with given title from a file.
+ + :py:func:`chain` - read multiple files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + + :py:func:`read_header` - get a dict with common parameters for all spectra + from the beginning of MGF file. + + :py:func:`write` - write an MGF file. + +------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import numpy as np +except ImportError: + np = None +import itertools as it +import sys +import warnings +from . import auxiliary as aux + + +class MGFBase(aux.MaskedArrayConversionMixin): + """Abstract mixin class representing an MGF file. Subclasses implement different approaches to parsing.""" + _comments = set('#;!/') + _array_keys = ['m/z array', 'intensity array', 'charge array', 'ion array'] + _array_keys_unicode = [u'm/z array', u'intensity array', u'charge array', u'ion array'] + encoding = None + + def __init__(self, source=None, **kwargs): + """Create an MGF file object, set MGF-specific parameters. + + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MGF format. Default is + :py:const:`None`, which means read standard input. + + use_header : bool, optional, keyword only + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`True`. + + convert_arrays : one of {0, 1, 2}, optional, keyword only + If `0`, m/z, intensities and (possibly) charges or (possibly) ions will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + read_charges : bool, optional, keyword only + If `True` (default), fragment charges are reported. Disabling it improves performance. + + read_ions : bool, optional + If `True` (default: False), fragment ions are reported. Disabling it improves performance. + Note that right now, only one of (read_charges, read_ions) may be True. + + dtype : type or str or dict, optional, keyword only + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. + + encoding : str, optional, keyword only + File encoding. 
+ """ + + super(MGFBase, self).__init__(source, **kwargs) + self._use_header = kwargs.pop('use_header', True) + self._read_charges = kwargs.pop('read_charges', True) + self._read_ions = kwargs.pop('read_ions', False) + # Make sure no charges are read if ions are read + if self._read_ions: + self._read_charges = False + if self._use_header: + self._read_header() + else: + self._header = None + + def __reduce_ex__(self, protocol): + return (self.__class__, (self._source_init,), self.__getstate__()) + + def __getstate__(self): + state = super(MGFBase, self).__getstate__() + state['use_header'] = self._use_header + state['header'] = self._header + return state + + def __setstate__(self, state): + super(MGFBase, self).__setstate__(state) + self._header = state['header'] + self._use_header = state['use_header'] + + @staticmethod + def parse_precursor_charge(charge_text, list_only=False): + return aux._parse_charge(charge_text, list_only=list_only) + + @staticmethod + def parse_peak_charge(charge_text, list_only=False): + return aux._parse_charge(charge_text, list_only=False) + + @staticmethod + def parse_peak_ion(ion_text): + return aux._parse_ion(ion_text) + + @property + def header(self): + if self._header is None: + self._read_header() + return self._header + + def _read_header_lines(self, header_lines): + header = {} + for line in header_lines: + if line.strip() == 'BEGIN IONS': + break + l = line.split('=') + if len(l) == 2: + key = l[0].lower() + val = l[1].strip() + header[key] = val + if 'charge' in header: + header['charge'] = self.parse_precursor_charge(header['charge'], True) + self._header = header + + def _read_spectrum_lines(self, lines): + """Read a single spectrum from ``self._source``. + + Returns + ------- + out : dict + """ + + masses = [] + intensities = [] + charges = [] + ions = [] + + params = self.header.copy() if self._use_header else {} + + for i, line in enumerate(lines): + sline = line.strip() + if sline == 'BEGIN IONS': + if i == 0: + continue + else: + raise aux.PyteomicsError('Error when parsing MGF: unexpected start of spectrum.') + if not sline or sline[0] in self._comments: + pass + elif sline == 'END IONS': + if 'pepmass' in params: + try: + pepmass = tuple(map(float, params['pepmass'].split())) + except ValueError: + raise aux.PyteomicsError('MGF format error: cannot parse ' + 'PEPMASS = {}'.format(params['pepmass'])) + else: + params['pepmass'] = pepmass + (None,) * (2-len(pepmass)) + if isinstance(params.get('charge'), aux.basestring): + params['charge'] = self.parse_precursor_charge(params['charge'], True) + if 'rtinseconds' in params: + params['rtinseconds'] = aux.unitfloat(params['rtinseconds'], 'second') + out = {'params': params, 'm/z array': masses, 'intensity array': intensities} + if self._read_charges: + out['charge array'] = charges + if self._read_ions: + out['ion array'] = ions + self._build_all_arrays(out) + if self.encoding and sys.version_info.major == 2: + for key, ukey in zip(self._array_keys + ['params'], self._array_keys_unicode + [u'params']): + if key in out: + out[ukey] = out.pop(key) + return out + + else: + if '=' in sline: # spectrum-specific parameters! 
+                    l = sline.split('=', 1)
+                    params[l[0].lower()] = l[1].strip()
+                else:  # this must be a peak list
+                    l = sline.split()
+                    try:
+                        masses.append(float(l[0]))
+                        intensities.append(float(l[1]))
+                        if self._read_charges:
+                            charges.append(self.parse_peak_charge(l[2]) if len(l) > 2 else 0)
+                        if self._read_ions:
+                            ions.append(self.parse_peak_ion(l[2]) if len(l) > 2 else "")
+                    except ValueError:
+                        raise aux.PyteomicsError(
+                            'Error when parsing %s. Line:\n%s' % (getattr(self._source, 'name', 'MGF file'), line))
+                    except IndexError:
+                        pass
+
+    def get_spectrum(self, title):
+        raise NotImplementedError()
+
+    @staticmethod
+    def _get_time(spectrum):
+        try:
+            return spectrum['params']['rtinseconds']
+        except KeyError:
+            raise aux.PyteomicsError('RT information not found.')
+
+
+class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixin, aux.IndexSavingTextReader):
+    """
+    A class representing an MGF file. Supports the `with` syntax and direct iteration for sequential
+    parsing. Specific spectra can be accessed by title using the indexing syntax in constant time.
+    If created using a file object, it needs to be opened in binary mode.
+
+    When iterated, :py:class:`IndexedMGF` object yields spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
+    'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    'ion array' is an array of ion labels (str)
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MGF, lowercased).
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+    time : RTLocator
+        A property used for accessing spectra by retention time.
+    """
+    delimiter = 'BEGIN IONS'
+
+    def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
+                 dtype=None, encoding='utf-8', index_by_scans=False, read_ions=False, _skip_index=False, **kwargs):
+        """
+        Create an :py:class:`IndexedMGF` (binary-mode) reader for a given MGF file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MGF format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in binary mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`True`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+
+        read_ions : bool, optional
+            If `True` (default: False), fragment ion types are reported. Disabling it improves performance.
+            Note that right now, only one of (read_charges, read_ions) may be True.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
+
+        encoding : str, optional
+            File encoding.
+
+        block_size : int, optional
+            Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+
+        Returns
+        -------
+
+        out : IndexedMGF
+            The reader object.
+        """
+        self._index_by_scans = index_by_scans
+        self._read_ions = read_ions
+        self.label = r'SCANS=(\d+)\s*' if index_by_scans else r'TITLE=([^\n]*\S)\s*'
+        super(IndexedMGF, self).__init__(source, parser_func=self._read, pass_file=False, args=(), kwargs={},
+                                         use_header=use_header, convert_arrays=convert_arrays,
+                                         read_charges=read_charges,
+                                         dtype=dtype, encoding=encoding, read_ions=read_ions, _skip_index=_skip_index,
+                                         **kwargs)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+                (self._source_init, False, self._convert_arrays, self._read_charges,
+                 None, self.encoding, self._index_by_scans, self._read_ions, True),
+                self.__getstate__())
+
+    @aux._keepstate_method
+    def _read_header(self):
+        try:
+            first = next(v for v in self._offset_index.values())[0]
+        except StopIteration:  # the index is empty, no spectra in file
+            first = -1
+        header_lines = self.read(first).decode(self.encoding).split('\n')
+        return self._read_header_lines(header_lines)
+
+    def _item_from_offsets(self, offsets):
+        start, end = offsets
+        lines = self._read_lines_from_offsets(start, end)
+        return self._read_spectrum_lines(lines)
+
+    def _read(self, **kwargs):
+        for _, offsets in self._offset_index.items():
+            spectrum = self._item_from_offsets(offsets)
+            yield spectrum
+
+    def get_spectrum(self, key):
+        return self.get_by_id(key)
+
+
+class MGF(MGFBase, aux.FileReader):
+    """
+    A class representing an MGF file. Supports the `with` syntax and direct iteration for sequential
+    parsing. Specific spectra can be accessed by title using the indexing syntax (if the file is seekable),
+    but it takes linear time to search through the file. Consider using :py:class:`IndexedMGF` for
+    constant-time access to spectra.
+
+    :py:class:`MGF` object behaves as an iterator, **yielding** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
+    'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    'ion array' is a masked array of ion labels (str)
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MGF, lowercased).
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+
+    """
+
+    def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
+                 read_ions=False, dtype=None, encoding=None):
+        """
+        Create an :py:class:`MGF` (text-mode) reader for a given MGF file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MGF format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in text mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`True`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+ + read_charges : bool, optional + If `True` (default), fragment charges are reported. Disabling it improves performance. + + read_ions : bool, optional + If `True` (default: False), fragment ion types are reported. Disabling it improves performance. + Note that right now, only one of (read_charges, read_ions) may be True. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. + + encoding : str, optional + File encoding. + + Returns + ------- + + out : MGF + The reader object. + """ + super(MGF, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={}, + encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges, + read_ions=read_ions, dtype=dtype) + + @aux._keepstate_method + def _read_header(self): + return self._read_header_lines(self._source) + + def _read_spectrum(self): + return self._read_spectrum_lines(self._source) + + def _read(self): + for line in self._source: + if line.strip() == 'BEGIN IONS': + yield self._read_spectrum() + + @aux._keepstate_method + def get_spectrum(self, title): + for line in self._source: + sline = line.strip() + if sline[:5] == 'TITLE' and sline.split('=', 1)[1].strip() == title: + spectrum = self._read_spectrum() + spectrum['params']['title'] = title + return spectrum + + def __getitem__(self, key): + return self.get_spectrum(key) + + +def read(*args, **kwargs): + """Returns a reader for a given MGF file. Most of the parameters repeat the + instantiation signature of :py:class:`MGF` and :py:class:`IndexedMGF`. + Additional parameter `use_index` helps decide which class to instantiate + for given `source`. + + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MGF format. Default is + :py:const:`None`, which means read standard input. + + use_header : bool, optional + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`True`. + + convert_arrays : one of {0, 1, 2}, optional + If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + read_charges : bool, optional + If `True` (default), fragment charges are reported. Disabling it improves performance. + + read_ions : bool, optional + If `True` (default: False), fragment ion types are reported. Disabling it improves performance. + Note that right now, only one of (read_charges, read_ions) may be True. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. + + encoding : str, optional + File encoding. + + use_index : bool, optional + Determines which parsing method to use. If :py:const:`True` (default), an instance of + :py:class:`IndexedMGF` is created. This facilitates random access by spectrum titles. + If an open file is passed as `source`, it needs to be open in binary mode. + + If :py:const:`False`, an instance of :py:class:`MGF` is created. 
It reads
+        `source` in text mode and is suitable for iterative parsing. Access by spectrum title
+        requires linear search and thus takes linear time.
+
+    block_size : int, optional
+        Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+        (Accepted only for :py:class:`IndexedMGF`.)
+
+    Returns
+    -------
+
+    out : MGFBase
+        Instance of :py:class:`MGF` or :py:class:`IndexedMGF`.
+    """
+    if args:
+        source = args[0]
+    else:
+        source = kwargs.get('source')
+    use_index = kwargs.pop('use_index', None)
+    use_index = aux._check_use_index(source, use_index, True)
+    tp = IndexedMGF if use_index else MGF
+    return tp(*args, **kwargs)
+
+
+def get_spectrum(source, title, *args, **kwargs):
+    """Read one spectrum (with given `title`) from `source`.
+
+    See :py:func:`read` for explanation of parameters affecting the output.
+
+    .. note :: Only the key-value pairs after the "TITLE =" line will be included in the output.
+
+    Parameters
+    ----------
+
+    source : str or file or None
+        File to read from.
+    title : str
+        Spectrum title.
+    *args
+        Given to :py:func:`read`.
+    **kwargs
+        Given to :py:func:`read`.
+
+    Returns
+    -------
+    out : dict or None
+        A dict with the spectrum, if it is found, and None otherwise.
+
+    """
+    with read(source, *args, **kwargs) as f:
+        return f[title]
+
+
+@aux._keepstate
+def read_header(source):
+    """
+    Read the specified MGF file, get search parameters specified in the header
+    as a :py:class:`dict`, the keys corresponding to MGF format (lowercased).
+
+    Parameters
+    ----------
+
+    source : str or file
+        File name or file object representing a file in MGF format.
+
+    Returns
+    -------
+
+    header : dict
+    """
+    with aux._file_obj(source, 'r') as source:
+        header = {}
+        for line in source:
+            if line.strip() == 'BEGIN IONS':
+                break
+            l = line.split('=')
+            if len(l) == 2:
+                key = l[0].lower()
+                val = l[1].strip()
+                header[key] = val
+        if 'charge' in header:
+            header['charge'] = aux._parse_charge(header['charge'], True)
+        return header
+
+
+_default_key_order = ['title', 'pepmass', 'rtinseconds', 'charge']
+
+
+def _pepmass_repr(k, pepmass):
+    outstr = k.upper() + '='
+    if not isinstance(pepmass, (str, int, float)):  # assume iterable
+        try:
+            outstr += ' '.join(str(x) for x in pepmass if x is not None)
+        except TypeError:
+            raise aux.PyteomicsError('Cannot handle parameter: PEPMASS = {}'.format(pepmass))
+    else:
+        outstr += str(pepmass)
+    return outstr
+
+
+def _charge_repr(k, charge):
+    try:
+        val = aux.Charge(charge)
+    except (TypeError, aux.PyteomicsError):
+        val = aux.ChargeList(charge)
+    return '{}={}'.format(k.upper(), val)
+
+
+def _default_repr(key, val):
+    return '{}={}'.format(key.upper(), val)
+
+
+_default_value_formatters = {'pepmass': _pepmass_repr, 'charge': _charge_repr}
+
+
+@aux._file_writer()
+def write(spectra, output=None, header='', key_order=_default_key_order, fragment_format=None,
+          write_charges=True, write_ions=False, use_numpy=None, param_formatters=_default_value_formatters):
+    """
+    Create a file in MGF format.
+
+    Parameters
+    ----------
+
+    spectra : iterable
+        A **sequence** of dictionaries with keys 'm/z array', 'intensity array',
+        and 'params'. 'm/z array' and 'intensity array' should be sequences of
+        :py:class:`int`, :py:class:`float`, or :py:class:`str`. Strings will
+        be written 'as is'. The sequences should be of equal length, otherwise
+        excess values will be ignored.
+
+        'params' should be a :py:class:`dict` with keys corresponding to MGF
+        format.
Keys must be strings, they will be uppercased and used as is,
+        without any format consistency tests. Values can be of any type allowing
+        string representation.
+
+        'charge array' or 'ion array' can also be specified.
+
+        .. note ::
+            Passing a single spectrum will work, but will trigger a warning. This usage pattern is discouraged.
+            To ensure correct output when writing multiple spectra,
+            it is recommended to construct a sequence of spectra first and then call :py:func:`write` once.
+
+        .. seealso ::
+            This discussion of usage patterns of :py:func:`write`: https://github.com/levitsky/pyteomics/discussions/109
+
+    output : str or file or None, optional
+        Path or a file-like object open for writing. If an existing file is
+        specified by file name, it will be opened for writing.
+        Default value is :py:const:`None`, which means using standard output.
+
+        .. note::
+            The default mode for output files specified by name has been changed
+            from `a` to `w` in *pyteomics 4.6*. See `file_mode` to override the mode.
+
+    header : dict or (multiline) str or list of str, optional
+        In case of a single string or a list of strings, the header will be
+        written 'as is'. In case of dict, the keys (must be strings) will be
+        uppercased.
+
+    write_charges : bool, optional
+        If :py:const:`False`, fragment charges from 'charge array' will not be written.
+        Default is :py:const:`True`.
+
+    write_ions : bool, optional
+        If :py:const:`False`, fragment ions from 'ion array' will not be written.
+        If :py:const:`True`, then `write_charges` is set to :py:const:`False`.
+        Default is :py:const:`False`.
+
+    fragment_format : str, optional
+        Format string for m/z, intensity and charge (or ion annotation) of a fragment. Useful to set
+        the number of decimal places, e.g.:
+        ``fragment_format='%.4f %.0f'``. Default is ``'{} {} {}'``.
+
+        .. note::
+            The supported format syntax differs depending on other parameters.
+            If `use_numpy` is :py:const:`True` and :py:mod:`numpy` is available,
+            fragment peaks will be written using :py:func:`numpy.savetxt`. Then,
+            `fragment_format` must be recognized by that function.
+
+            Otherwise, plain Python string formatting is done.
+            See `the docs
+            <https://docs.python.org/library/string.html#format-specification-mini-language>`_
+            for details on writing the format string.
+            If some or all charges are missing, an empty string is substituted
+            instead, so formatting as :py:class:`!float` or :py:class:`!int` will raise an exception.
+            Hence it is safer to just use ``{}`` for charges.
+
+    key_order : list, optional
+        A list of strings specifying the order in which params will be written in
+        the spectrum header. Unlisted keys will be in arbitrary order.
+        Default is :py:data:`_default_key_order`.
+
+        .. note:: This does not affect the order of lines in the global header.
+
+    param_formatters : dict, optional
+        A dict mapping parameter names to functions. Each function must accept
+        two arguments (key and value) and return a string.
+        Default is :py:data:`_default_value_formatters`.
+
+    use_numpy : bool, optional
+        Controls whether fragment peak arrays are written using :py:func:`numpy.savetxt`.
+        Using :py:func:`numpy.savetxt` is faster, but cannot handle sparse arrays of fragment charges.
+        You may want to disable this if you need to save spectra with 'charge arrays' with missing values.
+
+        If not specified, will be set to the opposite of `write_charges`.
+        If :py:mod:`numpy` is not available, this parameter has no effect.
+ + file_mode : str, keyword only, optional + If `output` is a file name, defines the mode the file will be opened in. + Otherwise will be ignored. Default is `'w'`. + + .. note :: + The default changed from `'a'` in *pyteomics 4.6*. + + encoding : str, keyword only, optional + Output file encoding (if `output` is specified by name). + + Returns + ------- + + output : file + """ + def key_value_line(key, val): + return param_formatters.get(key, _default_repr)(key, val) + '\n' + + nones = (None, np.nan, np.ma.masked) if np is not None else (None,) + + if fragment_format is None: + fragment_format = '{} {} {}' + np_format_2 = '%.5f %.1f' + np_format_3 = '%.5f %.1f %d' + np_format_i = '%.5f %.1f %s' + else: + np_format_2 = np_format_3 = np_format_i = fragment_format + format_str = fragment_format + '\n' + + if write_ions: + write_charges = False + if use_numpy is None: + use_numpy = not write_charges + + if isinstance(header, dict): + head_dict = header.copy() + head_lines = [key_value_line(k, v) for k, v in header.items()] + head_str = '\n'.join(head_lines) + else: + if isinstance(header, str): + head_str = header + head_lines = header.split('\n') + else: + head_lines = list(header) + head_str = '\n'.join(header) + head_dict = {} + for line in head_lines: + if not line.strip() or any(line.startswith(c) for c in MGF._comments): + continue + l = line.split('=') + if len(l) == 2: + head_dict[l[0].lower()] = l[1].strip() + if head_str: + output.write(head_str + '\n\n') + + if isinstance(spectra, dict) and 'm/z array' in spectra: + spectra = (spectra, ) + warnings.warn("Passing a single spectrum to `write()` is discouraged. " + "To write a set of spectra, pass them to `write()` all at once. " + "For more info, see: https://github.com/levitsky/pyteomics/discussions/109.") + + for spectrum in spectra: + output.write('BEGIN IONS\n') + found = set() + for key in it.chain(key_order, spectrum['params']): + if key not in found and key in spectrum['params']: + found.add(key) + val = spectrum['params'][key] + if val != head_dict.get(key): + output.write(key_value_line(key, val)) + + try: + success = True + if np is not None and use_numpy: + if (not write_charges or 'charge array' not in spectrum) and (not write_ions or 'ion array' not in spectrum): + X = np.empty((len(spectrum['m/z array']), 2)) + X[:, 0] = spectrum['m/z array'] + X[:, 1] = spectrum['intensity array'] + np.savetxt(output, X, fmt=np_format_2) + elif isinstance(spectrum.get('charge array'), np.ndarray): + X = np.empty((len(spectrum['m/z array']), 3)) + X[:, 0] = spectrum['m/z array'] + X[:, 1] = spectrum['intensity array'] + X[:, 2] = spectrum['charge array'] + np.savetxt(output, X, fmt=np_format_3) + elif isinstance(spectrum.get('ion array'), np.ndarray): + X = np.empty((len(spectrum['m/z array']), 3), dtype=object) + X[:, 0] = spectrum['m/z array'] + X[:, 1] = spectrum['intensity array'] + X[:, 2] = spectrum['ion array'] + np.savetxt(output, X, fmt=np_format_i) + else: + success = False + else: + success = False + + if not success: + for m, i, c in zip(spectrum['m/z array'], + spectrum['intensity array'], + spectrum.get('charge array', it.cycle((None,))) if write_charges else + spectrum.get('ion array', it.cycle((None,))) if write_ions else + it.cycle((None,))): + output.write(format_str.format( + m, i, + (c if c not in nones else ''))) + except KeyError: + raise aux.PyteomicsError("'m/z array' and 'intensity array' must be present in all spectra.") + output.write('END IONS\n\n') + return output + + +chain = aux._make_chain(read, 'read') 
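+
+
+# A minimal usage sketch ('spectra.mgf' and 'out.mgf' are hypothetical paths):
+# read all spectra through the default indexed parser, then write them back
+# in a single call, as recommended in the `write` docstring.
+#
+#     from pyteomics import mgf
+#     with mgf.read('spectra.mgf') as reader:   # IndexedMGF by default
+#         spectra = list(reader)
+#     mgf.write(spectra, output='out.mgf')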
diff --git a/pyteomics/ms1.py b/pyteomics/ms1.py
new file mode 100644
index 0000000..ba7cc3b
--- /dev/null
+++ b/pyteomics/ms1.py
@@ -0,0 +1,492 @@
+"""
+ms1 - read and write MS1 data
+=============================
+
+Summary
+-------
+
+`MS1 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple
+human-readable format for MS1 data. It allows storing MS1 peak lists and experimental parameters.
+
+This module provides minimalistic infrastructure for access to data stored in MS1 files.
+Two main classes are :py:class:`MS1`, which provides an iterative, text-mode parser,
+and :py:class:`IndexedMS1`, which is a binary-mode parser that supports random access using scan IDs
+and retention times.
+The function :py:func:`read` helps dispatch between the two classes.
+Also, common parameters can be read from the MS1 file header with the :py:func:`read_header` function.
+
+Classes
+-------
+
+  :py:class:`MS1` - a text-mode MS1 parser. Suitable to read spectra from a file consecutively.
+  Needs a file opened in text mode (or will open it if given a file name).
+
+  :py:class:`IndexedMS1` - a binary-mode MS1 parser. When created, builds a byte offset index
+  for fast random access by spectrum ID. Sequential iteration is also supported.
+  Needs a seekable file opened in binary mode (if created from existing file object).
+
+  :py:class:`MS1Base` - abstract class, the common ancestor of the two classes above.
+  Can be used for type checking.
+
+Functions
+---------
+
+  :py:func:`read` - an alias for :py:class:`MS1` or :py:class:`IndexedMS1`.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`read_header` - get a dict with common parameters for all spectra
+  from the beginning of MS1 file.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from . import auxiliary as aux
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+
+class MS1Base(aux.ArrayConversionMixin):
+    """Abstract class representing an MS1 file. Subclasses implement different approaches to parsing."""
+    _array_keys = ['m/z array', 'intensity array']
+    _float_keys = ['RTime', 'RetTime']
+
+    def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding=None, **kwargs):
+        """
+        Create an instance of a :py:class:`MS1Base` parser.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS1 format. Default is
+            :py:const:`None`, which means read standard input.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+ If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array'. + + encoding : str, optional + File encoding. + """ + super(MS1Base, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding, **kwargs) + if convert_arrays and np is None: + raise aux.PyteomicsError('numpy is required for array conversion') + self._use_header = use_header + if use_header: + self._header = self._read_header() + else: + self._header = None + self._source_name = getattr(source, 'name', str(source)) + + def reset(self): + super(MS1Base, self).reset() + self._pending_line = None + + @property + def header(self): + return self._header + + def _read_header_lines(self, lines): + header = {} + for line in lines: + if line[0] != 'H': + break + tokens = line.split('\t', 2) + if len(tokens) < 3: + tokens = line.split(None, 2) + key = tokens[1] + val = tokens[2].strip() + header[key] = val + return header + + def _make_scan(self, info): + for key in self._float_keys: + if key in info['params']: + info['params'][key] = float(info['params'][key]) + self._build_all_arrays(info) + return info + + def _handle_S(self, line, sline, params): + sline = line.strip().split(None, 3) + params['scan'] = tuple(sline[1:3]) + if len(sline) == 4: # in MS2 the S line contains the precursor m/z as a 4th column + params['precursor m/z'] = float(sline[3]) + + def _handle_I(self, line, sline, params): + params[sline[1]] = sline[2] + + def _handle_Z(self, line, sline, params): + params.setdefault('charge', []).append(float(sline[1])) + params.setdefault('neutral mass', []).append(float(sline[2])) + + def _handle_D(self, line, sline, params): + params.setdefault('analyzer', []).append(sline[1:]) + + def _handle_peak(self, line, sline, info): + try: + info['m/z array'].append(float(sline[0])) # this may cause + info['intensity array'].append(float(sline[1])) # exceptions... + except ValueError: + raise aux.PyteomicsError( + 'Error when parsing %s. Line: %s' % (self._source_name, line)) + except IndexError: + pass + + def _read_spectrum_lines(self, lines): + params = {} + info = {'params': params} + for k in self._array_keys: + info[k] = [] + if self._use_header: + params.update(self.header) + if self._pending_line: + reading_spectrum = True + self._handle_S(self._pending_line, None, params) + else: + reading_spectrum = False + line_count = 0 + for i, line in enumerate(lines): + line_count = i + sline = line.strip().split(None, 2) + if not sline: + continue + if not reading_spectrum: + if sline[0] == 'S': + reading_spectrum = True + self._handle_S(line, sline, params) + # otherwise we are not interested; do nothing, just move along + else: + if not sline: + pass + elif sline[0] == 'S': + self._pending_line = line + return self._make_scan(info) + + else: + if sline[0] == 'I': # spectrum-specific parameters! 
+ self._handle_I(line, sline, params) + elif sline[0] == 'Z': # MS2-specific charge state guess + self._handle_Z(line, sline, params) + elif sline[0] == 'D': # MS2-specific analyzer annotation + self._handle_D(line, sline, params) + else: # this must be a peak list + self._handle_peak(line, sline, info) + self._pending_line = None + if line_count == 0: + return + return self._make_scan(info) + + def __getstate__(self): + state = super(MS1Base, self).__getstate__() + state['use_header'] = self._use_header + state['header'] = self._header + return state + + def __setstate__(self, state): + super(MS1Base, self).__setstate__(state) + self._use_header = state['use_header'] + self._header = state['header'] + + def __reduce_ex__(self, protocol): + return (self.__class__, + (self._source_init, False, self._convert_arrays, None, self.encoding), + self.__getstate__()) + + +class MS1(MS1Base, aux.FileReader): + """ + A class representing an MS1 file. Supports the `with` syntax and direct iteration for sequential + parsing. + + :py:class:`MS1` object behaves as an iterator, **yielding** spectra one by one. + Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array', + 'intensity array', and 'params'. 'm/z array' and + 'intensity array' store :py:class:`numpy.ndarray`'s of floats, + and 'params' stores a :py:class:`dict` of parameters. + + Attributes + ---------- + + header : dict + The file header. + + """ + def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding=None, **kwargs): + """ + Create an :py:class:`MS1` (text-mode) reader for a given MS1 file. + + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MS1 format. Default is + :py:const:`None`, which means read standard input. + + .. note :: If a file object is given, it must be opened in text mode. + + use_header : bool, optional + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`False`. + + convert_arrays : one of {0, 1, 2}, optional + If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array'. + + encoding : str, optional + File encoding. + + Returns + ------- + + out : MS1 + The reader object. + """ + super(MS1, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding, + mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={}) + + @aux._keepstate_method + def _read_header(self): + return self._read_header_lines(self._source) + + def _read(self): + def get_next_spectrum(): + return self._read_spectrum_lines(self._source) + + for spectrum in iter(get_next_spectrum, None): + yield spectrum + + +class IndexedMS1(MS1Base, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixin, aux.IndexedTextReader): + """ + A class representing an MS1 file. Supports the `with` syntax and direct iteration for sequential + parsing. Specific spectra can be accessed by title using the indexing syntax in constant time. 
+    If created using a file object, it needs to be opened in binary mode.
+
+    When iterated, :py:class:`IndexedMS1` object yields spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array', 'intensity array' and 'params'.
+    'm/z array' and 'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MS1).
+
+    .. warning ::
+        Labels for scan objects are constructed as the first number in the S line, as follows:
+        for a line ``S 0 1`` the label is `'0'`. If these labels are not unique
+        for the scans in the file, the indexed parser will not work correctly. Consider using
+        :py:class:`MS1` instead.
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+    time : RTLocator
+        A property used for accessing spectra by retention time.
+    """
+
+    delimiter = '\nS'
+    label = r'^[\n]?S\s+(\S+)'
+
+    def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding='utf-8', _skip_index=False, **kwargs):
+        """
+        Create an :py:class:`IndexedMS1` (binary-mode) reader for a given MS1 file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS1 format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in binary mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array'.
+
+        encoding : str, optional
+            File encoding.
+
+        block_size : int, optional
+            Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+
+        Returns
+        -------
+
+        out : IndexedMS1
+            The reader object.
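+
+        A minimal usage sketch (``data.ms1`` is a hypothetical path):
+
+        >>> with IndexedMS1('data.ms1') as reader:
+        ...     spectrum = reader.get_spectrum('0')  # '0' is the label from an S line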
+        """
+        super(IndexedMS1, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding,
+                                         parser_func=self._read, pass_file=False, args=(), kwargs={}, _skip_index=_skip_index, **kwargs)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+                (self._source_init, False, self._convert_arrays, None, self.encoding, True),
+                self.__getstate__())
+
+    @aux._keepstate_method
+    def _read_header(self):
+        try:
+            first = next(v for v in self._offset_index.values())[0]
+        except StopIteration:  # the index is empty, no spectra in file
+            first = -1
+        header_lines = self.read(first).decode(self.encoding).split('\n')
+        return self._read_header_lines(header_lines)
+
+    def _item_from_offsets(self, offsets):
+        start, end = offsets
+        lines = self._read_lines_from_offsets(start, end)
+        return self._read_spectrum_lines(lines)
+
+    def _read(self, **kwargs):
+        for _, offsets in self._offset_index.items():
+            spectrum = self._item_from_offsets(offsets)
+            yield spectrum
+
+    def get_spectrum(self, key):
+        return self.get_by_id(key)
+
+    def _get_time(self, spectrum):
+        try:
+            return spectrum['params']['RTime']
+        except KeyError:
+            raise aux.PyteomicsError('RT information not found.')
+
+
+def read_header(source, *args, **kwargs):
+    """
+    Read the specified MS1 file, get the parameters specified in the header
+    as a :py:class:`dict`.
+
+    Parameters
+    ----------
+
+    source : str or file
+        File name or file object representing a file in MS1 format.
+
+    Returns
+    -------
+
+    header : dict
+    """
+    kwargs['use_header'] = True
+    return read(source, *args, **kwargs).header
+
+
+def read(*args, **kwargs):
+    """Read an MS1 file and return entries iteratively.
+
+    Read the specified MS1 file, **yield** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array',
+    'intensity array', and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters.
+
+    Parameters
+    ----------
+
+    source : str or file or None, optional
+        A file object (or file name) with data in MS1 format. Default is
+        :py:const:`None`, which means read standard input.
+
+    use_header : bool, optional
+        Add the info from file header to each dict. Spectrum-specific parameters
+        override those from the header in case of conflict.
+        Default is :py:const:`False`.
+
+    convert_arrays : one of {0, 1, 2}, optional
+        If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+        If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+        If `2`, charges will be reported as a masked array (default).
+        The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+    dtype : type or str or dict, optional
+        dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+        Keys should be 'm/z array' and/or 'intensity array'.
+
+    encoding : str, optional
+        File encoding.
+
+    use_index : bool, optional
+        Determines which parsing method to use. If :py:const:`True`, an instance of
+        :py:class:`IndexedMS1` is created. This facilitates random access by scan IDs.
+        If an open file is passed as `source`, it needs to be open in binary mode.
+
+        If :py:const:`False` (default), an instance of :py:class:`MS1` is created. It reads
+        `source` in text mode and is suitable for iterative parsing.
+
+        .. warning ::
+            Labels for scan objects are constructed as the first number in the S line, as follows:
+            for a line ``S 0 1`` the label is `'0'`.
If these labels are not unique + for the scans in the file, the indexed parser will not work correctly. + + block_size : int, optional + Size of the chunk (in bytes) used to parse the file when creating the byte offset index. + (Accepted only for :py:class:`IndexedMS1`.) + + Returns + ------- + + out : :py:class:`MS1Base` + An instance of :py:class:`MS1` or :py:class:`IndexedMS1`, depending on `use_index` and `source`. + """ + if args: + source = args[0] + else: + source = kwargs.get('source') + use_index = kwargs.pop('use_index', None) + use_index = aux._check_use_index(source, use_index, False) + tp = IndexedMS1 if use_index else MS1 + + return tp(*args, **kwargs) + + +chain = aux._make_chain(read, 'read') diff --git a/pyteomics/ms2.py b/pyteomics/ms2.py new file mode 100644 index 0000000..16afbbf --- /dev/null +++ b/pyteomics/ms2.py @@ -0,0 +1,396 @@ +""" +ms2 - read and write MS/MS data in MS2 format +============================================= + +Summary +------- + +`MS2 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple +human-readable format for MS2 data. It allows storing MS2 peak lists and +experimental parameters. + +This module provides minimalistic infrastructure for access to data stored in +MS2 files. +Two main classes are :py:class:`MS2`, which provides an iterative, text-mode parser, +and :py:class:`IndexedMS2`, which is a binary-mode parser that supports random access using scan IDs +and retention times. +The function :py:func:`read` helps dispatch between the two classes. +Also, common parameters can be read from an MS2 file header with the +:py:func:`read_header` function. + +Classes +------- + + :py:class:`MS2` - a text-mode MS2 parser. Suitable to read spectra from a file consecutively. + Needs a file opened in text mode (or will open it if given a file name). + + :py:class:`IndexedMS2` - a binary-mode MS2 parser. When created, builds a byte offset index + for fast random access by spectrum ID. Sequential iteration is also supported. + Needs a seekable file opened in binary mode (if created from existing file object). + + :py:class:`MS2Base` - abstract class, the common ancestor of the two classes above. + Can be used for type checking. + +Functions +--------- + + :py:func:`read` - an alias for :py:class:`MS2` or :py:class:`IndexedMS2`. + + :py:func:`chain` - read multiple files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + + :py:func:`read_header` - get a dict with common parameters for all spectra + from the beginning of an MS2 file. + +------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pyteomics import auxiliary as aux +from pyteomics.ms1 import MS1, IndexedMS1, MS1Base + + +class MS2Base(aux.MaskedArrayConversionMixin, MS1Base): + """Abstract class representing an MS2 file.
Subclasses implement different approaches to parsing.""" + _array_keys = ['m/z array', 'intensity array', 'charge array', 'resolution array'] + _float_keys = ['RTime', 'RetTime', 'IonInjectionTime', 'PrecursorInt'] + + def __init__(self, source=None, use_header=False, convert_arrays=2, dtype=None, read_charges=True, read_resolutions=True, encoding=None, **kwargs): + """ + Create an instance of a :py:class:`MS2Base` parser. + + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MS2 format. Default is + :py:const:`None`, which means read standard input. + + use_header : bool, optional + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`False`. + + convert_arrays : one of {0, 1, 2}, optional + If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + read_charges : bool, optional + If `True` (default), fragment charges are reported. Disabling it improves performance. + Charge is expected to be the **third** number on the line, after peak *m/z* and intensity. + + read_resolutions : bool, optional + If `True` (default), fragment peak resolutions are reported. Disabling it improves performance. + Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'. + + encoding : str, optional + File encoding. + """ + super(MS2Base, self).__init__(source=source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, + encoding=encoding, **kwargs) + self._read_charges = read_charges + self._read_resolutions = read_resolutions + + def _handle_peak(self, line, sline, info): + super(MS2Base, self)._handle_peak(line, sline, info) + # `sline` is the peak line already split into fields: + # m/z, intensity, and optionally charge and resolution. + if self._read_charges: + if len(sline) > 2: + try: + info['charge array'].append(int(sline[2])) + except ValueError: + raise aux.PyteomicsError("Error parsing fragment charge on line: " + line) + else: + info['charge array'].append(0) + if self._read_resolutions: + if len(sline) > 3: + try: + info['resolution array'].append(int(sline[3])) + except ValueError: + raise aux.PyteomicsError("Error parsing fragment peak resolution on line: " + line) + else: + info['resolution array'].append(0) + + def _make_scan(self, info): + if not self._read_charges: + del info['charge array'] + if not self._read_resolutions: + del info['resolution array'] + return super(MS2Base, self)._make_scan(info) + + def __reduce_ex__(self, protocol): + return (self.__class__, + (self._source_init, False, self._convert_arrays, None, self._read_charges, self._read_resolutions, self.encoding), + self.__getstate__()) + + +class MS2(MS2Base, MS1): + """ + A class representing an MS2 file. Supports the `with` syntax and direct iteration for sequential + parsing. + + :py:class:`MS2` object behaves as an iterator, **yielding** spectra one by one. + Each 'spectrum' is a :py:class:`dict` with at least three keys: 'm/z array', + 'intensity array', and 'params'.
'm/z array' and + 'intensity array' store :py:class:`numpy.ndarray`'s of floats, + and 'params' stores a :py:class:`dict` of parameters. + + Attributes + ---------- + + header : dict + The file header. + + """ + def __init__(self, *args, **kwargs): + """ + Create an :py:class:`MS2` (text-mode) reader for a given MS2 file. + + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MS2 format. Default is + :py:const:`None`, which means read standard input. + + .. note :: If a file object is given, it must be opened in text mode. + + use_header : bool, optional + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`False`. + + convert_arrays : one of {0, 1, 2}, optional + If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + read_charges : bool, optional + If `True` (default), fragment charges are reported. Disabling it improves performance. + Charge is expected to be the **third** number on the line, after peak *m/z* and intensity. + + read_resolutions : bool, optional + If `True` (default), fragment peak resolutions are reported. Disabling it improves performance. + Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'. + + encoding : str, optional + File encoding. + + Returns + ------- + + out : MS2 + The reader object. + """ + super(MS2, self).__init__(*args, **kwargs) + + +class IndexedMS2(IndexedMS1, MS2Base): + """ + A class representing an MS2 file. Supports the `with` syntax and direct iteration for sequential + parsing. Specific spectra can be accessed by label using the indexing syntax in constant time. + If created using a file object, it needs to be opened in binary mode. + + When iterated, :py:class:`IndexedMS2` object yields spectra one by one. + Each 'spectrum' is a :py:class:`dict` with four keys: 'm/z array', + 'intensity array', 'charge array' and 'params'. 'm/z array' and + 'intensity array' store :py:class:`numpy.ndarray`'s of floats, + 'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints, + and 'params' stores a :py:class:`dict` of parameters (keys and values are + :py:class:`str`, keys corresponding to MS2). + + .. warning :: + Labels for scan objects are constructed as the first number in the S line, as follows: + for a line ``S 0 1 123.4`` the label is `'0'`. If these labels are not unique + for the scans in the file, the indexed parser will not work correctly. Consider using + :py:class:`MS2` instead. + + Attributes + ---------- + + header : dict + The file header. + time : RTLocator + A property used for accessing spectra by retention time. + """ + def __init__(self, source=None, use_header=False, convert_arrays=2, dtype=None, read_charges=True, read_resolutions=True, + encoding='utf-8', _skip_index=False, **kwargs): + """ + Create an :py:class:`IndexedMS2` (binary-mode) reader for a given MS2 file.
+ + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MS2 format. Default is + :py:const:`None`, which means read standard input. + + .. note :: If a file object is given, it must be opened in binary mode. + + use_header : bool, optional + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`False`. + + convert_arrays : one of {0, 1, 2}, optional + If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`. + + read_charges : bool, optional + If `True` (default), fragment charges are reported. Disabling it improves performance. + Charge is expected to be the **third** number on the line, after peak *m/z* and intensity. + + read_resolutions : bool, optional + If `True` (default), fragment peak resolutions are reported. Disabling it improves performance. + Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'. + + encoding : str, optional + File encoding. + + block_size : int, optional + Size of the chunk (in bytes) used to parse the file when creating the byte offset index. + + Returns + ------- + + out : IndexedMS2 + The reader object. + """ + super(IndexedMS2, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, + read_charges=read_charges, read_resolutions=read_resolutions, encoding=encoding, _skip_index=_skip_index, **kwargs) + + def __reduce_ex__(self, protocol): + return (self.__class__, + (self._source_init, False, self._convert_arrays, None, self._read_charges, self._read_resolutions, self.encoding, True), + self.__getstate__()) + + +def read_header(source, *args, **kwargs): + """ + Read the specified MS2 file, get the parameters specified in the header + as a :py:class:`dict`. + + Parameters + ---------- + + source : str or file + File name or file object representing a file in MS2 format. + + Returns + ------- + + header : dict + """ + kwargs['use_header'] = True + return read(source, *args, **kwargs).header + + +def read(*args, **kwargs): + """Read an MS2 file and return entries iteratively. + + Read the specified MS2 file, **yield** spectra one by one. + Each 'spectrum' is a :py:class:`dict` with at least three keys: 'm/z array', + 'intensity array', and 'params'. 'm/z array' and + 'intensity array' store :py:class:`numpy.ndarray`'s of floats, + and 'params' stores a :py:class:`dict` of parameters. + + Parameters + ---------- + + source : str or file or None, optional + A file object (or file name) with data in MS2 format. Default is + :py:const:`None`, which means read standard input. + + use_header : bool, optional + Add the info from file header to each dict. Spectrum-specific parameters + override those from the header in case of conflict. + Default is :py:const:`False`. + + convert_arrays : one of {0, 1, 2}, optional + If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. + If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. + If `2`, charges will be reported as a masked array (default). + The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+ + read_charges : bool, optional + If `True` (default), fragment charges are reported. Disabling it improves performance. + Charge is expected to be the **third** number on the line, after peak *m/z* and intensity. + + read_resolutions : bool, optional + If `True` (default), fragment peak resolutions are reported. Disabling it improves performance. + Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge. + + dtype : type or str or dict, optional + dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. + Keys should be 'm/z array' and/or 'intensity array'. + + encoding : str, optional + File encoding. + + use_index : bool, optional + Determines which parsing method to use. If :py:const:`True`, an instance of + :py:class:`IndexedMS2` is created. This facilitates random access by scan labels. + If an open file is passed as `source`, it needs to be open in binary mode. + + .. warning :: + Labels for scan objects are constructed as the first number in the S line, as follows: + for a line ``S 0 1 123.4`` the label is `'0'`. If these labels are not unique + for the scans in the file, the indexed parser will not work correctly. + + If :py:const:`False` (default), an instance of :py:class:`MS2` is created. It reads + `source` in text mode and is suitable for iterative parsing. + + block_size : int, optional + Size of the chunk (in bytes) used to parse the file when creating the byte offset index. + (Accepted only for :py:class:`IndexedMS2`.) + + Returns + ------- + + out : :py:class:`MS2Base` + An instance of :py:class:`MS2` or :py:class:`IndexedMS2`, depending on `use_index` and `source`. + """ + if args: + source = args[0] + else: + source = kwargs.get('source') + use_index = kwargs.pop('use_index', None) + use_index = aux._check_use_index(source, use_index, False) + tp = IndexedMS2 if use_index else MS2 + + return tp(*args, **kwargs) + + +chain = aux._make_chain(read, 'read') diff --git a/pyteomics/mzid.py b/pyteomics/mzid.py new file mode 100644 index 0000000..2df70bf --- /dev/null +++ b/pyteomics/mzid.py @@ -0,0 +1,453 @@ +""" +mzid - mzIdentML file reader +============================ + +Summary +------- + +`mzIdentML <http://www.psidev.info/mzidentml>`_ is one of the standards +developed by the Proteomics Informatics working group of the HUPO Proteomics +Standard Initiative. + +This module provides a minimalistic way to extract information from mzIdentML +files. You can use the old functional interface (:py:func:`read`) or the new +object-oriented interface (:py:class:`MzIdentML`) to iterate over entries in +``<SpectrumIdentificationResult>`` elements, i.e. groups of identifications +for a certain spectrum. Note that each entry can contain more than one PSM +(peptide-spectrum match). They are accessible with the "SpectrumIdentificationItem" +key. +:py:class:`MzIdentML` objects also support direct indexing by element ID. + +Data access +----------- + + :py:class:`MzIdentML` - a class representing a single MzIdentML file. + Other data access functions use this class internally. + + :py:func:`read` - iterate through peptide-spectrum matches in an mzIdentML + file. Data from a single PSM group are converted to a human-readable dict. + Basically creates an :py:class:`MzIdentML` object and reads it. + + :py:func:`chain` - read multiple files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + + :py:func:`DataFrame` - read MzIdentML files into a :py:class:`pandas.DataFrame`.
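+ +Example +------- + +A minimal usage sketch (the file name ``example.mzid`` is hypothetical; +``spectrumID`` and ``rank`` are standard mzIdentML attributes):: + + from pyteomics import mzid + + with mzid.MzIdentML('example.mzid') as reader: + for result in reader: # one <SpectrumIdentificationResult> at a time + for item in result['SpectrumIdentificationItem']: + print(result['spectrumID'], item['rank'])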
+ +Target-decoy approach +--------------------- + + :py:func:`filter` - read a chain of mzIdentML files and filter to a certain + FDR using TDA. + + :py:func:`filter.chain` - chain a series of filters applied independently to + several files. + + :py:func:`filter.chain.from_iterable` - chain a series of filters applied + independently to an iterable of files. + + :py:func:`filter_df` - filter MzIdentML files and return a :py:class:`pandas.DataFrame`. + + :py:func:`is_decoy` - determine if a "SpectrumIdentificationResult" should be + considered decoy. + + :py:func:`fdr` - estimate the false discovery rate of a set of identifications + using the target-decoy approach. + + :py:func:`qvalues` - get an array of scores and local FDR values for a PSM + set using the target-decoy approach. + +Controlled Vocabularies +~~~~~~~~~~~~~~~~~~~~~~~ +mzIdentML relies on controlled vocabularies to describe its contents extensibly. See +`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_ +for more details on how they are used. + +Handling Time Units and Other Qualified Quantities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +mzIdentML contains information which may be described as using a variety of different time units. +See `Unit Handling <../data.html#unit-handling>`_ for more information. + + +Deprecated functions +-------------------- + + :py:func:`version_info` - get information about mzIdentML version and schema. + You can just read the corresponding attribute of the :py:class:`MzIdentML` + object. + + :py:func:`get_by_id` - get an element by its ID and extract the data from it. + You can just call the corresponding method of the :py:class:`MzIdentML` + object. + + :py:func:`iterfind` - iterate over elements in an mzIdentML file. + You can just call the corresponding method of the :py:class:`MzIdentML` + object. + +Dependencies +------------ + +This module requires :py:mod:`lxml`. + +------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from . import auxiliary as aux +from . 
import xml, _schema_defaults + + +class MzIdentML(xml.MultiProcessingXML, xml.IndexSavingXML): + """Parser class for MzIdentML files.""" + file_format = 'mzIdentML' + _root_element = 'MzIdentML' + _default_schema = _schema_defaults._mzid_schema_defaults + _default_version = '1.1.0' + _default_iter_tag = 'SpectrumIdentificationResult' + _structures_to_flatten = {'Fragmentation'} + _indexed_tags = {'SpectrumIdentificationResult', 'SpectrumIdentificationItem', + 'SearchDatabase', 'SourceFile', 'SpectraData', 'Sample', + 'DBSequence', 'Peptide', 'PeptideEvidence', + 'Measure', 'TranslationTable', 'MassTable', 'Enzyme', + 'Organization', 'AnalysisSoftware', 'BibliographicReference', 'Person', 'Provider', + 'SpectrumIdentificationList', 'SpectrumIdentificationProtocol', 'SpectrumIdentification', + 'ProteinDetectionList', 'ProteinDetectionProtocol', 'ProteinDetection', + 'ProteinDetectionHypothesis', 'ProteinAmbiguityGroup', + } + + _element_handlers = xml.XML._element_handlers.copy() + _element_handlers.update({ + "Modification": xml.XML._promote_empty_parameter_to_name, + "SpectrumIDFormat": xml.XML._promote_empty_parameter_to_name, + "FileFormat": xml.XML._promote_empty_parameter_to_name, + "Role": xml.XML._promote_empty_parameter_to_name + }) + + def __init__(self, *args, **kwargs): + kwargs.setdefault('retrieve_refs', True) + super(MzIdentML, self).__init__(*args, **kwargs) + + def _get_info_smart(self, element, **kwargs): + """Extract the info in a smart way depending on the element type""" + name = xml._local_name(element) + kwargs = dict(kwargs) + rec = kwargs.pop("recursive", None) + + # Try not to recursively unpack the root element + # unless the user really wants to. + if name == self._root_element: + return self._get_info(element, + recursive=(rec if rec is not None else False), + **kwargs) + else: + return self._get_info(element, + recursive=(rec if rec is not None else True), + **kwargs) + + def _retrieve_refs(self, info, **kwargs): + """Retrieves and embeds the data for each attribute in `info` that + ends in _ref. Removes the id attribute from `info`""" + for k, v in dict(info).items(): + if k.endswith('_ref'): + try: + by_id = self.get_by_id(v, retrieve_refs=True) + except KeyError: + warnings.warn('Ignoring unresolved reference: ' + v) + else: + info.update(by_id) + del info[k] + info.pop('id', None) + +def read(source, **kwargs): + """Parse `source` and iterate through peptide-spectrum matches. + + .. note:: This function is provided for backward compatibility only. + It simply creates an :py:class:`MzIdentML` instance using + provided arguments and returns it. + + Parameters + ---------- + source : str or file + A path to a target mzIdentML file or the file object itself. + + recursive : bool, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + retrieve_refs : bool, optional + If :py:const:`True`, additional information from references will be + automatically added to the results. The file processing time will + increase. Default is :py:const:`True`. + + iterative : bool, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. 
+ + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the mzIdentML header (default). Otherwise, use default + parameters. Disable this to avoid waiting on slow network connections or + if you don't like to get the related warnings. + + build_id_cache : bool, optional + Defines whether a cache of element IDs should be built and stored on the + created :py:class:`MzIdentML` instance. Default value is the value of + `retrieve_refs`. + + .. note:: This parameter is ignored when ``use_index`` is ``True`` (default). + + use_index : bool, optional + Defines whether an index of byte offsets needs to be created for + the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored. + + indexed_tags : container of bytes, optional + Defines which elements need to be indexed. Empty set by default. + + Returns + ------- + out : MzIdentML + An iterator over the dicts with PSM properties. + """ + kwargs = kwargs.copy() + kwargs.setdefault('retrieve_refs', True) + kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs')) + return MzIdentML(source, **kwargs) + +def iterfind(source, path, **kwargs): + """Parse `source` and yield info on elements with specified local + name or by specified "XPath". + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`iterfind` calls on one file, you should + create an :py:class:`MzIdentML` object and use its + :py:meth:`!iterfind` method. + + Parameters + ---------- + source : str or file + File name or file-like object. + + path : str + Element name or XPath-like expression. Only local names separated + with slashes are accepted. An asterisk (`*`) means any element. + You can specify a single condition in the end, such as: + ``"/path/to/element[some_value>1.5]"`` + Note: you can do much more powerful filtering using plain Python. + The path can be absolute or "free". Please don't specify + namespaces. + + recursive : bool, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + retrieve_refs : bool, optional + If :py:const:`True`, additional information from references will be + automatically added to the results. The file processing time will + increase. Default is :py:const:`False`. + + iterative : bool, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the mzIdentML header (default). Otherwise, use default + parameters. Disable this to avoid waiting on slow network connections or + if you don't like to get the related warnings. + + build_id_cache : bool, optional + Defines whether a cache of element IDs should be built and stored on the + created :py:class:`MzIdentML` instance. Default value is the value of + `retrieve_refs`. 
+ + Returns + ------- + out : iterator + """ + kwargs = kwargs.copy() + kwargs['build_id_cache'] = kwargs.get('build_id_cache', + kwargs.get('retrieve_refs')) + return MzIdentML(source, **kwargs).iterfind(path, **kwargs) + +version_info = xml._make_version_info(MzIdentML) + +def get_by_id(source, elem_id, **kwargs): + """Parse `source` and return the element with `id` attribute equal + to `elem_id`. Returns :py:const:`None` if no such element is found. + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`get_by_id` calls on one file, you should + create an :py:class:`MzIdentML` object and use its + :py:meth:`!get_by_id` method. + + Parameters + ---------- + source : str or file + A path to a target mzIdentML file or the file object itself. + + elem_id : str + The value of the `id` attribute to match. + + Returns + ------- + out : :py:class:`dict` or :py:const:`None` + """ + return MzIdentML(source, **kwargs).get_by_id(elem_id, **kwargs) + + +# chain = aux._make_chain(read, 'read') +chain = aux.ChainBase._make_chain(MzIdentML) + + +def is_decoy(psm, prefix=None): + """Given a PSM dict, return :py:const:`True` if all proteins in the dict + are marked as decoy, and :py:const:`False` otherwise. + + Parameters + ---------- + psm : dict + A dict, as yielded by :py:func:`read`. + prefix : ignored + + Returns + ------- + out : bool + """ + return all(pe['isDecoy'] for sii in psm['SpectrumIdentificationItem'] + for pe in sii['PeptideEvidenceRef']) + + +def DataFrame(*args, **kwargs): + """Read MzIdentML files into a :py:class:`pandas.DataFrame`. + + Requires :py:mod:`pandas`. + + .. warning :: Only the first 'SpectrumIdentificationItem' element is considered in every + 'SpectrumIdentificationResult'. + + Parameters + ---------- + *args + Passed to :py:func:`chain`. + **kwargs + Passed to :py:func:`chain`. + + sep : str or None, keyword only, optional + Some values related to PSMs (such as protein information) are variable-length + lists. If `sep` is a :py:class:`str`, they will be packed into a single string using + this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is + :py:const:`None`.
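+ For example, with ``sep=';'`` a PSM matching two proteins would carry a single + string like ``'ACC1;ACC2'`` in the 'accession' column (the accession values + here are illustrative).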
+ + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + data = [] + + sep = kwargs.pop('sep', None) + with chain(*args, **kwargs) as f: + for item in f: + info = {} + for k, v in item.items(): + if isinstance(v, (str, int, float)): + info[k] = v + sii = item.get('SpectrumIdentificationItem', [None])[0] + if sii is not None: + info.update((k, v) for k, v in sii.items() if isinstance(v, (str, int, float))) + evref = sii.get('PeptideEvidenceRef') + if evref: + prot_descr, accessions, isd, starts, ends, lengths = [], [], [], [], [], [] + for d in evref: + prot_descr.append(d.get('protein description')) + accessions.append(d.get('accession')) + isd.append(d.get('isDecoy')) + starts.append(d.get('start')) + ends.append(d.get('end')) + lengths.append(d.get('length')) + isd = all(isd) + if sep is not None: + if all(isinstance(prd, str) for prd in prot_descr): + prot_descr = sep.join(prot_descr) + + if all(isinstance(acc, str) for acc in accessions): + accessions = sep.join(accessions) + + if all(prd is None for prd in prot_descr): + prot_descr = None + if all(acc is None for acc in accessions): + accessions = None + + info.update((k, v) for k, v in evref[0].items() if isinstance(v, (str, int, float, list))) + info['protein description'] = prot_descr + info['accession'] = accessions + info['isDecoy'] = isd + info['start'] = starts + info['end'] = ends + info['length'] = lengths + data.append(info) + df = pd.DataFrame(data) + return df + + +def filter_df(*args, **kwargs): + """Read MzIdentML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. + Positional arguments can be MzIdentML files or DataFrames. + + Requires :py:mod:`pandas`. + + .. warning :: Only the first 'SpectrumIdentificationItem' element is considered in every + 'SpectrumIdentificationResult'. + + Parameters + ---------- + key : str / iterable / callable, keyword only, optional + Default is 'mascot:expectation value'. + is_decoy : str / iterable / callable, keyword only, optional + Default is 'isDecoy'. + *args + Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. + **kwargs + Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. + + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + kwargs.setdefault('key', 'mascot:expectation value') + kwargs.setdefault('is_decoy', 'isDecoy') + if all(isinstance(arg, pd.DataFrame) for arg in args): + df = pd.concat(args) + else: + df = DataFrame(*args, **kwargs) + return aux.filter(df, **kwargs) + + +fdr = aux._make_fdr(is_decoy, None) +_key = lambda x: min( + sii['mascot:expectation value'] for sii in x['SpectrumIdentificationItem']) +qvalues = aux._make_qvalues(chain, is_decoy, None, _key) +filter = aux._make_filter(chain, is_decoy, None, _key, qvalues) +filter.chain = aux._make_chain(filter, 'filter', True) diff --git a/pyteomics/mzml.py b/pyteomics/mzml.py new file mode 100644 index 0000000..11a9613 --- /dev/null +++ b/pyteomics/mzml.py @@ -0,0 +1,546 @@ +""" +mzml - reader for mass spectrometry data in mzML format +======================================================= + +Summary +------- + +mzML is a standard rich XML-format for raw mass spectrometry data storage. +Please refer to `psidev.info <http://www.psidev.info/index.php?q=node/257>`_ +for the detailed specification of the format and structure of mzML files. + +This module provides a minimalistic way to extract information from mzML +files. 
You can use the old functional interface (:py:func:`read`) or the new +object-oriented interface (:py:class:`MzML` or :py:class:`PreIndexedMzML`) +to iterate over entries in ``<spectrum>`` elements. +:py:class:`MzML` and :py:class:`PreIndexedMzML` also support direct indexing +with spectrum IDs. + +Data access +----------- + + :py:class:`MzML` - a class representing a single mzML file. + Other data access functions use this class internally. + + :py:class:`PreIndexedMzML` - a class representing a single mzML file. + Uses byte offsets listed at the end of the file for quick access to spectrum elements. + + :py:func:`read` - iterate through spectra in mzML file. Data from a + single spectrum are converted to a human-readable dict. Spectra themselves are + stored under 'm/z array' and 'intensity array' keys. + + :py:func:`chain` - read multiple mzML files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + +Controlled Vocabularies +~~~~~~~~~~~~~~~~~~~~~~~ +mzML relies on controlled vocabularies to describe its contents extensibly. See +`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_ +for more details on how they are used. + +Handling Time Units and Other Qualified Quantities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +mzML contains information which may be described as using a variety of different time units. +See `Unit Handling <../data.html#unit-handling>`_ for more information. + +Deprecated functions +-------------------- + + :py:func:`version_info` - get version information about the mzML file. + You can just read the corresponding attribute of the :py:class:`MzML` object. + + :py:func:`iterfind` - iterate over elements in an mzML file. + You can just call the corresponding method of the :py:class:`MzML` object. + +Dependencies +------------ + +This module requires :py:mod:`lxml` and :py:mod:`numpy`. + +------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import warnings +import numpy as np +from . 
import xml, auxiliary as aux, _schema_defaults +from .xml import etree + +NON_STANDARD_DATA_ARRAY = 'non-standard data array' + +STANDARD_ARRAYS = set([ + 'm/z array', + 'intensity array', + 'charge array', + 'signal to noise array', + 'time array', + 'wavelength array', + 'flow rate array', + 'pressure array', + 'temperature array', + 'mean charge array', + 'resolution array', + 'baseline array', + 'noise array', + 'sampled noise m/z array', + 'sampled noise intensity array', + 'sampled noise baseline array', + 'ion mobility array', + 'deconvoluted ion mobility drift time array', + 'deconvoluted inverse reduced ion mobility array', + 'deconvoluted ion mobility array', + 'raw ion mobility drift time array', + 'raw inverse reduced ion mobility array', + 'raw ion mobility array', + 'mean inverse reduced ion mobility array', + 'mean ion mobility array', + 'mean ion mobility drift time array', + 'mass array', + 'scanning quadrupole position lower bound m/z array', + 'scanning quadrupole position upper bound m/z array', +]) + + +class MzML(aux.BinaryArrayConversionMixin, aux.TimeOrderedIndexedReaderMixin, xml.MultiProcessingXML, xml.IndexSavingXML): + """Parser class for mzML files.""" + file_format = 'mzML' + _root_element = 'mzML' + _default_schema = _schema_defaults._mzml_schema_defaults + _default_version = '1.1.0' + _default_iter_tag = 'spectrum' + _structures_to_flatten = {'binaryDataArrayList', 'referenceableParamGroupRef'} + _indexed_tags = {'spectrum', 'chromatogram'} + + def __init__(self, *args, **kwargs): + self.decode_binary = kwargs.pop('decode_binary', True) + self._referenceable_param_groups = {} + super(MzML, self).__init__(*args, **kwargs) + + def __getstate__(self): + state = super(MzML, self).__getstate__() + state['decode_binary'] = self.decode_binary + return state + + def __setstate__(self, state): + super(MzML, self).__setstate__(state) + self.decode_binary = state['decode_binary'] + + def _handle_referenceable_param_group(self, param_group_ref, **kwargs): + ref_name = param_group_ref.attrib['ref'] + if ref_name not in self._referenceable_param_groups: + params = self._referenceable_param_groups[ref_name] = self._retrieve_param_group(ref_name) + return params + return self._referenceable_param_groups[ref_name] + + @xml._keepstate + def _retrieve_param_group(self, ref_name): + group = self.get_by_id(ref_name) + group.pop("id", None) + return [xml._XMLParam(k, v, None) for k, v in group.items()] + + def _detect_array_name(self, info): + """Determine what the appropriate name for this + array is by inspecting the available param-based + keys. + + Parameters + ---------- + info : dict + The collapsed binary tag plus + associated *Param data + + Returns + ------- + out : str + The name for this array entry + """ + # If this is a non-standard array, we hope the userParams + # will conform to the same array suffix pattern. + is_non_standard = False + + # Accumulate possible name candidates + candidates = [] + for k in info: + if k.endswith(' array') and not info[k]: + if NON_STANDARD_DATA_ARRAY == k: + is_non_standard = True + else: + candidates.append(k) + # A non-standard data array term key might have the name for the data array + # as the value. 
+ nonstandard_name = info.get(NON_STANDARD_DATA_ARRAY) + if nonstandard_name: + return nonstandard_name + if isinstance(info.get('name'), list): + for val in info['name']: + if val.endswith(' array'): + if NON_STANDARD_DATA_ARRAY == val: + is_non_standard = True + else: + candidates.append(val) + # Name candidate resolution + n_candidates = len(candidates) + # Easy case, exactly one name given + if n_candidates == 1: + return candidates[0] + # We are missing information, but at least + # if we know the array is non-standard we + # can report it as such. Otherwise fall back + # to "binary". This fallback signals special + # behavior elsewhere. + if n_candidates == 0: + invalid = {"encodedLength", "dataProcessingRef", "arrayLength", + "binary"} + for k in info: + if k in invalid: + continue + candidates.append(k) + if len(candidates) == 0: + if is_non_standard: + return NON_STANDARD_DATA_ARRAY + warnings.warn("No options for non-standard data array") + return "binary" + else: + warnings.warn( + "Multiple options for naming binary array after no valid name found: %r" % candidates) + return max(candidates, key=len) + # Multiple choices mean we need to make a decision which could + # mask data from the user. There is no way to make a perfect choice + # here, so we first prefer the standardized arrays before falling + # back to guessing. + else: + candidates = set(candidates) + # Maybe we just have a repeated term? + if len(candidates) == 1: + return next(iter(candidates)) + warnings.warn( + "Multiple options for naming binary array: %r" % candidates) + standard_options = candidates & STANDARD_ARRAYS + if standard_options: + return max(standard_options, key=len) + return max(candidates, key=len) + + def _determine_array_dtype(self, info): + dtype = None + types = {'32-bit float': np.float32, '64-bit float': np.float64, + '32-bit integer': np.int32, '64-bit integer': np.int64, + 'null-terminated ASCII string': np.uint8} + for t, code in types.items(): + if t in info: + dtype = code + del info[t] + break + # sometimes it's under 'name' + else: + if 'name' in info: + for t, code in types.items(): + if t in info['name']: + dtype = code + info['name'].remove(t) + break + return dtype + + def _determine_compression(self, info): + known_compression_types = set(self.compression_type_map) + found_compression_types = known_compression_types & set(info) + if found_compression_types: + found_compression_types = tuple(found_compression_types) + if len(found_compression_types) == 1: + del info[found_compression_types[0]] + return found_compression_types[0] + warnings.warn("Multiple options for binary array compression: %r" % ( + found_compression_types,)) + return found_compression_types[0] + elif "name" in info: + found_compression_types = known_compression_types & set(info['name']) + if found_compression_types: + found_compression_types = tuple(found_compression_types) + if len(found_compression_types) == 1: + info['name'].remove(found_compression_types[0]) # 'name' is a list here + return found_compression_types[0] + else: + warnings.warn("Multiple options for binary array compression: %r" % ( + found_compression_types,)) + return found_compression_types[0] + else: + return 'no compression' + + def _handle_binary(self, info, **kwargs): + """Special handling when processing and flattening + a <binary> tag and its sibling *Param tags.
+ + Parameters + ---------- + info : dict + Unprocessed binary array data and metadata + + Returns + ------- + out : dict + The processed and flattened data array and metadata + """ + dtype = self._determine_array_dtype(info) + compressed = self._determine_compression(info) + name = self._detect_array_name(info) + binary = info.pop('binary') + if not self.decode_binary: + info[name] = self._make_record(binary, compressed, dtype, name) + return info + + if binary: + array = self.decode_data_array(binary, compressed, dtype) + else: + array = np.array([], dtype=dtype) + + if name == 'binary': + info[name] = self._convert_array(None, array) + else: + info = {name: self._convert_array(name, array)} + return info + + def _get_info_smart(self, element, **kw): + name = xml._local_name(element) + kwargs = dict(kw) + rec = kwargs.pop('recursive', None) + if name in {'indexedmzML', 'mzML'}: + info = self._get_info(element, + recursive=(rec if rec is not None else False), + **kwargs) + else: + info = self._get_info(element, + recursive=(rec if rec is not None else True), + **kwargs) + if 'binary' in info and isinstance(info, dict): + info = self._handle_binary(info, **kwargs) + + if 'binaryDataArray' in info and isinstance(info, dict): + for array in info.pop('binaryDataArray'): + info.update(array) + intkeys = {'ms level'} + for k in intkeys: + if k in info: + try: + info[k] = int(info[k]) + except (ValueError, TypeError): + pass + return info + + def _retrieve_refs(self, info, **kwargs): + """Retrieves and embeds the data for each attribute in `info` that + ends in _ref. Removes the id attribute from `info`""" + for k, v in dict(info).items(): + if k == 'ref': + by_id = self.get_by_id(v, retrieve_refs=True) + if by_id is None: + warnings.warn('Ignoring unresolved reference: ' + v) + else: + info.update(by_id) + del info[k] + info.pop('id', None) + + @staticmethod + def _get_time(scan): + return scan['scanList']['scan'][0]['scan start time'] + + +def read(source, read_schema=False, iterative=True, use_index=False, dtype=None, huge_tree=False, decode_binary=True): + """Parse `source` and iterate through spectra. + + Parameters + ---------- + source : str or file + A path to a target mzML file or the file object itself. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the mzML header. Otherwise, use default parameters. + Not recommended without Internet connection or + if you don't like to get the related warnings. + + iterative : bool, optional + Defines whether iterative parsing should be used. It helps reduce + memory usage at almost the same parsing speed. Default is + :py:const:`True`. + + use_index : bool, optional + Defines whether an index of byte offsets needs to be created for + spectrum elements. Default is :py:const:`False`. + + dtype : type or dict, optional + dtype to convert arrays to, one for both m/z and intensity arrays or one for each key. + If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'. + + decode_binary : bool, optional + Defines whether binary data should be decoded and included in the output + (under "m/z array", "intensity array", etc.). + Default is :py:const:`True`. + + huge_tree : bool, optional + This option is passed to the `lxml` parser and defines whether + security checks for XML tree depth and node size should be disabled. + Default is :py:const:`False`. + Enable this option for trusted files to avoid XMLSyntaxError exceptions + (e.g. 
`XMLSyntaxError: xmlSAX2Characters: huge text node`). + + Returns + ------- + out : iterator + An iterator over the dicts with spectrum properties. + """ + + return MzML(source, read_schema=read_schema, iterative=iterative, + use_index=use_index, dtype=dtype, huge_tree=huge_tree, + decode_binary=decode_binary) + +def iterfind(source, path, **kwargs): + """Parse `source` and yield info on elements with specified local + name or by specified "XPath". + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`iterfind` calls on one file, you should + create an :py:class:`MzML` object and use its + :py:meth:`!iterfind` method. + + Parameters + ---------- + source : str or file + File name or file-like object. + + path : str + Element name or XPath-like expression. Only local names separated + with slashes are accepted. An asterisk (`*`) means any element. + You can specify a single condition in the end, such as: + ``"/path/to/element[some_value>1.5]"`` + Note: you can do much more powerful filtering using plain Python. + The path can be absolute or "free". Please don't specify + namespaces. + + recursive : bool, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + iterative : bool, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the mzML header. Otherwise, use default + parameters. Not recommended without Internet connection or + if you don't like to get the related warnings. + + decode_binary : bool, optional + Defines whether binary data should be decoded and included in the output + (under "m/z array", "intensity array", etc.). + Default is :py:const:`True`. + + Returns + ------- + out : iterator + """ + return MzML(source, **kwargs).iterfind(path, **kwargs) + +version_info = xml._make_version_info(MzML) + +# chain = aux._make_chain(read, 'read') + +chain = aux.ChainBase._make_chain(MzML) + + +class PreIndexedMzML(MzML): + """Parser class for mzML files, subclass of :py:class:`MzML`. + Uses byte offsets listed at the end of the file for quick access to spectrum elements. + """ + def _build_index(self): + """ + Build up a `dict` of `dict` of offsets for elements. Calls :meth:`_find_index_list` + and assigns the return value to :attr:`_offset_index` + """ + index = self._find_index_list() + if index: + self._offset_index = index + else: + warnings.warn('Could not extract the embedded offset index. 
Falling back to default indexing procedure.') + super(PreIndexedMzML, self)._build_index() + + @xml._keepstate + def _iterparse_index_list(self, offset): + index_map = xml.HierarchicalOffsetIndex() + index = index_map._inner_type() + self._source.seek(offset) + try: + for event, elem in etree.iterparse(self._source, events=('start', 'end'), remove_comments=True): + if event == 'start': + if elem.tag == 'index': + index = {} + index_map[elem.attrib['name']] = index + else: + if elem.tag == 'offset': + index[elem.attrib['idRef']] = int(elem.text) + elem.clear() + except etree.XMLSyntaxError: + # The iteration has reached the end of the indexList tag and the parser + # encounters the later elements in the document. + pass + return index_map + + @xml._keepstate + def _find_index_list_offset(self): + """ + Search relative to the bottom of the file upwards to find the offsets + of the index lists. + + Returns + ------- + list of int + A list of byte offsets for `<indexList>` elements + """ + self._source.seek(-1024, 2) + text = self._source.read(1024) + index_offsets = list(map(int, re.findall(br'<indexListOffset>(\d+)</indexListOffset>', text))) + return index_offsets + + @xml._keepstate + def _find_index_list(self): + """ + Extract lists of index offsets from the end of the file. + + Returns + ------- + dict of str -> dict of str -> int + """ + offsets = self._find_index_list_offset() + index_list = xml.HierarchicalOffsetIndex() + for offset in offsets: + # Sometimes the offset is at the very beginning of the file, + # due to a bug in an older version of ProteoWizard. If this crude + # check fails, don't bother searching the entire file, and fall back + # on the base class's mechanisms. + # + # Alternative behavior here would be to start searching for the start + # of the index from the bottom of the file, but this version of Proteowizard + # also emits invalid offsets which do not improve retrieval time. + if offset < 1024: + continue + index_list = self._iterparse_index_list(offset) + return index_list diff --git a/pyteomics/mzmlb.py b/pyteomics/mzmlb.py new file mode 100644 index 0000000..5cda3ca --- /dev/null +++ b/pyteomics/mzmlb.py @@ -0,0 +1,618 @@ +# -*- coding: utf8 -*- +""" +mzmlb - reader for mass spectrometry data in mzMLb format +========================================================= + +.. warning:: + This is a **Provisional Implementation**. The mzMLb format has been published + but is not yet broadly available. + +Summary +------- +mzMLb is an HDF5 container format wrapping around the standard rich XML-format +for raw mass spectrometry data storage. Please refer to [1]_ for more information +about mzMLb and its features. Please refer to +`psidev.info <https://www.psidev.info/mzML>`_ for the detailed +specification of the format and structure of mzML files. + +This module provides a minimalistic way to extract information from mzMLb +files. You can use the old functional interface (:py:func:`read`) or the new +object-oriented interface (:py:class:`MzMLb`) to iterate over entries in ``<spectrum>`` elements. +:py:class:`MzMLb` also supports direct indexing with spectrum IDs or indices. + +Data access +----------- + + :py:class:`MzMLb` - a class representing a single mzMLb file. + Other data access functions use this class internally. + + :py:func:`read` - iterate through spectra in mzMLb file. Data from a + single spectrum are converted to a human-readable dict. Spectra themselves are + stored under 'm/z array' and 'intensity array' keys.
+ + :py:func:`chain` - read multiple mzMLb files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + +Controlled Vocabularies +~~~~~~~~~~~~~~~~~~~~~~~ +mzMLb relies on controlled vocabularies to describe its contents extensibly. See +`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_ +for more details on how they are used. + +Handling Time Units and Other Qualified Quantities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +mzMLb contains information which may be described as using a variety of different time units. +See `Unit Handling <../data.html#unit-handling>`_ for more information. + +References +---------- +.. [1] Bhamber, R. S., Jankevics, A., Deutsch, E. W., Jones, A. R., & Dowsey, A. W. (2021). + MzMLb: A Future-Proof Raw Mass Spectrometry Data Format Based on Standards-Compliant + mzML and Optimized for Speed and Storage Requirements. Journal of Proteome Research, + 20(1), 172–183. https://doi.org/10.1021/acs.jproteome.0c00192 +""" + +import io +import warnings +import logging +from collections import namedtuple + +import h5py +try: + logging.getLogger("hdf5plugin").addHandler(logging.NullHandler()) + import hdf5plugin +except ImportError: + hdf5plugin = None + +import numpy as np + +from pyteomics.mzml import MzML as _MzML +from pyteomics.auxiliary.file_helpers import HierarchicalOffsetIndex, TaskMappingMixin, TimeOrderedIndexedReaderMixin, FileReader +from pyteomics import auxiliary as aux, xml + + +def delta_predict(data, copy=True): + '''Reverse the lossy transformation of the delta compression + helper. + + Parameters + ---------- + data : :class:`numpy.ndarray` + The data to transform + copy : bool + Whether to make a copy of the data array or transform it in-place. + + Returns + ------- + :class:`numpy.ndarray` + The transformed data array + ''' + if copy: + out = data.copy() + else: + out = data + for i in range(2, len(data)): + out[i] = out[i] + out[i - 1] - out[0] + return out + + +def linear_predict(data, copy=True): + '''Reverse the lossy transformation of the linear interpolation compression + helper. + + Parameters + ---------- + data : :class:`numpy.ndarray` + The data to transform + copy : bool + Whether to make a copy of the data array or transform it in-place. + + Returns + ------- + :class:`numpy.ndarray` + The transformed data array + ''' + if copy: + out = data.copy() + else: + out = data + for i in range(2, len(data)): + out[i] = out[i] + 2 * out[i - 1] - out[i - 2] - out[1] + return out + + +class HDF5ByteBuffer(io.RawIOBase): + '''Helper class that looks file-like so that we can pass a HDF5 byte dataset to + an arbitrary XML parser. + + Implements :class:`~io.RawIOBase` for reading. 
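+
+    A minimal usage sketch; the file name and the ``'mzML'`` dataset name are
+    illustrative assumptions::
+
+        handle = h5py.File('data.mzmlb', 'r')
+        stream = HDF5ByteBuffer(handle['mzML'])   # wrap the HDF5 byte dataset
+        chunk = stream.read(1024)                 # read like a plain binary file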
+ ''' + def __init__(self, buffer, offset=None): + if offset is None: + offset = 0 + self.buffer = buffer + self.offset = offset + self.size = self.buffer.size + self.mode = 'rb' + + def readable(self): + return True + + def seekable(self): + return True + + def isatty(self): + return False + + def seek(self, offset, whence=0): + if whence == io.SEEK_SET: + self.offset = offset + elif whence == io.SEEK_CUR: + self.offset += offset + elif whence == io.SEEK_END: + # Standard SEEK_END semantics: `offset` is usually negative or zero here. + self.offset = self.size + offset + else: + raise ValueError("Bad whence %r" % whence) + return self.offset + + def tell(self): + return self.offset + + def close(self): + return + + @property + def closed(self): + return False + + def readinto(self, b): + n = len(b) + temp = self._read(n) + m = len(temp) + b[:m] = temp[:] + return m + + def readall(self): + return bytes(self._read(-1)) + + def read(self, n=-1): + return bytes(self._read(n)) + + def write(self, b): + raise ValueError("Read-only stream") + + def _read(self, n=-1): + if n == -1: + n = self.size + 1 + dat = bytearray(np.array(self.buffer[self.offset:self.offset + n])) + # Advance by the number of bytes actually read, which may be fewer + # than `n` near the end of the dataset. + self.offset += len(dat) + return dat + + +class external_array_slice(namedtuple('external_array_slice', + ['array_name', 'offset', 'length', 'source', 'transform', 'key', 'dtype'])): + def decode(self): + """Decode the referenced data slice into a numerical array + + Returns + ------- + np.ndarray + """ + return self.source._decode_record(self) + + +class ExternalDataMzML(_MzML): + '''An MzML parser that reads data arrays from an external provider. + + This is an implementation detail of :class:`MzMLb`. + ''' + def __init__(self, *args, **kwargs): + self._external_data_registry = kwargs.pop("external_data_registry", None) + super(ExternalDataMzML, self).__init__(*args, **kwargs) + + def _make_record(self, array_name, offset, length, transform, name, dtype): + return external_array_slice(array_name, offset, length, self, transform, name, dtype) + + def _transform_array(self, array, transform): + if transform is None: + return array + elif "linear prediction" == transform: + return linear_predict(array, copy=False) + elif "delta prediction" == transform: + return delta_predict(array, copy=False) + else: + raise ValueError("Transformation not recognized") + + def _retrieve_external_array(self, array_name, length, offset): + array = self._external_data_registry.get(array_name, length, offset) + return array + + def decode_data_array(self, array_name, offset, length, transform=None, dtype=np.float64): + array = self._retrieve_external_array(array_name, length, offset) + array = self._transform_array(array, transform) + return array + + def _decode_record(self, record): + array = self.decode_data_array( + record.array_name, record.offset, record.length, record.transform, record.dtype) + return self._finalize_record_conversion(array, record) + + def _handle_binary(self, info, **kwargs): + if not self.decode_binary: + self.decode_binary = True + # Binary decoding works totally differently here, not supporting the previous signatures + # that the parent method will use. Pretend we are decoding because it is a no-op in the + # parent method.
+ result = super(ExternalDataMzML, self)._handle_binary(info, **kwargs) + self.decode_binary = False + else: + result = super(ExternalDataMzML, self)._handle_binary(info, **kwargs) + try: + array_name = info['external HDF5 dataset'] + except KeyError: + array_name = info['external dataset'] + offset = int(info['external offset']) + length = int(info['external array length']) + + transform = None + # The zlib compression in these two terms happens automatically during HDF5 encoding and + # the reader needn't even know about it. Need an example of how Numpress will be signaled. + if "linear prediction" in info or "truncation, linear prediction and zlib compression" in info: + transform = 'linear prediction' + elif "delta prediction" in info or "truncation, delta prediction and zlib compression" in info: + transform = 'delta prediction' + + if not self.decode_binary: + name = self._detect_array_name(info) + result[name] = self._make_record( + array_name, offset, length, transform, name, + self._external_data_registry.dtype_of(array_name)) + return result + + array = self._retrieve_external_array(array_name, length, offset) + + if len(result) == 1: + name = next(iter(result)) + else: + name = self._detect_array_name(info) + result[name] = self._convert_array(name, array) + return result + + def reset(self): + super(ExternalDataMzML, self).reset() + self._external_data_registry.clear() + + +class chunk_interval_cache_record(namedtuple("chunk_interval_cache_record", ("start", "end", "array"))): + def contains(self, start, end): + if self.start <= start: + if end < self.end: + return True + return False + + def get(self, start, end): + return self.array[start - self.start:end - self.start] + + def __eq__(self, other): + return self.start == other.start and self.end == other.end + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self.start) + + +class ExternalArrayRegistry(object): + '''Read chunks out of a single long array + + This is an implementation detail of :class:`MzMLb` + + Attributes + ---------- + registry : Mapping + A mapping from array name to the out-of-core array object. + chunk_size : int + The number of entries to chunk together and keep in memory. + chunk_cache : dict + A mapping from array name to cached array blocks. 
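+
+    A sketch of the intended access pattern (the dataset name below is
+    illustrative only):
+
+    .. code-block:: python
+
+        registry = ExternalArrayRegistry(h5py.File('demo.mzMLb', 'r'))
+        # The first request reads and caches a whole chunk; subsequent
+        # requests for nearby slices of the same array hit the cache.
+        values = registry.get('spectrum_MS_1000514_float64', 128, offset=0)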
+    '''
+    def __init__(self, registry, chunk_size=None):
+        if chunk_size is None:
+            chunk_size = 2 ** 20
+        else:
+            chunk_size = int(chunk_size)
+        self.registry = registry
+        self.chunk_cache = {}
+        self.chunk_size = chunk_size
+
+    def clear(self):
+        self.chunk_cache.clear()
+
+    def _get_raw(self, array_name, start, end):
+        return self.registry[array_name][start:end]
+
+    def _make_cache_record(self, array_name, start, end):
+        return chunk_interval_cache_record(start, end, self._get_raw(array_name, start, end))
+
+    def get(self, array_name, length, offset=0):
+        start = offset
+        end = start + length
+        # Serve the request from the cached chunk when possible; otherwise read
+        # a fresh chunk of at least ``chunk_size`` entries and cache it.
+        cache_record = self.chunk_cache.get(array_name)
+        if cache_record is None or not cache_record.contains(start, end):
+            cache_record = self._make_cache_record(
+                array_name, start, start + max(length, self.chunk_size))
+            self.chunk_cache[array_name] = cache_record
+        return cache_record.get(start, end)
+
+    def dtype_of(self, array_name):
+        return self.registry[array_name].dtype
+
+    def __call__(self, array_name, length, offset=0):
+        return self.get(array_name, length, offset)
+
+
+class MzMLb(TimeOrderedIndexedReaderMixin, TaskMappingMixin):
+    '''A parser for mzMLb [1]_.
+
+    Provides an identical interface to :class:`~pyteomics.mzml.MzML`.
+
+    Attributes
+    ----------
+    path : str, Path-like, or file-like object
+        The mzMLb file path or a file-like object providing it.
+    handle : :class:`h5py.File`
+        The raw HDF5 file container.
+    mzml_parser : :class:`~.ExternalDataMzML`
+        The mzML parser for the XML stream inside the HDF5 file with
+        special behavior for retrieving the out-of-band data arrays
+        from their respective storage locations.
+    schema_version : str
+        The mzMLb HDF5 schema version, distinct from the mzML schema inside it.
+
+    References
+    ----------
+    .. [1] Bhamber, R. S., Jankevics, A., Deutsch, E. W., Jones, A. R., & Dowsey, A. W. (2021).
+        MzMLb: A Future-Proof Raw Mass Spectrometry Data Format Based on Standards-Compliant
+        mzML and Optimized for Speed and Storage Requirements. Journal of Proteome Research,
+        20(1), 172–183. https://doi.org/10.1021/acs.jproteome.0c00192
+    '''
+    _default_iter_tag = ExternalDataMzML._default_iter_tag
+
+    file_format = "mzMLb"
+
+    def __init__(self, path, hdfargs=None, mzmlargs=None, allow_updates=False,
+                 use_index=True, **kwargs):
+        if hdfargs is None:
+            hdfargs = {}
+        if mzmlargs is None:
+            mzmlargs = {}
+        mzmlargs.update(kwargs)
+
+        self.path = path
+        self._hdfargs = hdfargs
+        self._mzmlargs = mzmlargs
+        self._allow_updates = allow_updates
+        self.handle = h5py.File(self.path, 'r+' if self._allow_updates else 'r', **hdfargs)
+        self.schema_version = self.handle['mzML'].attrs.get('version')
+        self._check_compressor()
+
+        self._xml_buffer = io.BufferedReader(HDF5ByteBuffer(self.handle['mzML']))
+        self._array_registry = ExternalArrayRegistry(self.handle)
+        self._make_mzml_parser(mzmlargs)
+
+        super(MzMLb, self).__init__(**kwargs)
+
+    def _check_compressor(self):
+        for key in self.handle.keys():
+            if "spectrum_MS_" in key or "chromatogram_MS_" in key:
+                data = self.handle[key]
+                try:
+                    filts = data._filters
+                except AttributeError:
+                    continue
+                if '32001' in filts:
+                    if hdf5plugin is None:
+                        warnings.warn(
+                            ("Blosc meta-compressor detected, but hdf5plugin is "
+                             "not installed, may not be able to access %r") % (key))
+
+    def _make_mzml_parser(self, kwargs):
+        self._mzml_parser = ExternalDataMzML(
+            self._xml_buffer, external_data_registry=self._array_registry,
+            use_index=False, **kwargs)
+        self._mzml_parser._offset_index = self._build_index()
+        self._mzml_parser._use_index = True
+
+    @property
+    def name(self):
+        if hasattr(self.path, 'name'):
+            return self.path.name
+        return self.path
+
+    def _build_index(self):
+        index = HierarchicalOffsetIndex()
+        for label in [u'spectrum', u'chromatogram']:
+            sub = index[label]
+            ids = bytearray(np.array(self.handle['mzML_{}Index_idRef'.format(label)])).split(b"\x00")
+            offsets = self.handle["mzML_{}Index".format(label)][:-1]
+            for i, o in enumerate(offsets):
+                sub[ids[i].decode('utf8')] = o
+        return index
+
+    def get_by_id(self, id):
+        """Parse the file and return the element with `id` attribute equal
+        to `id`. Returns :py:const:`None` if no such element is found.
+
+        Parameters
+        ----------
+        id : str
+            The value of the `id` attribute to match.
+
+        Returns
+        -------
+        out : :py:class:`dict` or :py:const:`None`
+        """
+        return self._mzml_parser.get_by_id(id)
+
+    def get_by_ids(self, ids):
+        return self._mzml_parser.get_by_ids(ids)
+
+    def get_by_index(self, i):
+        return self._mzml_parser.get_by_index(i)
+
+    def get_by_indexes(self, indexes):
+        return self._mzml_parser.get_by_indexes(indexes)
+
+    def get_by_index_slice(self, s):
+        return self._mzml_parser.get_by_index_slice(s)
+
+    def get_by_key_slice(self, s):
+        return self._mzml_parser.get_by_key_slice(s)
+
+    def __contains__(self, key):
+        return key in self.index
+
+    def __getitem__(self, i):
+        return self._mzml_parser[i]
+
+    def __len__(self):
+        return len(self._mzml_parser)
+
+    def __iter__(self):
+        return iter(self._mzml_parser)
+
+    def __next__(self):
+        return next(self._mzml_parser)
+
+    def next(self):
+        return self.__next__()
+
+    def __reduce__(self):
+        return self.__class__, (self.path, self._hdfargs, self._mzmlargs, self._allow_updates)
+
+    def close(self):
+        self.handle.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+
+    def iterfind(self, *args, **kwargs):
+        iterf = self._mzml_parser.iterfind(*args, **kwargs)
+        iterf.parser = self
+        return iterf
+
+    def _iterfind_impl(self, path, *args, **kwargs):
+        return self._mzml_parser._iterfind_impl(path, *args, **kwargs)
+
+    @property
+    def index(self):
+        return self._mzml_parser.index
+
+    @property
+    def _offset_index(self):
+        return self._mzml_parser._offset_index
+
+    @property
+    def default_index(self):
+        return self._mzml_parser.default_index
+
+    def _get_time(self, scan):
+        return self._mzml_parser._get_time(scan)
+
+    @property
+    def mzml_parser(self):
+        return self._mzml_parser
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iterable` to use when dealing work items onto
+        the input IPC queue used by :meth:`map`.
+
+        Returns
+        -------
+        :class:`Iterable`
+        """
+        return iter(self.index[self._default_iter_tag])
+
+    def read(self, n=-1):
+        return self._mzml_parser.read(n)
+
+    def reset(self):
+        self._mzml_parser.reset()
+
+    def seek(self, offset, whence=0):
+        self._mzml_parser.seek(offset, whence)
+
+    def tell(self):
+        return self._mzml_parser.tell()
+
+    def get_dataset(self, name):
+        '''Get an HDF5 dataset by its name or path relative to
+        the root node.
+
+        .. warning::
+            Because this accesses HDF5 data directly, it may be possible to mutate
+            the underlying file if :attr:`allow_updates` is :const:`True`.
+
+        Parameters
+        ----------
+        name : :class:`str`
+            The dataset name or path.
+
+        Returns
+        -------
+        :class:`h5py.Dataset` or :class:`h5py.Group`
+
+        Raises
+        ------
+        KeyError :
+            The name is not found.
+        '''
+        return self.handle[name]
+
+
+def read(source, dtype=None):
+    """Parse `source` and iterate through spectra.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzMLb file or the file object itself.
+    dtype : type or dict, optional
+        dtype to convert arrays to, one for both m/z and intensity arrays or one for each key.
+        If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over the dicts with spectrum properties.
+    """
+    reader = MzMLb(source, dtype=dtype)
+    return reader
+
+
+# The MzMLb class is detached from the normal :class:`FileReader`-based inheritance tree;
+# this grafts it back on for :func:`isinstance` and :func:`issubclass` tests at least.
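+# A quick sketch of what this registration buys (the file name is hypothetical):
+#
+#     reader = MzMLb('demo.mzMLb')
+#     isinstance(reader, FileReader)  # True, despite no direct inheritance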
+FileReader.register(MzMLb) + + +version_info = xml._make_version_info(MzMLb) + +# chain = aux._make_chain(read, 'read') + +chain = aux.ChainBase._make_chain(MzMLb) diff --git a/pyteomics/mztab.py b/pyteomics/mztab.py new file mode 100644 index 0000000..148d4cf --- /dev/null +++ b/pyteomics/mztab.py @@ -0,0 +1,783 @@ +""" +mztab - mzTab file reader +========================= + +Summary +------- + +`mzTab <https://github.com/HUPO-PSI/mzTab>`_ is one of the standards +developed by the Proteomics Informatics working group of the HUPO Proteomics +Standard Initiative. + +This module provides a way to read mzTab files into a collection of +:py:class:`pandas.DataFrame` instances in memory, along with a mapping +of the file-level metadata. MzTab specifications 1.0 and 2.0 are supported. + +Data access +----------- + + :py:class:`MzTab` - a class representing a single mzTab file. + +Helpers +------- + + :py:class:`Group` - a collection of metadata relating to one entity. + + +Internals +--------- + + :py:class:`_MzTabTable` - a single table in an mzTab file. + + +Property Management +~~~~~~~~~~~~~~~~~~~ + +:mod:`mztab` uses metaprogramming to generate its metadata accessors, generated by +these classes working in concert. + + :py:class:`MetadataBackedProperty` + + :py:class:`MetadataBackedCollection` + + :py:class:`MetadataPropertyAnnotator` + +------------------------------------------------------------------------------- +""" + +import re +import warnings + +try: + import pandas as pd +except ImportError: + pd = None + + +from collections import OrderedDict + +from pyteomics.auxiliary import _file_obj +from pyteomics.auxiliary import cvstr +from pyteomics.auxiliary.utils import add_metaclass + + +def _require_pandas(): + if pd is None: + raise ImportError( + "To load an mzTab file into pandas.DataFrame objects, you must install pandas!") + + +class MetadataBackedProperty(object): + '''Our descriptor type which uses the instance's metadata attribute to carry its values''' + + def __init__(self, name, variant_required=None): + if variant_required is None: + variant_required = () + self.name = name + self.variant_required = variant_required + self.__doc__ = self.build_docstring() + + def __repr__(self): + return "{self.__class__.__name__}(name={self.name!r}, variant_required={self.variant_required})".format(self=self) + + def __get__(self, obj, objtype=None): + if obj is None and objtype is not None: + # So the property can be seen for what it is + return self + value = obj.metadata.get(self.name) + if value is None and self.variant_required and obj.variant in self.variant_required: + raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format( + self.name, obj.variant)) + return value + + def __set__(self, obj, value): + obj.metadata[self.name] = value + + def __delete__(self, obj): + del obj.metadata[self.name] + + def build_docstring(self): + doc = '''Accesses the {self.name!r} key in the :attr:`metadata` mapping attached +to this object. +''' + if self.variant_required: + if len(self.variant_required) > 1: + plural = 's' + else: + plural = '' + requires = ' or '.join(['-%s' % v for v in self.variant_required]) + doc += ''' +This key must be present when the file is of {requires} variant{plural}. 
+ '''.format(requires=requires, plural=plural) + doc += ''' +Returns +------- +object + ''' + doc = doc.format(self=self) + return doc + + +class MetadataBackedCollection(object): + def __init__(self, name, variant_required=None): + if variant_required is None: + variant_required = () + self.name = name + self.variant_required = variant_required + self.__doc__ = self.build_docstring() + + def __get__(self, obj, objtype=None): + if obj is None and objtype is not None: + # So the property can be seen for what it is + return self + groups = obj.gather(obj.metadata) + value = groups.get(self.name) + if value is None and self.variant_required and obj.variant in self.variant_required: + raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format( + self.name, obj.variant)) + return value + + def build_docstring(self): + doc = '''Accesses the {self.name!r} key group gathered in the :attr:`metadata` mapping attached +to this object. + +This group is dynamically generated on each access and may be expensive for repeated use. +''' + if self.variant_required: + if len(self.variant_required) > 1: + plural = 's' + else: + plural = '' + requires = ' or '.join(['-%s' % v for v in self.variant_required]) + doc += ''' +This key must be present when the file is of {requires} variant{plural}. + '''.format(requires=requires, plural=plural) + doc += ''' +Returns +------- +:class:`~.Group` + ''' + doc = doc.format(self=self) + return doc + + +class MetadataPropertyAnnotator(type): + '''A simple metaclass to do some class-creation time introspection + and descriptor binding. + + Uses a list of strings or 3-tuples from :attr:`__metadata_properties__` to + bind :class:`MetadataBackedProperty` or :class:`MetadataBackedCollection` + onto the class during its creation. + + The specification for a property is a tuple of three values: + 1. The metadata key to fetch + 2. The property name to expose on the object + 3. The variant(s) which require this metadata key be present + + :obj:`("mzTab-version", "version", ("M", "P"))` would be interpreted as + Expose a property "version" on instances which serves the key "mzTab-version" + from the instance's :attr:`metadata`, and raise an error if it is absent in + the "M" or "P" variants. + + Alternatively a specification may be a single string which will be interpreted + as the metadata key, and used to generate the property name replacing all '-' + with '_' and assumed to be optional in all variants. + + If a metadata key ends with "[]" the property is assumed to be a collection. mzTab + makes heavy use of "<collection_name>[<index>]..." keys to define groups of homogenous + object types, often with per-element attributes. + + .. code-block:: + + variable_mod[1] CHEMMOD:15.9949146221 + variable_mod[1]-site M + variable_mod[1]-position Anywhere + variable_mod[2] CHEMMOD:42.0105646863 + variable_mod[2]-site N-term + variable_mod[2]-position Protein N-term + + A specification :obj:`("variable_mod[]", "variable_mods", ())` would create a property + that returns: + + .. code-block:: python + + >>>instance.variable_mods + Group([(1, + {'name': 'CHEMMOD:15.9949146221', + 'position': 'Anywhere', + 'site': 'M'}), + (2, + {'name': 'CHEMMOD:42.0105646863', + 'position': 'Protein N-term', + 'site': 'N-term'})]) + + For precise description of the property collection algorithm, see + :meth:`~_MzTabParserBase.collapse_properties` and + :meth:`~_MzTabParserBase.gather`. 
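+
+    A hedged sketch of how a subclass might declare its own properties
+    (the class name and the choice of keys are illustrative only):
+
+    .. code-block:: python
+
+        @add_metaclass(MetadataPropertyAnnotator)
+        class MinimalMzTab(_MzTabParserBase):
+            __metadata_properties__ = [
+                ('mzTab-version', 'version', ()),
+                'title',
+                ('ms_run[]', 'ms_runs', ()),
+            ]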
+ + If any base classes have a :attr:`__metadata_properties__` attribute, it will + also be included unless :attr:`__inherit_metadata_properties__` is set to + :const:`False`. Any names explicitly set by the current class override this + automatic property generation. + ''' + def __new__(mcls, name, bases, attrs): + props = attrs.get('__metadata_properties__', []) + inherit_props = attrs.get("__inherit_metadata_properties__", True) + # Gather from parent classes so we can use inheritance for overriding this + # behavior too. + if inherit_props: + for base in bases: + props.extend(getattr(base, '__metadata_properties__', [])) + + keys = set(attrs) + + # Iterate in reverse to ensure that classes nearer to the new classes override + # more basal classes, ending with the new class to make sure overrides are + # applied. + for prop in reversed(props): + # If the property definition is a single string, interpret the specification + # as the property name, and apply some simple normalization to make it a valid + # Python attribute name and assume the property is always optional. + if isinstance(prop, str): + prop_name = prop + attr_name = prop_name.replace("mzTab-", '').replace('-', '_') + variant_required = None + else: + # Otherwise unpack the triple + prop_name, attr_name, variant_required = prop + # Attach the new descriptor to the class definition to be created. These descriptors + # will then be used when instances of that class try to get/set those attribute names. + if attr_name in keys: + continue + if prop_name.endswith('[]'): + # If the property name ends with "[]", then we're dealing with a collection so + # use the :class:`MetadataBackedCollection` descriptor + attrs[attr_name] = MetadataBackedCollection( + prop_name[:-2], variant_required=variant_required) + else: + # Otherwise it is a scalar-valued property, using the :class:`MetadataBackedProperty` + # descriptor + prop = attrs[attr_name] = MetadataBackedProperty( + prop_name, variant_required=variant_required) + + return super(MetadataPropertyAnnotator, mcls).__new__(mcls, name, bases, attrs) + + +class _MzTabParserBase(object): + def _parse_param(self, tuplet): + """Parse a controlled vocabulary or user specified parameter tuplet + into a Python object + + Parameters + ---------- + tuplet : str + A square brace enclosed tuplet of values describing the parameter + + Returns + ------- + tuple + The reduced representation of the parameter + """ + cv, acc, name, value = re.split(r"\s*,\s*", tuplet[1:-1]) + param_name = cvstr(name, acc) + if value: + return (param_name, value) + else: + return (param_name) + + def collapse_properties(self, proplist): + '''Collapse a flat property list into a hierchical structure. + + This is intended to operate on :py:class:`Mapping` objects, including + :class:`dict`, :class:`pandas.Series` and :class:`pandas.DataFrame`. + + .. code-block:: python + + { + "ms_run[1]-format": "Andromeda:apl file format", + "ms_run[1]-location": "file://...", + "ms_run[1]-id_format": "scan number only nativeID format" + } + + to + + .. 
code-block:: python + + { + "ms_run": [ + { + "format": "Andromeda:apl file format", + "location": "file://...", + "id_format": "scan number only nativeID format" + } + ] + } + + Parameters + ---------- + proplist: :class:`Mapping` + Key-Value pairs to collapse + + Returns + ------- + :class:`OrderedDict`: + The collapsed property list + ''' + entities = OrderedDict() + rest = {} + for key, value in proplist.items(): + try: + entity, prop_name = key.rsplit("-", 1) + except ValueError: + rest[key] = value + continue + try: + entity_dict = entities[entity] + except KeyError: + entity_dict = entities[entity] = {} + entity_dict[prop_name] = value + for key, value in proplist.items(): + if key in entities: + entity = entities[key] + if 'name' not in entity: + entity['name'] = value + for key, value in rest.items(): + if key in entities: + entities[key]['name'] = value + else: + entities[key] = value + return entities + + def _collapse_collections(self, entities): + gathered = Group() + for key, props in entities.items(): + if '[' in key: + k, ix = key.split('[', 1) + if '[' in ix: + # If we have multiple [ in a key, we are dealing with a path + path = extract_path(key) + for k, ix in path[:-1]: + store = gathered[k] + store = store[int(ix)] + k, ix = path[-1] + store[k][int(ix)] = props + + else: + ix = int(ix[:-1]) + gathered[k][ix] = props + else: + gathered[key] = props + return gathered + + def _cast_value(self, value): + """Convert a cell value to the appropriate Python type + + Parameters + ---------- + value : str + The cell value as text + + Returns + ------- + object + The most specialized type recognized + """ + if value == 'null': + return None + # is it a parameter? + if value.startswith("["): + try: + if "|" in value: + return [self._cast_value(v) for v in value.split("|")] + else: + return self._parse_param(value) + except ValueError: + return value + else: + # begin guessing dtype + try: + value = int(value) + except ValueError: + try: + value = float(value) + except ValueError: + pass + return value + + def gather(self, mapping): + '''Collapse property lists using :meth:`collapse_properties` + and then gather collections of entites into lists. 
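+
+        A small illustration (the key is taken from the examples above;
+        the output is shown approximately):
+
+        .. code-block:: python
+
+            self.gather({'ms_run[1]-location': 'file:///data/run1.mzML'})
+            # Group([('ms_run', Group([(1, {'location': 'file:///data/run1.mzML'})]))])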
+ + Parameters + ---------- + mapping : dict + The flattened hierarchy of properties to re-construct + + Returns + ------- + Group : + A :class:`Group` of all entities and collections of entities + ''' + return self._collapse_collections(self.collapse_properties(mapping)) + + +class _MzTabTable(_MzTabParserBase): + + """An internal class for accumulating information about an single table + represented in an mzTab file + + Attributes + ---------- + header : list + The column names for the table + name : str + The table's name, human readable + rows : list + An accumulator of table rows + """ + + def __init__(self, name, header=None, rows=None): + if rows is None: + rows = [] + self.name = name + self.header = header + self.rows = rows + + def __repr__(self): + n_cols = len(self.header) if self.header is not None else 0 + n_rows = len(self.rows) + template = "<_MzTabTable {name} with {n_cols} columns and {n_rows} rows>" + return template.format(n_cols=n_cols, n_rows=n_rows, name=self.name) + + def add(self, row): + self.rows.append([self._cast_value(v) for v in row]) + + def __len__(self): + return len(self.rows) + + def __getitem__(self, i): + if isinstance(i, int): + return self.gather({h: r for h, r in zip(self.header, self.rows[i])}) + elif isinstance(i, slice): + out = [] + for i in range(i.start or 0, i.stop or len(self), i.step or 1): + out.append(self[i]) + return out + raise TypeError("Cannot access table with object of type %r" % type(i)) + + def as_dict(self): + return {"rows": [dict(zip(self.header, row)) for row in self.rows], + "name": self.name} + + def as_df(self, index=None): + """Convert the table to a DataFrame in memory. + + Returns + ------- + pd.DataFrame + """ + _require_pandas() + table = pd.DataFrame(data=self.rows, columns=self.header) + if index is not None and len(table.index) > 0: + table = table.set_index(index, drop=False) + table.name = self.name + return table + + def clear(self): + self.header = None + self.rows = [] + + +DATA_FRAME_FORMAT = 'df' +DICT_FORMAT = 'dict' +RAW_FORMAT = 'raw' + +PATH_PARSER = re.compile(r"([^\[]+)\[(\d+)\]_?") + + +def extract_path(path): + '''Parse `key[index]_next_key[next_index]...` sequences into + lists of (key, index) pairs. + + Parameters + ---------- + path : str + The path key to parse + + Returns + ------- + list + ''' + return [(t, int(i)) for t, i in PATH_PARSER.findall(path)] + + +class Group(OrderedDict): + '''A type for holding collections of arbitrarily nested keys from rows + and metadata mappings. + + Implemented as an autovivifying :class:`OrderedDict` variant. As such implements + the :class:`~collections.abc.Mapping` interface. + ''' + + def get_path(self, path, default=None): + '''As :meth:`get` but over a path key parsed with :func:`extract_path`. + + Parameters + ---------- + path : str + The path to search down + default : object, optional + The return value when the path is missing + + Returns + ------- + object + ''' + tokens = extract_path(path) + if not tokens: + return self.get(path, default) + layer = self + for k, i in tokens[:-1]: + i = int(i) + layer = layer.get(k) + if layer is None: + return None + layer = layer.get(i) + if layer is None: + return None + k, i = tokens[-1] + i = int(i) + layer = layer.get(k) + if layer is None: + return default + value = layer.get(i, default) + return value + + def __missing__(self, key): + value = self.__class__() + self[key] = value + return value + + +@add_metaclass(MetadataPropertyAnnotator) +class MzTab(_MzTabParserBase): + """Parser for mzTab format files. 
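+
+    A brief usage sketch (``example.mzTab`` is a hypothetical file name):
+
+    .. code-block:: python
+
+        tables = MzTab('example.mzTab')
+        tables.version                # e.g. '1.0.0'
+        psms = tables.spectrum_match_table  # a pandas.DataFrame by default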
+ + Attributes + ---------- + comments : list + A list of comments across the file + file : _file_obj + A file stream wrapper for the file to be read + metadata : OrderedDict + A mapping of metadata that was entities. + peptide_table : _MzTabTable or pd.DataFrame + The table of peptides. Not commonly used. + protein_table : _MzTabTable or pd.DataFrame + The table of protein identifications. + small_molecule_table : _MzTabTable or pd.DataFrame + The table of small molecule identifications. + spectrum_match_table : _MzTabTable or pd.DataFrame + The table of spectrum-to-peptide match identifications. + table_format: 'df', 'dict', or callable + The structure type to replace each table with. The string + 'df' will use pd.DataFrame instances. 'dict' will create + a dictionary of dictionaries for each table. A callable + will be called on each raw _MzTabTable object + + Additional components of :attr:`metadata` are exposed as properties, returning + single values or aggregated collections of objects. + """ + + __metadata_properties__ = [ + ('mzTab-version', 'version', ()), + ('mzTab-mode', 'mode', 'P'), + ('mzTab-type', 'type', 'P'), + ('mzTab-ID', 'id', 'M'), + 'title', + 'description', + ('ms_run[]', 'ms_runs', 'MP'), + ('instrument[]', 'instruments', ()), + ('software[]', 'software', ()), + ('publication[]', 'publications', ()), + ('contact[]', 'contacts', ()), + ('uri[]', 'uris', ()), + ('external_study_uri[]', 'external_study_uris', ()), + ('quantification_method', 'quantification_method', 'M'), + ('sample[]', 'samples', ()), + ('assay[]', 'assays', ()), + ('study_variable[]', 'study_variables', 'M'), + ('custom[]', 'custom', ()), + ('cv[]', 'cvs', 'M'), + ('database[]', 'databases', 'M'), + + ('psm_search_engine_score[]', 'psm_search_engine_scores', ()), + ('protein_search_engine_score[]', 'protein_search_engine_scores', ()), + ('fixed_mod[]', 'fixed_mods', 'P'), + ('variable_mod[]', 'variable_mods', 'P'), + 'colunit_protein', + 'colunit_peptide', + 'colunit_psm', + 'colunit_small_molecule', + 'false_discovery_rate', + + ('derivatization_agent[]', 'derivatization_agents', ()), + ('small_molecule-quantification_unit', + 'small_molecule_quantification_unit', 'M'), + ('small_molecule_feature-quantification_unit', 'small_molecule_feature_quantification_unit', 'M'), + ('small_molecule-identification_reliability', + 'small_molecule_identification_reliability', ()), + ('id_confidence_measure[]', 'id_confidence_measures', 'M'), + ('colunit-small_molecule', 'colunit_small_molecule', ()), + ('colunit-small_molecule_feature', 'colunit_small_molecule_feature', ()), + ('colunit-small_molecule_evidence', 'colunit_small_molecule_evidence', ()), + + ('sample_processing[]', 'sample_processing', ()) + ] + + def __init__(self, path, encoding='utf8', table_format=DATA_FRAME_FORMAT): + if table_format == DATA_FRAME_FORMAT: + _require_pandas() + # Must be defined in order for metadata properties to work + self.variant = None + self.file = _file_obj(path, mode='r', encoding=encoding) + self.metadata = OrderedDict() + self.comments = [] + self._table_format = table_format + self._init_tables() + self._parse() + self._determine_schema_version() + self._transform_tables() + + @property + def table_format(self): + return self._table_format + + def __getitem__(self, key): + key = key.lower().strip() + if key in ('psm', ): + return self.spectrum_match_table + if key in ('pep', ): + return self.peptide_table + if key in ('prt', ): + return self.protein_table + if key in ('sml', ): + return self.small_molecule_table + if 
key in ('smf', ): + return self.small_molecule_feature_table + if key in ('sme', ): + return self.small_molecule_evidence_table + else: + raise KeyError(key) + + def __iter__(self): + if self.variant == "P": + yield 'PRT', self.protein_table + yield 'PEP', self.peptide_table + yield 'PSM', self.spectrum_match_table + yield 'SML', self.small_molecule_table + elif self.variant == "M": + yield 'SML', self.small_molecule_table + yield 'SMF', self.small_molecule_feature_table + yield 'SME', self.small_molecule_evidence_table + + def _init_tables(self): + self.protein_table = _MzTabTable("protein") + self.peptide_table = _MzTabTable("peptide") + self.spectrum_match_table = _MzTabTable('psm') + self.small_molecule_table = _MzTabTable('small molecule') + self.small_molecule_feature_table = _MzTabTable('small molecule feature') + self.small_molecule_evidence_table = _MzTabTable('small molecule evidence') + + def _transform_tables(self): + if self._table_format == DATA_FRAME_FORMAT: + self.protein_table = self.protein_table.as_df('accession') + self.peptide_table = self.peptide_table.as_df() + self.spectrum_match_table = self.spectrum_match_table.as_df('PSM_ID') + self.small_molecule_table = self.small_molecule_table.as_df() + self.small_molecule_feature_table = self.small_molecule_feature_table.as_df() + self.small_molecule_evidence_table = self.small_molecule_evidence_table.as_df() + elif self._table_format in (DICT_FORMAT, dict): + self.protein_table = self.protein_table.as_dict() + self.peptide_table = self.peptide_table.as_dict() + self.spectrum_match_table = self.spectrum_match_table.as_dict() + self.small_molecule_table = self.small_molecule_table.as_dict() + self.small_molecule_feature_table = self.small_molecule_feature_table.as_dict() + self.small_molecule_evidence_table = self.small_molecule_evidence_table.as_dict() + elif callable(self._table_format): + self.protein_table = self._table_format(self.protein_table) + self.peptide_table = self._table_format(self.peptide_table) + self.spectrum_match_table = self._table_format(self.spectrum_match_table) + self.small_molecule_table = self._table_format(self.small_molecule_table) + self.small_molecule_feature_table = self._table_format(self.small_molecule_feature_table) + self.small_molecule_evidence_table = self._table_format(self.small_molecule_evidence_table) + + def _parse(self): + for i, line in enumerate(self.file): + line = line.strip() + tokens = line.split("\t") + if not tokens: + continue + if tokens[0] == ("MTD"): + name = tokens[1] + value = self._cast_value(tokens[2]) + self.metadata[name] = value + elif tokens[0] == 'COM': + self.comments.append(self._cast_value(tokens[1])) + # headers + elif tokens[0] == "PRH": + self.protein_table.header = tokens[1:] + elif tokens[0] == "PEH": + self.peptide_table.header = tokens[1:] + elif tokens[0] == "PSH": + self.spectrum_match_table.header = tokens[1:] + elif tokens[0] == "SMH": + self.small_molecule_table.header = tokens[1:] + elif tokens[0] == "SFH": + self.small_molecule_feature_table.header = tokens[1:] + elif tokens[0] == "SEH": + self.small_molecule_evidence_table.header = tokens[1:] + # rows + elif tokens[0] == "PRT": + self.protein_table.add(tokens[1:]) + elif tokens[0] == "PEP": + self.peptide_table.add(tokens[1:]) + elif tokens[0] == "PSM": + self.spectrum_match_table.add(tokens[1:]) + elif tokens[0] == "SML": + self.small_molecule_table.add(tokens[1:]) + elif tokens[0] == "SMF": + self.small_molecule_feature_table.add(tokens[1:]) + elif tokens[0] == "SME": + 
self.small_molecule_evidence_table.add(tokens[1:]) + + def _determine_schema_version(self): + if self.version is not None: + version = str(self.version) + else: + warnings.warn("The mzTab-version metadata header was missing. Assuming the schema version is 1.0.0") + version = "1.0.0" + self.version = version + match = re.search(r"(?P<schema_version>\d+(?:\.\d+(?:\.\d+)?)?)(?:-(?P<schema_variant>[MP]))?", version) + if match is None: + warnings.warn("mzTab-version does not match the expected pattern: %r" % version) + version_parsed = '1.0.0' + variant = 'P' + else: + version_parsed, variant = match.groups() + if variant is None: + variant = "P" + self.num_version = [int(v) for v in version_parsed.split(".")] + # Ensure self.num_version is 3-tuple + while len(self.num_version) < 3: + self.num_version.append(0) + self.variant = variant + + def keys(self): + return OrderedDict(list(self)).keys() + + def values(self): + return OrderedDict(list(self)).values() + + def items(self): + return OrderedDict(list(self)).items() diff --git a/pyteomics/mzxml.py b/pyteomics/mzxml.py new file mode 100644 index 0000000..28c67e3 --- /dev/null +++ b/pyteomics/mzxml.py @@ -0,0 +1,328 @@ +""" +mzxml - reader for mass spectrometry data in mzXML format +========================================================= + +Summary +------- + +**mzXML** is a (formerly) standard XML-format for raw mass spectrometry data storage, +intended to be replaced with **mzML**. + +This module provides a minimalistic way to extract information from mzXML +files. You can use the old functional interface (:py:func:`read`) or the new +object-oriented interface (:py:class:`MzXML`) +to iterate over entries in ``<scan>`` elements. +:py:class:`MzXML` also supports direct indexing with scan IDs. + +Data access +----------- + + :py:class:`MzXML` - a class representing a single mzXML file. + Other data access functions use this class internally. + + :py:func:`read` - iterate through spectra in mzXML file. Data from a + single scan are converted to a human-readable dict. Spectra themselves are + stored under 'm/z array' and 'intensity array' keys. + + :py:func:`chain` - read multiple mzXML files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + +Deprecated functions +-------------------- + + :py:func:`version_info` - get version information about the mzXML file. + You can just read the corresponding attribute of the :py:class:`MzXML` object. + + :py:func:`iterfind` - iterate over elements in an mzXML file. + You can just call the corresponding method of the :py:class:`MzXML` object. + +Dependencies +------------ + +This module requires :py:mod:`lxml` and :py:mod:`numpy`. + +------------------------------------------------------------------------------- +""" + +# Copyright 2016 Joshua Klein, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import heapq + +from . 
import xml, auxiliary as aux, _schema_defaults +import numpy as np + + +def _decode_peaks(info, peaks_data): + """Decode the interleaved base 64 encoded, potentially + compressed, raw data points. + + Parameters + ---------- + info : dict + The current context + peaks_data : str + The textually encoded peak data + + Returns + ------- + tuple of np.array + A pair of NumPy arrays containing + m/z and intensity values. + """ + compressed = (info.get('compressionType') == 'zlib') + dt = np.float32 if info['precision'] == '32' else np.float64 + dtype = np.dtype([('m/z array', dt), ('intensity array', dt)]).newbyteorder('>') + data = aux._decode_base64_data_array(peaks_data, dtype, compressed) + return data + + +class IteratorQueue(object): + def __init__(self, iterator): + q = list() + heapq.heapify(q) + self.queue = q + self.iterator = iterator + self.last_index = -1 + self.producer = self.consume(iterator) + + def insert_item(self, scan): + heapq.heappush(self.queue, (int(scan['num']), scan)) + + def __iter__(self): + return self.producer + + def consume(self, iterator): + for scan in iterator: + scan.pop("scan", None) + if scan['msLevel'] != 1: + self.insert_item(scan) + else: + self.insert_item(scan) + barrier = int(scan['num']) + while True: + idx, item = heapq.heappop(self.queue) + if idx >= barrier: + self.insert_item(item) + break + yield item + while self.queue: + idx, item = heapq.heappop(self.queue) + yield item + + +class MzXML(aux.BinaryArrayConversionMixin, aux.TimeOrderedIndexedReaderMixin, xml.MultiProcessingXML, xml.IndexSavingXML): + """Parser class for mzXML files.""" + _root_element = 'mzXML' + _default_iter_tag = 'scan' + _indexed_tags = {'scan'} + _indexed_tag_keys = {'scan': 'num'} + _default_version = None + _default_schema = _schema_defaults._mzxml_schema_defaults + _default_id_attr = 'num' + + def __init__(self, *args, **kwargs): + self.decode_binary = kwargs.pop('decode_binary', True) + super(MzXML, self).__init__(*args, **kwargs) + + def __getstate__(self): + state = super(MzXML, self).__getstate__() + state['decode_binary'] = self.decode_binary + return state + + def __setstate__(self, state): + super(MzXML, self).__setstate__(state) + self.decode_binary = state['decode_binary'] + + def _get_info_smart(self, element, **kw): + name = xml._local_name(element) + + kwargs = dict(kw) + rec = kwargs.pop('recursive', None) + if name in {'mzXML'}: + info = self._get_info(element, + recursive=( + rec if rec is not None else False), + **kwargs) + else: + info = self._get_info(element, + recursive=(rec if rec is not None else True), + **kwargs) + if 'num' in info and isinstance(info, dict): + info['id'] = info['num'] + if 'peaks' in info and isinstance(info, dict): + self._decode_peaks(info) + return info + + def _determine_compression(self, info): + if info.get('compressionType') == 'zlib': + return 'zlib compression' + return "no compression" + + def _determine_dtype(self, info): + dt = np.float32 if info['precision'] == '32' else np.float64 + endianess = ">" if info['byteOrder'] in ('network', "big") else "<" + dtype = np.dtype( + [('m/z array', dt), ('intensity array', dt)]).newbyteorder(endianess) + return dtype + + def _finalize_record_conversion(self, array, record): + key = record.key + return self._convert_array(key, array[key]) + + def _decode_peaks(self, info): + # handle cases where peaks is the encoded binary data which must be + # unpacked + if not isinstance(info['peaks'], (dict, list)): + compression_type = self._determine_compression(info) + dtype = 
self._determine_dtype(info)
+            binary = info.pop('peaks')
+            if not self.decode_binary:
+                for k in self._array_keys:
+                    record = self._make_record(binary, compression_type, dtype, k)
+                    info[k] = record
+            else:
+                peak_data = self.decode_data_array(binary, compression_type, dtype)
+                for k in self._array_keys:
+                    info[k] = self._convert_array(k, peak_data[k])
+        # otherwise we've already decoded the arrays and we're just passing
+        # them up the hierarchy
+        else:
+            if not self.decode_binary:
+                arrays = info.pop('peaks')[0]
+                for k in self._array_keys:
+                    info[k] = arrays[k]
+            else:
+                peak_data = info.pop('peaks')[0]
+                for k in self._array_keys:
+                    info[k] = self._convert_array(k, peak_data.get(k, np.array([])))
+
+    def iterfind(self, path, **kwargs):
+        if path == 'scan':
+            generator = super(MzXML, self).iterfind(path, **kwargs)
+            for item in IteratorQueue(generator):
+                yield item
+        else:
+            for item in super(MzXML, self).iterfind(path, **kwargs):
+                yield item
+
+    def _get_time(self, scan):
+        return scan['retentionTime']
+
+
+def read(source, read_schema=False, iterative=True, use_index=False, dtype=None,
+         huge_tree=False, decode_binary=True):
+    """Parse `source` and iterate through spectra.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzXML header. Otherwise, use default
+        parameters. Not recommended without Internet connection or
+        if you don't like to get the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        spectrum elements. Default is :py:const:`False`.
+
+    dtype : type or dict, optional
+        dtype to convert arrays to, one for both m/z and intensity arrays or one for each key.
+        If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'.
+
+    decode_binary : bool, optional
+        Defines whether binary data should be decoded and included in the output
+        (under "m/z array", "intensity array", etc.).
+        Default is :py:const:`True`.
+
+    huge_tree : bool, optional
+        This option is passed to the `lxml` parser and defines whether
+        security checks for XML tree depth and node size should be disabled.
+        Default is :py:const:`False`.
+        Enable this option for trusted files to avoid XMLSyntaxError exceptions
+        (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`).
+
+    Returns
+    -------
+    out : iterator
+        An iterator over the dicts with spectrum properties.
+    """
+
+    return MzXML(source, read_schema=read_schema, iterative=iterative,
+                 use_index=use_index, dtype=dtype, huge_tree=huge_tree,
+                 decode_binary=decode_binary)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified XPath.
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create an :py:class:`MzXML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzXML header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    decode_binary : bool, optional
+        Defines whether binary data should be decoded and included in the output
+        (under "m/z array", "intensity array", etc.).
+        Default is :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return MzXML(source, **kwargs).iterfind(path, **kwargs)
+
+version_info = xml._make_version_info(MzXML)
+
+
+# chain = aux._make_chain(read, 'read')
+chain = aux.ChainBase._make_chain(MzXML)
diff --git a/pyteomics/openms/__init__.py b/pyteomics/openms/__init__.py
new file mode 100644
index 0000000..8b9c033
--- /dev/null
+++ b/pyteomics/openms/__init__.py
@@ -0,0 +1 @@
+from . import featurexml, trafoxml, idxml
diff --git a/pyteomics/openms/featurexml.py b/pyteomics/openms/featurexml.py
new file mode 100644
index 0000000..0dcfd09
--- /dev/null
+++ b/pyteomics/openms/featurexml.py
@@ -0,0 +1,115 @@
+"""
+featurexml - reader for featureXML files
+========================================
+
+Summary
+-------
+
+**featureXML** is a format specified in the
+`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
+It defines a list of LC-MS features observed in an experiment.
+
+This module provides a minimalistic way to extract information from **featureXML**
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`FeatureXML`)
+to iterate over entries in ``<feature>`` elements.
+:py:class:`FeatureXML` also supports direct indexing with feature IDs.
+
+Data access
+-----------
+
+    :py:class:`FeatureXML` - a class representing a single featureXML file.
+    Other data access functions use this class internally.
+
+    :py:func:`read` - iterate through features in a featureXML file. Data from a
+    single feature are converted to a human-readable dict.
+
+    :py:func:`chain` - read multiple featureXML files at once.
+
+    :py:func:`chain.from_iterable` - read multiple files at once, using an
+    iterable of files.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+--------------------------------------------------------------------------------
+"""
+
+from .. import xml, auxiliary as aux, _schema_defaults, version
+
+class FeatureXML(xml.MultiProcessingXML):
+    """Parser class for featureXML files."""
+    file_format = 'featureXML'
+    _root_element = 'featureMap'
+    _default_schema = _schema_defaults._featurexml_schema_defaults
+    _default_version = '1.9'
+    _default_iter_tag = 'feature'
+    _structures_to_flatten = {}
+    _indexed_tags = {'feature'}
+    _schema_location_param = 'noNamespaceSchemaLocation'
+
+    _offending_keys = {'ints': {
+        ('PeptideIdentification', 'spectrum_reference'),
+        ('UnassignedPeptideIdentification', 'spectrum_reference'),
+        ('quality', 'quality')
+    }}
+    _missing_keys = {'floats': {('quality', 'quality')}}
+
+    def _get_info_smart(self, element, **kw):
+        kw['recursive'] = kw.get('recursive', True)
+        info = self._get_info(element, **kw)
+        return info
+
+    @xml._keepstate
+    def _get_schema_info(self, read_schema=True):
+        schema_info = super(FeatureXML, self)._get_schema_info(read_schema)
+        if not read_schema:
+            return schema_info
+        file_version, schema = self.version_info
+        if version.VersionInfo(file_version) < version.VersionInfo(self._default_version):
+            for k, s in self._offending_keys.items():
+                if k in schema_info:
+                    for elem in s:
+                        try:
+                            schema_info[k].remove(elem)
+                        except KeyError:
+                            pass
+            for t, s in self._missing_keys.items():
+                schema_info.setdefault(t, set()).update(s)
+        return schema_info
+
+
+def read(source, read_schema=True, iterative=True, use_index=False):
+    """Parse `source` and iterate through features.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target featureXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the file header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        feature elements. Default is :py:const:`False`.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over the dicts with feature properties.
+    """
+
+    return FeatureXML(source, read_schema=read_schema, iterative=iterative, use_index=use_index)
+
+chain = aux._make_chain(read, 'read')
diff --git a/pyteomics/openms/idxml.py b/pyteomics/openms/idxml.py
new file mode 100644
index 0000000..0d71d7a
--- /dev/null
+++ b/pyteomics/openms/idxml.py
@@ -0,0 +1,430 @@
+"""
+idxml - idXML file reader
+=========================
+
+Summary
+-------
+
+**idXML** is a format specified in the
+`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
+It defines a list of peptide identifications.
+
+This module provides a minimalistic way to extract information from idXML
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`IDXML`) to iterate over entries in
+``<PeptideIdentification>`` elements. Note that each entry can contain more than one PSM
+(peptide-spectrum match). They are accessible under the ``'PeptideHit'`` key.
+:py:class:`IDXML` objects also support direct indexing by element ID.
+
+Data access
+-----------
+
+    :py:class:`IDXML` - a class representing a single idXML file.
+    Other data access functions use this class internally.
+
+    :py:func:`read` - iterate through peptide-spectrum matches in an idXML
+    file. Data from a single PSM group are converted to a human-readable dict.
+    Basically creates an :py:class:`IDXML` object and reads it.
+
+    :py:func:`chain` - read multiple files at once.
+
+    :py:func:`chain.from_iterable` - read multiple files at once, using an
+    iterable of files.
+
+    :py:func:`DataFrame` - read idXML files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+    :py:func:`filter` - read a chain of idXML files and filter to a certain
+    FDR using TDA.
+
+    :py:func:`filter.chain` - chain a series of filters applied independently to
+    several files.
+
+    :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+    independently to an iterable of files.
+
+    :py:func:`filter_df` - filter idXML files and return a :py:class:`pandas.DataFrame`.
+
+    :py:func:`is_decoy` - determine if a PSM should be considered decoy.
+
+    :py:func:`fdr` - estimate the false discovery rate of a set of identifications
+    using the target-decoy approach.
+
+    :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
+    set using the target-decoy approach.
+
+Deprecated functions
+--------------------
+
+    :py:func:`version_info` - get information about idXML version and schema.
+    You can just read the corresponding attribute of the :py:class:`IDXML`
+    object.
+
+    :py:func:`get_by_id` - get an element by its ID and extract the data from it.
+    You can just call the corresponding method of the :py:class:`IDXML`
+    object.
+
+    :py:func:`iterfind` - iterate over elements in an idXML file.
+    You can just call the corresponding method of the :py:class:`IDXML`
+    object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+-------------------------------------------------------------------------------
+"""
+
+# Copyright 2020 Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+from .. import auxiliary as aux
+from .. import xml, _schema_defaults
+
+
+class IDXML(xml.IndexedXML):
+    """Parser class for idXML files."""
+    file_format = 'idXML'
+    _root_element = 'IdXML'
+    _default_schema = _schema_defaults._idxml_schema_defaults
+    _default_version = '1.5'
+    _default_iter_tag = 'PeptideIdentification'
+    _structures_to_flatten = {}
+    _indexed_tags = {'ProteinHit'}
+    _schema_location_param = 'noNamespaceSchemaLocation'
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('retrieve_refs', True)
+        super(IDXML, self).__init__(*args, **kwargs)
+
+    def _get_info_smart(self, element, **kwargs):
+        """Extract the info in a smart way depending on the element type"""
+        name = xml._local_name(element)
+        kwargs = dict(kwargs)
+        rec = kwargs.pop("recursive", None)
+
+        # Try not to recursively unpack the root element
+        # unless the user really wants to.
+ if name == self._root_element: + info = self._get_info(element, recursive=(rec if rec is not None else False), **kwargs) + else: + info = self._get_info(element, recursive=(rec if rec is not None else True), **kwargs) + for k in ['start', 'end']: + v = info.get(k) + if isinstance(v, list) and len(v) == 2: + info[k] = [int(x) for x in v[0].split()] + for k in ['aa_before', 'aa_after']: + if k in info: + info[k] = info[k].split() + return info + + def _retrieve_refs(self, info, **kwargs): + """Retrieves and embeds the data for each attribute in `info` that + ends in _ref. Removes the id attribute from `info`""" + for k, v in dict(info).items(): + if k[-5:] == '_refs': + try: + by_id = [self.get_by_id(x, retrieve_refs=True) for x in v.split()] + except KeyError: + warnings.warn('Ignoring unresolved reference: ' + v) + else: + for x in by_id: + x.pop('id', None) + info[k[:-5]] = by_id + del info[k] + + +def read(source, **kwargs): + """Parse `source` and iterate through peptide-spectrum matches. + + .. note:: This function is provided for backward compatibility only. + It simply creates an :py:class:`IDXML` instance using + provided arguments and returns it. + + Parameters + ---------- + source : str or file + A path to a target IDXML file or the file object itself. + + recursive : bool, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + retrieve_refs : bool, optional + If :py:const:`True`, additional information from references will be + automatically added to the results. The file processing time will + increase. Default is :py:const:`True`. + + iterative : bool, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the IDXML header (default). Otherwise, use default + parameters. Disable this to avoid waiting on slow network connections or + if you don't like to get the related warnings. + + build_id_cache : bool, optional + Defines whether a cache of element IDs should be built and stored on the + created :py:class:`IDXML` instance. Default value is the value of + `retrieve_refs`. + + .. note:: This parameter is ignored when ``use_index`` is ``True`` (default). + + use_index : bool, optional + Defines whether an index of byte offsets needs to be created for + the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored. + + indexed_tags : container of bytes, optional + Defines which elements need to be indexed. Empty set by default. + + Returns + ------- + out : IDXML + An iterator over the dicts with PSM properties. + """ + kwargs = kwargs.copy() + kwargs.setdefault('retrieve_refs', True) + kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs')) + return IDXML(source, **kwargs) + + +def iterfind(source, path, **kwargs): + """Parse `source` and yield info on elements with specified local + name or by specified "XPath". + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`iterfind` calls on one file, you should + create an :py:class:`IDXML` object and use its + :py:meth:`!iterfind` method. 
+ + Parameters + ---------- + source : str or file + File name or file-like object. + + path : str + Element name or XPath-like expression. Only local names separated + with slashes are accepted. An asterisk (`*`) means any element. + You can specify a single condition in the end, such as: + ``"/path/to/element[some_value>1.5]"`` + Note: you can do much more powerful filtering using plain Python. + The path can be absolute or "free". Please don't specify + namespaces. + + recursive : bool, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + retrieve_refs : bool, optional + If :py:const:`True`, additional information from references will be + automatically added to the results. The file processing time will + increase. Default is :py:const:`False`. + + iterative : bool, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the IDXML header (default). Otherwise, use default + parameters. Disable this to avoid waiting on slow network connections or + if you don't like to get the related warnings. + + build_id_cache : bool, optional + Defines whether a cache of element IDs should be built and stored on the + created :py:class:`IDXML` instance. Default value is the value of + `retrieve_refs`. + + Returns + ------- + out : iterator + """ + kwargs = kwargs.copy() + kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs')) + return IDXML(source, **kwargs).iterfind(path, **kwargs) + + +version_info = xml._make_version_info(IDXML) + + +def get_by_id(source, elem_id, **kwargs): + """Parse `source` and return the element with `id` attribute equal + to `elem_id`. Returns :py:const:`None` if no such element is found. + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`get_by_id` calls on one file, you should + create an :py:class:`IDXML` object and use its + :py:meth:`!get_by_id` method. + + Parameters + ---------- + source : str or file + A path to a target mzIdentML file of the file object itself. + + elem_id : str + The value of the `id` attribute to match. + + Returns + ------- + out : :py:class:`dict` or :py:const:`None` + """ + return IDXML(source, **kwargs).get_by_id(elem_id, **kwargs) + + +chain = aux.ChainBase._make_chain(IDXML) + + +def is_decoy(psm, prefix=None): + """Given a PSM dict, return :py:const:`True` if it is marked as decoy, + and :py:const:`False` otherwise. + + Parameters + ---------- + psm : dict + A dict, as yielded by :py:func:`read`. + prefix : ignored + + Returns + ------- + out : bool + """ + return psm['PeptideHit'][0]['target_decoy'] == 'decoy' + + +def DataFrame(*args, **kwargs): + """Read idXML files into a :py:class:`pandas.DataFrame`. + + Requires :py:mod:`pandas`. + + .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'. + + Parameters + ---------- + *args + Passed to :py:func:`chain` + + **kwargs + Passed to :py:func:`chain` + + sep : str or None, keyword only, optional + Some values related to PSMs (such as protein information) are variable-length + lists. 
If `sep` is a :py:class:`str`, they will be packed into single string using + this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is + :py:const:`None`. + + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + data = [] + + sep = kwargs.pop('sep', None) + with chain(*args, **kwargs) as f: + for item in f: + info = {} + for k, v in item.items(): + if isinstance(v, (str, int, float)): + info[k] = v + peptide_hit = item.get('PeptideHit', [None])[0] + if peptide_hit is not None: + info.update((k, v) for k, v in peptide_hit.items() if isinstance(v, (str, int, float))) + protein = peptide_hit.get('protein') + if protein: + accessions, isd, starts, ends, scores, aa_bs, aa_as = [], [], [], [], [], [], [] + for d, start, end, aab, aaa in zip(protein, peptide_hit['start'], peptide_hit['end'], peptide_hit['aa_before'], peptide_hit['aa_after']): + accessions.append(d.get('accession')) + isd.append(d.get('target_decoy')) + scores.append(d.get('score')) + starts.append(start) + ends.append(end) + aa_bs.append(aab) + aa_as.append(aaa) + + isd = all(x == 'decoy' for x in isd) + if sep is not None: + if all(isinstance(acc, str) for acc in accessions): + accessions = sep.join(accessions) + if all(isinstance(aaa, str) for aaa in aa_as): + aa_as = sep.join(aa_as) + if all(isinstance(aab, str) for aab in aa_bs): + aa_bs = sep.join(aa_bs) + if all(acc is None for acc in accessions): + accessions = None + + info.update((k, v) for k, v in protein[0].items() if isinstance(v, (str, int, float, list))) + info['accession'] = accessions + info['is decoy'] = isd + info['start'] = starts + info['end'] = ends + info['aa_before'] = aa_bs + info['aa_after'] = aa_as + data.append(info) + df = pd.DataFrame(data) + return df + + +def filter_df(*args, **kwargs): + """Read idXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. + Positional arguments can be idXML files or DataFrames. + + Requires :py:mod:`pandas`. + + .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'. + + Parameters + ---------- + key : str / iterable / callable, keyword only, optional + Peptide identification score. Default is 'score'. You will probably need to change it. + is_decoy : str / iterable / callable, keyword only, optional + Default is 'is decoy'. + *args + Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. + **kwargs + Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. + + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + kwargs.setdefault('key', 'score') + if all(isinstance(arg, pd.DataFrame) for arg in args): + df = pd.concat(args) + else: + df = DataFrame(*args, **kwargs) + if 'is_decoy' not in kwargs: + kwargs['is_decoy'] = 'is decoy' + return aux.filter(df, **kwargs) + + +fdr = aux._make_fdr(is_decoy, None) +_key = lambda x: x['PeptideHit'][0]['score'] +qvalues = aux._make_qvalues(chain, is_decoy, None, _key) +filter = aux._make_filter(chain, is_decoy, None, _key, qvalues) +filter.chain = aux._make_chain(filter, 'filter', True) diff --git a/pyteomics/openms/trafoxml.py b/pyteomics/openms/trafoxml.py new file mode 100644 index 0000000..d42c569 --- /dev/null +++ b/pyteomics/openms/trafoxml.py @@ -0,0 +1,82 @@ +""" +trafoxml - reader for trafoXML files +======================================== + +Summary +------- + +**trafoXML** is a format specified in the +`OpenMS <http://open-ms.sourceforge.net/about/>`_ project. 
+It defines a transformation, which is a result of retention time alignment.
+
+This module provides a minimalistic way to extract information from **trafoXML**
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`TrafoXML`)
+to iterate over entries in ``<Pair>`` elements.
+
+Data access
+-----------
+
+  :py:class:`TrafoXML` - a class representing a single trafoXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through pairs in a trafoXML file. Data from a
+  single pair are converted to a human-readable dict.
+
+  :py:func:`chain` - read multiple trafoXML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+--------------------------------------------------------------------------------
+"""
+
+from .. import xml, auxiliary as aux, _schema_defaults
+
+class TrafoXML(xml.XML):
+    """Parser class for trafoXML files."""
+    file_format = 'trafoXML'
+    _root_element = 'TrafoXML'
+    _default_schema = _schema_defaults._trafoxml_schema_defaults
+    _default_version = '1.0'
+    _default_iter_tag = 'Pair'
+    _schema_location_param = 'noNamespaceSchemaLocation'
+
+    def _get_info_smart(self, element, **kw):
+        kw['recursive'] = kw.get('recursive', True)
+        info = self._get_info(element, **kw)
+        return info
+
+def read(source, read_schema=True, iterative=True):
+    """Parse `source` and iterate through pairs.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target trafoXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the file header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        to suppress the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over dicts with pair properties.
+    """
+
+    return TrafoXML(source, read_schema=read_schema, iterative=iterative)
+
+chain = aux._make_chain(read, 'read')
\ No newline at end of file
diff --git a/pyteomics/parser.py b/pyteomics/parser.py
new file mode 100644
index 0000000..72ef9fc
--- /dev/null
+++ b/pyteomics/parser.py
@@ -0,0 +1,1148 @@
+"""
+parser - operations on modX peptide sequences
+=============================================
+
+modX is a simple extension of the `IUPAC one-letter peptide sequence
+representation <http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html>`_.
+
+The labels (or codes) for the 20 standard amino acids in modX are the same as
+in IUPAC nomenclature. A label for a modified amino acid has a general
+form of 'modX', i.e.:
+
+- it starts with an arbitrary number of lower-case symbols or numbers
+  (a modification);
+
+- it ends with a single upper-case symbol (an amino acid residue).
+
+Valid examples of modX amino acid labels are: 'G', 'pS', 'oxM'. This rule
+allows combining readability and parseability.
+
+Besides the sequence of amino acid residues, modX has a rule to specify
+terminal modifications of a polypeptide. Such a label should start or
+end with a hyphen. The default N-terminal amine group and C-terminal
+carboxyl group need not be shown explicitly.
+
+Therefore, valid examples of peptide sequences in modX are: "GAGA",
+"H-PEPTIDE-OH", "H-TEST-NH2". It is not recommended to specify only one
+terminal group.
+
+Operations on polypeptide sequences
+-----------------------------------
+
+  :py:func:`parse` - convert a sequence string into a list of
+  amino acid residues.
+
+  :py:func:`to_string` - convert a parsed sequence to a string.
+
+  :py:func:`to_proforma` - convert a (parsed) *modX* sequence to ProForma.
+
+  :py:func:`amino_acid_composition` - get the number of each amino acid
+  residue in a peptide.
+
+  :py:func:`cleave`, :py:func:`icleave`, :py:func:`xcleave` - cleave a polypeptide using a given rule of
+  enzymatic digestion.
+
+  :py:func:`num_sites` - count the number of cleavage sites in a sequence.
+
+  :py:func:`isoforms` - generate all unique modified peptide sequences
+  given the initial sequence and modifications.
+
+Auxiliary commands
+------------------
+
+  :py:func:`coverage` - calculate the sequence coverage of a protein by peptides.
+
+  :py:func:`length` - calculate the number of amino acid
+  residues in a polypeptide.
+
+  :py:func:`valid` - check if a sequence can be parsed successfully.
+
+  :py:func:`fast_valid` - check if a sequence consists of known one-letter
+  codes.
+
+  :py:func:`is_modX` - check if the supplied code corresponds to a modX label.
+
+  :py:func:`is_term_mod` - check if the supplied code corresponds to a
+  terminal modification.
+
+Data
+----
+
+  :py:data:`std_amino_acids` - a list of the 20 standard amino acid IUPAC codes.
+
+  :py:data:`std_nterm` - the standard N-terminal modification (the
+  unmodified group is a single atom of hydrogen).
+
+  :py:data:`std_cterm` - the standard C-terminal modification (the
+  unmodified group is hydroxyl).
+
+  :py:data:`std_labels` - a list of all standard sequence
+  elements, amino acid residues and terminal modifications.
+
+  :py:data:`expasy_rules` and :py:data:`psims_rules` - two dicts with the regular expressions of
+  cleavage rules for the most popular proteolytic enzymes.
+
+-------------------------------------------------------------------------------
+
+"""
+
+# Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from collections import deque
+import itertools as it
+import warnings
+from .auxiliary import PyteomicsError, memoize, BasicComposition, cvstr, cvquery
+
+
+std_amino_acids = ['Q', 'W', 'E', 'R', 'T', 'Y', 'I', 'P', 'A', 'S',
+                   'D', 'F', 'G', 'H', 'K', 'L', 'C', 'V', 'N', 'M']
+"""modX labels for the 20 standard amino acids."""
+
+std_nterm = 'H-'
+"""modX label for the unmodified N-terminus."""
+
+std_cterm = '-OH'
+"""modX label for the unmodified C-terminus."""
+
+std_labels = std_amino_acids + [std_nterm, std_cterm]
+"""modX labels for the standard amino acids and unmodified termini."""
+
+_nterm_mod = r'[^-]+-$'
+_cterm_mod = r'-[^-]+$'
+
+
+def is_term_mod(label):
+    """Check if `label` corresponds to a terminal modification.
+ + Parameters + ---------- + label : str + + Returns + ------- + out : bool + + Examples + -------- + >>> is_term_mod('A') + False + >>> is_term_mod('Ac-') + True + >>> is_term_mod('-customGroup') + True + >>> is_term_mod('this-group-') + False + >>> is_term_mod('-') + False + """ + return (re.match(_nterm_mod, label) or re.match(_cterm_mod, label)) is not None + + +def match_modX(label): + """Check if `label` is a valid 'modX' label. + + Parameters + ---------- + label : str + + Returns + ------- + out : re.match or None + """ + return re.match(_modX_single, label) + + +def is_modX(label): + """Check if `label` is a valid 'modX' label. + + Parameters + ---------- + label : str + + Returns + ------- + out : bool + + Examples + -------- + >>> is_modX('M') + True + >>> is_modX('oxM') + True + >>> is_modX('oxMet') + False + >>> is_modX('160C') + True + """ + return bool(match_modX(label)) + + +def length(sequence, **kwargs): + """Calculate the number of amino acid residues in a polypeptide + written in modX notation. + + Parameters + ---------- + sequence : str or list or dict + A string with a polypeptide sequence, a list with a parsed sequence or + a dict of amino acid composition. + labels : list, optional + A list of allowed labels for amino acids and terminal modifications. + + Returns + ------- + out : int + + Examples + -------- + >>> length('PEPTIDE') + 7 + >>> length('H-PEPTIDE-OH') + 7 + """ + if not sequence: + return 0 + + if isinstance(sequence, str) or isinstance(sequence, list): + if isinstance(sequence, str): + parsed_sequence = parse(sequence, **kwargs) + else: + parsed_sequence = sequence + num_term_groups = 0 + if is_term_mod(parsed_sequence[0]): + num_term_groups += 1 + if is_term_mod(parsed_sequence[-1]): + num_term_groups += 1 + return len(parsed_sequence) - num_term_groups + elif isinstance(sequence, dict): + return sum(amount for aa, amount in sequence.items() if not is_term_mod(aa)) + + raise PyteomicsError('Unsupported type of sequence.') + + +def _split_label(label): + try: + mod, X = match_modX(label).groups() + except AttributeError: + raise PyteomicsError('Cannot split a non-modX label: %s' % label) + if not mod: + return (X,) + else: + return mod, X + + +_modX_sequence = re.compile(r'^([^-]+-)?((?:[^A-Z-]*[A-Z])+)(-[^-]+)?$') +_modX_group = re.compile(r'[^A-Z-]*[A-Z]') +_modX_split = re.compile(r'([^A-Z-]*)([A-Z])') +_modX_single = re.compile(r'^([^A-Z-]*)([A-Z])$') + + +def parse(sequence, show_unmodified_termini=False, split=False, allow_unknown_modifications=False, **kwargs): + """Parse a sequence string written in modX notation into a list of + labels or (if `split` argument is :py:const:`True`) into a list of + tuples representing amino acid residues and their modifications. + + Parameters + ---------- + sequence : str + The sequence of a polypeptide. + show_unmodified_termini : bool, optional + If :py:const:`True` then the unmodified N- and C-termini are explicitly + shown in the returned list. Default value is :py:const:`False`. + split : bool, optional + If :py:const:`True` then the result will be a list of tuples with 1 to 4 + elements: terminal modification, modification, residue. Default value is + :py:const:`False`. + allow_unknown_modifications : bool, optional + If :py:const:`True` then do not raise an exception when an unknown + modification of a known amino acid residue is found in the sequence. + This also includes terminal groups. + Default value is :py:const:`False`. + + .. 
note:: + Since version 2.5, this parameter has effect only if `labels` + are provided. + labels : container, optional + A container of allowed labels for amino acids, + modifications and terminal modifications. + If not provided, no checks will be done. + Separate labels for modifications (such as 'p' or 'ox') + can be supplied, which means they are applicable to all residues. + + .. warning:: + If `show_unmodified_termini` is set to :py:const:`True`, standard + terminal groups need to be present in `labels`. + + .. warning:: + Avoid using sequences with only one terminal group, as they are + ambiguous. If you provide one, `labels` (or :py:const:`std_labels`) + will be used to resolve the ambiguity. + + Returns + ------- + out : list + List of tuples with labels of modifications and amino acid residues. + + Examples + -------- + >>> parse('PEPTIDE', split=True) + [('P',), ('E',), ('P',), ('T',), ('I',), ('D',), ('E',)] + >>> parse('H-PEPTIDE') + ['P', 'E', 'P', 'T', 'I', 'D', 'E'] + >>> parse('PEPTIDE', show_unmodified_termini=True) + ['H-', 'P', 'E', 'P', 'T', 'I', 'D', 'E', '-OH'] + >>> parse('TEpSToxM', labels=std_labels + ['pS', 'oxM']) + ['T', 'E', 'pS', 'T', 'oxM'] + >>> parse('zPEPzTIDzE', True, True, labels=std_labels+['z']) + [('H-', 'z', 'P'), ('E',), ('P',), ('z', 'T'), ('I',), ('D',), ('z', 'E', '-OH')] + >>> parse('Pmod1EPTIDE') + ['P', 'mod1E', 'P', 'T', 'I', 'D', 'E'] + """ + sequence = str(sequence) + + try: + n, body, c = re.match(_modX_sequence, sequence).groups() + except AttributeError: + raise PyteomicsError('Not a valid modX sequence: ' + sequence) + + # Check for allowed labels, if they were explicitly given + labels = kwargs.get('labels') + # labels help save the day when only one terminal group is given + if c is None and n is not None: + if labels is None: + labels = std_labels + # we can try to resolve the ambiguity + if n != std_nterm and n not in labels: + # n is the body then + c = '-' + body + body = n[:-1] + n = None + + # Actual parsing + if split: + parsed_sequence = [g if g[0] else (g[1],) for g in re.findall( + _modX_split, body)] + else: + parsed_sequence = re.findall(_modX_group, body) + nterm, cterm = (n or std_nterm), (c or std_cterm) + + # Check against `labels` if given + if labels is not None: + labels = set(labels) + for term, std_term in zip([n, c], [std_nterm, std_cterm]): + if term and term not in labels and not allow_unknown_modifications: + raise PyteomicsError('Unknown label: {}'.format(term)) + for group in parsed_sequence: + if split: + mod, X = group if len(group) == 2 else ('', group[0]) + else: + mod, X = re.match(_modX_split, group).groups() + if ((not mod) and X not in labels) or not ((mod + X in labels) or ( + X in labels and ( + mod in labels or allow_unknown_modifications))): + raise PyteomicsError('Unknown label: {}'.format(group)) + + # Append terminal labels + if show_unmodified_termini or nterm != std_nterm: + if split: + parsed_sequence[0] = (nterm,) + parsed_sequence[0] + else: + parsed_sequence.insert(0, nterm) + if show_unmodified_termini or cterm != std_cterm: + if split: + parsed_sequence[-1] = parsed_sequence[-1] + (cterm,) + else: + parsed_sequence.append(cterm) + + return parsed_sequence + + +def valid(*args, **kwargs): + """Try to parse sequence and catch the exceptions. + All parameters are passed to :py:func:`parse`. + + Returns + ------- + out : bool + :py:const:`True` if the sequence was parsed successfully, and + :py:const:`False` otherwise. 
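+
+    Examples
+    --------
+    A few illustrative checks (``std_labels`` is the default label set
+    defined in this module):
+
+    >>> valid('H-PEPTIDE-OH')
+    True
+    >>> valid('PEPTIDE-')
+    False
+    >>> valid('TEpSToxM', labels=std_labels)
+    False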
+ """ + try: + parse(*args, **kwargs) + except PyteomicsError: + return False + return True + + +def fast_valid(sequence, labels=set(std_labels)): + """Iterate over `sequence` and check if all items are in `labels`. + With strings, this only works as expected on sequences without + modifications or terminal groups. + + Parameters + ---------- + sequence : iterable (expectedly, str) + The sequence to check. A valid sequence would be a string of + labels, all present in `labels`. + labels : iterable, optional + An iterable of known labels. + + Returns + ------- + out : bool + """ + return set(sequence).issubset(labels) + + +def to_string(parsed_sequence, show_unmodified_termini=True): + """Create a string from a parsed sequence. + + Parameters + ---------- + parsed_sequence : iterable + Expected to be in one of the formats returned by + :py:func:`parse`, i.e. list of labels or list of tuples. + show_unmodified_termini : bool, optional + Defines the behavior towards standard terminal groups in the input. + :py:const:`True` means that they will be preserved if present (default). + :py:const:`False` means that they will be removed. Standard terminal + groups will not be added if not shown in `parsed_sequence`, + regardless of this setting. + + Returns + ------- + sequence : str + """ + parsed_sequence = list(parsed_sequence) + labels = [] + nterm = parsed_sequence[0] + cterm = parsed_sequence[-1] + + if isinstance(nterm, str): + if nterm != std_nterm or show_unmodified_termini: + labels.append(nterm) + labels.extend(parsed_sequence[1:-1]) + if len(parsed_sequence) > 1 and (cterm != std_cterm or show_unmodified_termini): + labels.append(cterm) + else: + if len(parsed_sequence) == 1: + g = nterm + if nterm[0] == std_nterm and not show_unmodified_termini: + g = g[1:] + if nterm[-1] == std_cterm and not show_unmodified_termini: + g = g[:-1] + return ''.join(g) + if nterm[0] != std_nterm or show_unmodified_termini: + labels.append(''.join(nterm)) + else: + labels.append(''.join(nterm[1:])) + labels.extend(''.join(g) for g in parsed_sequence[1:-1]) + if len(parsed_sequence) > 1: + if cterm[-1] != std_cterm or show_unmodified_termini: + labels.append(''.join(cterm)) + else: + labels.append(''.join(cterm[:-1])) + return ''.join(labels) + + +tostring = to_string + + +def to_proforma(sequence, **kwargs): + """Converts a (parsed) *modX* sequence to a basic ProForma string. + Modifications are represented as masses, if those are given in :arg:`aa_mass`, + as chemical formulas (via :arg:`aa_comp`) or as names (using :arg:`mod_names`). + + Parameters + ---------- + sequence : str or list + A *modX* sequence, possibly in the parsed form. + aa_mass : dict, keyword only, optional + Used to render modifications as mass shifts. + aa_comp : dict, keyword only, optional + Used to render modifications as chemical formulas. + mod_names : dict or callable, keyword only, optional + Used to get the rendered name of modification from the mod label. + prefix : str, keyword only, optional + Prepend all modification names with the given prefix. + + Returns + ------- + out : str + A ProForma sequence. + + Examples + -------- + >>> to_proforma('PEPTIDE') + 'PEPTIDE' + >>> to_proforma('Ac-oxMYPEPTIDE-OH', aa_mass={'Ac-': 42.010565}, mod_names={'ox': 'Oxidation'}, prefix='U:') + '[+42.0106]-M[U:Oxidation]YPEPTIDE' + >>> to_proforma('oxidationMYPEPTIDE') # last fallback is to just capitalize the label + 'M[Oxidation]YPEPTIDE' + """ + from . 
import proforma + from .mass.mass import std_aa_mass, std_aa_comp + + if isinstance(sequence, str): + return to_proforma(parse(sequence), **kwargs) + + aa_mass = kwargs.get('aa_mass', std_aa_mass) + aa_comp = kwargs.get('aa_comp', std_aa_comp) + mod_names = kwargs.get('mod_names', {}) + prefix = kwargs.get('prefix', '') + + if isinstance(mod_names, dict): + get_name = mod_names.get + else: + get_name = mod_names + + def get_tag(label): + if label in aa_mass: + return [proforma.MassModification(aa_mass[label])] + if label in aa_comp: + return [proforma.FormulaModification(''.join('{}{}'.format(k, v if v not in {0, 1} else '') for k, v in aa_comp[label].items()))] + name = get_name(label) + if not name: + warnings.warn("Unable to resolve label `{}`. " + "The ProForma string may be invalid. Specify `mod_names`, `aa_mass` or `aa_comp`.".format(label)) + name = label.capitalize() + return [proforma.GenericModification(prefix + name)] + + i, j = 0, len(sequence) + nterm = cterm = None + pro_sequence = [] + if isinstance(sequence[0], str): # regular parsed sequence + if is_term_mod(sequence[0]) and sequence[0] != std_nterm: + nterm = get_tag(sequence[0]) + i = 1 + if is_term_mod(sequence[-1]) and sequence[-1] != std_cterm: + cterm = get_tag(sequence[-1]) + j -= 1 + for label in sequence[i:j]: + if len(label) == 1: + pro_sequence.append((label, None)) + else: + mod, aa = _split_label(label) + pro_sequence.append((aa, get_tag(mod))) + else: # split sequence + if is_term_mod(sequence[0][0]) and sequence[0][0] != std_nterm: + nterm = get_tag(sequence[0][0]) + if is_term_mod(sequence[-1][-1]) and sequence[-1][-1] != std_cterm: + cterm = get_tag(sequence[-1][-1]) + if len(sequence) == 1: + pro_sequence = [(sequence[0][-2] if cterm else sequence[0][-1], get_tag(sequence[0][1]) if len(sequence[0]) == 4 else None)] + else: + pro_sequence.append((sequence[0][-1], get_tag(sequence[0][-2]) if len(sequence[0]) == 3 else None)) + for group in sequence[1:-1]: + pro_sequence.append((group[-1], get_tag(group[0]) if len(group) == 2 else None)) + if len(sequence[-1]) == 1 or (len(sequence[-1]) == 2 and cterm): + pro_sequence.append((sequence[-1][0], None)) + else: + pro_sequence.append((sequence[-1][1], get_tag(sequence[-1][0]))) + + return proforma.to_proforma(pro_sequence, n_term=nterm, c_term=cterm) + + +def amino_acid_composition(sequence, show_unmodified_termini=False, term_aa=False, allow_unknown_modifications=False, **kwargs): + """Calculate amino acid composition of a polypeptide. + + Parameters + ---------- + sequence : str or list + The sequence of a polypeptide or a list with a parsed sequence. + show_unmodified_termini : bool, optional + If :py:const:`True` then the unmodified N- and C-terminus are explicitly + shown in the returned dict. Default value is :py:const:`False`. + term_aa : bool, optional + If :py:const:`True` then the terminal amino acid residues are + artificially modified with `nterm` or `cterm` modification. + Default value is :py:const:`False`. + allow_unknown_modifications : bool, optional + If :py:const:`True` then do not raise an exception when an unknown + modification of a known amino acid residue is found in the sequence. + Default value is :py:const:`False`. + labels : list, optional + A list of allowed labels for amino acids and terminal modifications. + + Returns + ------- + out : dict + A dictionary of amino acid composition. 
+ + Examples + -------- + >>> amino_acid_composition('PEPTIDE') == \ + {'I': 1, 'P': 2, 'E': 2, 'T': 1, 'D': 1} + True + >>> amino_acid_composition('PEPTDE', term_aa=True) == \ + {'ctermE': 1, 'E': 1, 'D': 1, 'P': 1, 'T': 1, 'ntermP': 1} + True + >>> amino_acid_composition('PEPpTIDE', labels=std_labels+['pT']) == \ + {'I': 1, 'P': 2, 'E': 2, 'D': 1, 'pT': 1} + True + """ + labels = kwargs.get('labels') + + if isinstance(sequence, str): + parsed_sequence = parse(sequence, show_unmodified_termini, + allow_unknown_modifications=allow_unknown_modifications, + labels=labels) + elif isinstance(sequence, list): + if sequence and isinstance(sequence[0], tuple): + parsed_sequence = parse(tostring(sequence, True), + show_unmodified_termini, + allow_unknown_modifications=allow_unknown_modifications, + labels=labels) + else: + parsed_sequence = sequence + else: + raise PyteomicsError('Unsupported type of a sequence.' + 'Must be str or list, not %s' % type(sequence)) + + aa_dict = BasicComposition() + + # Process terminal amino acids. + if term_aa: + nterm_aa_position = 1 if is_term_mod(parsed_sequence[0]) else 0 + cterm_aa_position = ( + len(parsed_sequence) - 2 if is_term_mod(parsed_sequence[-1]) + else len(parsed_sequence) - 1) + if len(parsed_sequence) > 1: + aa_dict['cterm' + parsed_sequence.pop(cterm_aa_position)] = 1 + aa_dict['nterm' + parsed_sequence.pop(nterm_aa_position)] = 1 + + # Process core amino acids. + for aa in parsed_sequence: + aa_dict[aa] += 1 + + return aa_dict + + +@memoize() +def cleave(*args, **kwargs): + """Cleaves a polypeptide sequence using a given rule. + + .. seealso:: + :func:`icleave` and :func:`xcleave`, which produce both peptides and their indices. + + Parameters + ---------- + sequence : str + The sequence of a polypeptide. + + .. note:: + The sequence is expected to be in one-letter uppercase notation. + Otherwise, some of the cleavage rules in :py:data:`expasy_rules` + will not work as expected. + + rule : str or compiled regex + A key present in :py:data:`expasy_rules`, :py:data:`psims_rules` (or an MS ontology accession) or a + `regular expression <https://docs.python.org/library/re.html#regular-expression-syntax>`_ + describing the site of cleavage. It is recommended + to design the regex so that it matches only the residue whose C-terminal + bond is to be cleaved. All additional requirements should be specified + using `lookaround assertions + <http://www.regular-expressions.info/lookaround.html>`_. + :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents. + + .. seealso:: The `regex` argument. + + missed_cleavages : int, optional + Maximum number of allowed missed cleavages. Defaults to 0. + min_length : int or None, optional + Minimum peptide length. Defaults to :py:const:`None`. + + .. note :: + This checks for string length, which is only correct for one-letter + notation and not for full *modX*. Use :py:func:`length` manually if + you know what you are doing and apply :py:func:`cleave` to *modX* + sequences. + + max_length : int or None, optional + Maximum peptide length. Defaults to :py:const:`None`. See note above. + + semi : bool, optional + Include products of semi-specific cleavage. Default is :py:const:`False`. + This effectively cuts every peptide at every position and adds results to the output. + + exception : str or compiled RE or None, optional + Exceptions to the cleavage rule. If specified, should be a key present in :py:const:`expasy_rules` + or regular expression. 
Cleavage sites matching `rule` will be checked against `exception` and omitted + if they match. + + regex : bool, optional + If :py:const:`True`, the cleavage rule is always interpreted as a regex. Otherwise, a matching value + is looked up in :py:data:`expasy_rules` and :py:data:`psims_rules`. + + Returns + ------- + out : set + A set of unique (!) peptides. + + Examples + -------- + >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'} + True + >>> cleave('AKAKBK', 'trypsin', 0) == {'AK', 'BK'} + True + >>> cleave('AKAKBK', 'MS:1001251', 0) == {'AK', 'BK'} + True + >>> cleave('GKGKYKCK', 'Trypsin/P', 2) == \ + {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'} + True + + """ + return set(p for i, p in icleave(*args, **kwargs)) + + +def icleave(sequence, rule, missed_cleavages=0, min_length=None, max_length=None, semi=False, exception=None, regex=False): + """Like :py:func:`cleave`, but the result is an iterator and includes peptide indices. + Refer to :py:func:`cleave` for explanation of parameters. + + Returns + ------- + out : iterator + An iterator over (index, sequence) pairs. + + """ + if not regex: + if rule in expasy_rules: + rule = expasy_rules[rule] + elif rule in psims_rules: + rule = psims_rules[rule] + elif rule in _psims_index: + rule = _psims_index[rule] + elif re.search(r'[a-z]', rule): + warnings.warn('Interpreting the rule as a regular expression: {}. Did you mistype the rule? ' + 'Specify `regex=True` to silence this warning.'.format(rule)) + exception = expasy_rules.get(exception, exception) + ml = missed_cleavages + 2 + trange = range(ml) + cleavage_sites = deque([0], maxlen=ml) + if min_length is None: + min_length = 1 + if max_length is None: + max_length = len(sequence) + cl = 1 + if exception is not None: + exceptions = {x.end() for x in re.finditer(exception, sequence)} + for end in it.chain([x.end() for x in re.finditer(rule, sequence)], [None]): + if exception is not None and end in exceptions: + continue + cleavage_sites.append(end) + if cl < ml: + cl += 1 + for j in trange[:cl - 1]: + seq = sequence[cleavage_sites[j]:cleavage_sites[-1]] + lenseq = len(seq) + if end is not None: + start = end - lenseq + else: + start = len(sequence) - lenseq + if seq and min_length <= lenseq <= max_length: + yield (start, seq) + if semi: + for k in range(min_length, min(lenseq, max_length)): + yield (start, seq[:k]) + for k in range(max(1, lenseq - max_length), lenseq - min_length + 1): + yield (start + k, seq[k:]) + + +def xcleave(*args, **kwargs): + """Like :py:func:`icleave`, but returns a list. + + Returns + ------- + out : list + A list of (index, sequence) pairs. + + Examples + -------- + >>> xcleave('AKAKBK', 'trypsin', 1) + [(0, 'AK'), (0, 'AKAK'), (2, 'AK'), (2, 'AKBK'), (4, 'BK')] + """ + return list(icleave(*args, **kwargs)) + + +def num_sites(sequence, rule, **kwargs): + """Count the number of sites where `sequence` can be cleaved using + the given `rule` (e.g. number of miscleavages for a peptide). + + Parameters + ---------- + sequence : str + The sequence of a polypeptide. + rule : str or compiled regex + A regular expression describing the site of cleavage. It is recommended + to design the regex so that it matches only the residue whose C-terminal + bond is to be cleaved. All additional requirements should be specified + using `lookaround assertions + <http://www.regular-expressions.info/lookaround.html>`_. + labels : list, optional + A list of allowed labels for amino acids and terminal modifications. 
+    exception : str or compiled RE or None, optional
+        Exceptions to the cleavage rule. If specified, should be a regular expression.
+        Cleavage sites matching `rule` will be checked against `exception` and omitted
+        if they match.
+
+    Returns
+    -------
+    out : int
+        Number of cleavage sites.
+    """
+    return sum(1 for _ in icleave(sequence, rule, **kwargs)) - 1
+
+
+expasy_rules = {
+    'arg-c': r'R',
+    'asp-n': r'\w(?=D)',
+    'bnps-skatole': r'W',
+    'caspase 1': r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
+    'caspase 2': r'(?<=DVA)D(?=[^PEDQKR])',
+    'caspase 3': r'(?<=DMQ)D(?=[^PEDQKR])',
+    'caspase 4': r'(?<=LEV)D(?=[^PEDQKR])',
+    'caspase 5': r'(?<=[LW]EH)D',
+    'caspase 6': r'(?<=VE[HI])D(?=[^PEDQKR])',
+    'caspase 7': r'(?<=DEV)D(?=[^PEDQKR])',
+    'caspase 8': r'(?<=[IL]ET)D(?=[^PEDQKR])',
+    'caspase 9': r'(?<=LEH)D',
+    'caspase 10': r'(?<=IEA)D',
+    'chymotrypsin high specificity': r'([FY](?=[^P]))|(W(?=[^MP]))',
+    'chymotrypsin low specificity':
+        r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
+    'clostripain': r'R',
+    'cnbr': r'M',
+    'enterokinase': r'(?<=[DE]{3})K',
+    'factor xa': r'(?<=[AFGILTVM][DE]G)R',
+    'formic acid': r'D',
+    'glutamyl endopeptidase': r'E',
+    'granzyme b': r'(?<=IEP)D',
+    'hydroxylamine': r'N(?=G)',
+    'iodosobenzoic acid': r'W',
+    'lysc': r'K',
+    'ntcb': r'\w(?=C)',
+    'pepsin ph1.3': r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
+                    r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
+    'pepsin ph2.0': r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
+                    r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
+    'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
+    'proteinase k': r'[AEFILTVWY]',
+    'staphylococcal peptidase i': r'(?<=[^E])E',
+    'thermolysin': r'[^DE](?=[AFILMV][^P])',
+    'thrombin': r'((?<=G)R(?=G))|'
+                r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
+    'trypsin': r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))',
+    'trypsin_exception': r'((?<=[CD])K(?=D))|((?<=C)K(?=[HY]))|((?<=C)R(?=K))|((?<=R)R(?=[HR]))',
+}
+"""
+This dict contains regular expressions for cleavage rules of the most
+popular proteolytic enzymes. The rules were taken from the
+`PeptideCutter tool
+<http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
+at Expasy.
+
+.. note::
+    'trypsin_exception' can be used as `exception` argument when calling
+    :py:func:`cleave` with 'trypsin' `rule`::
+
+        >>> parser.cleave('PEPTIDKDE', parser.expasy_rules['trypsin'])
+        {'DE', 'PEPTIDK'}
+        >>> parser.cleave('PEPTIDKDE', parser.expasy_rules['trypsin'], \
+exception=parser.expasy_rules['trypsin_exception'])
+        {'PEPTIDKDE'}
+"""
+
+
+psims_rules = {
+    cvstr('2-iodobenzoate', 'MS:1001918'): r'(?<=W)',
+    cvstr('Arg-C', 'MS:1001303'): r'(?<=R)(?!P)',
+    cvstr('Asp-N', 'MS:1001304'): r'(?=[BD])',
+    cvstr('Asp-N ambic', 'MS:1001305'): r'(?=[DE])',
+    cvstr('CNBr', 'MS:1001307'): r'(?<=M)',
+    cvstr('Chymotrypsin', 'MS:1001306'): r'(?<=[FYWL])(?!P)',
+    cvstr('Formic acid', 'MS:1001308'): r'((?<=D))|((?=D))',
+    cvstr('Lys-C', 'MS:1001309'): r'(?<=K)(?!P)',
+    cvstr('Lys-C/P', 'MS:1001310'): r'(?<=K)',
+    cvstr('PepsinA', 'MS:1001311'): r'(?<=[FL])',
+    cvstr('TrypChymo', 'MS:1001312'): r'(?<=[FYWLKR])(?!P)',
+    cvstr('Trypsin', 'MS:1001251'): r'(?<=[KR])(?!P)',
+    cvstr('Trypsin/P', 'MS:1001313'): r'(?<=[KR])',
+    cvstr('V8-DE', 'MS:1001314'): r'(?<=[BDEZ])(?!P)',
+    cvstr('V8-E', 'MS:1001315'): r'(?<=[EZ])(?!P)',
+    cvstr('glutamyl endopeptidase', 'MS:1001917'): r'(?<=[^E]E)',
+    cvstr('leukocyte elastase', 'MS:1001915'): r'(?<=[ALIV])(?!P)',
+    cvstr('proline endopeptidase', 'MS:1001916'): r'(?<=[HKR]P)(?!P)',
+}
+"""
+This dict contains regular expressions for cleavage rules of the most
+popular proteolytic enzymes. The rules were taken from the PSI `MS ontology
+<http://purl.obolibrary.org/obo/MS_1001045>`_.
+
+You can use names or accessions to access the rules.
+Use :py:func:`pyteomics.auxiliary.cvquery` for accession access::
+
+    >>> from pyteomics.auxiliary import cvquery
+    >>> from pyteomics.parser import psims_rules
+    >>> cvquery(psims_rules, 'MS:1001918')
+    '(?<=W)'
+
+"""
+
+_psims_index = cvquery(psims_rules)
+
+def isoforms(sequence, **kwargs):
+    """
+    Apply variable and fixed modifications to the polypeptide and yield
+    the unique modified sequences.
+
+    Parameters
+    ----------
+
+    sequence : str
+        Peptide sequence to modify.
+
+    variable_mods : dict, optional
+        A dict of variable modifications in the following format:
+        :py:const:`{'label1': ['X', 'Y', ...], 'label2': ['X', 'A', 'B', ...]}`
+
+        Keys in the dict are modification labels (terminal modifications allowed).
+        Values are iterables of residue labels (one letter each) or
+        :py:const:`True`. If a value for a modification is :py:const:`True`,
+        it is applicable to any residue (useful for terminal modifications).
+        You can use values such as 'ntermX' or 'ctermY' to specify that a
+        modification only occurs when the residue is in the terminal position.
+        This is *not needed* for terminal modifications.
+
+        .. note:: Several variable modifications can occur on amino acids of the
+                  same type, but in the output each amino acid residue will be
+                  modified at most once (apart from terminal modifications).
+
+    fixed_mods : dict, optional
+        A dict of fixed modifications in the same format.
+
+        **Note**: if a residue is affected by a fixed modification, no variable
+        modifications will be applied to it (apart from terminal modifications).
+
+    labels : list, optional
+        A list of amino acid labels containing all the labels present in
+        `sequence`. Modified entries will be added automatically.
+        Defaults to :py:data:`std_labels`.
+        Not required since version 2.5.
+
+    max_mods : int or None, optional
+        Number of modifications that can occur simultaneously on a peptide,
+        excluding fixed modifications. If :py:const:`None` or if ``max_mods``
+        is greater than the number of modification sites, all possible
+        isoforms are generated. Default is :py:const:`None`.
+
+    override : bool, optional
+        Defines how to handle the residues that are modified in the input.
+        :py:const:`False` means that they will be preserved (default).
+        :py:const:`True` means they will be treated as unmodified.
+
+    show_unmodified_termini : bool, optional
+        If :py:const:`True` then the unmodified N- and C-termini are explicitly
+        shown in the returned sequences. Default value is :py:const:`False`.
+
+    format : str, optional
+        If :py:const:`'str'` (default), an iterator over sequences is returned.
+        If :py:const:`'split'`, the iterator will yield results in the same
+        format as :py:func:`parse` with the 'split' option, with unmodified
+        terminal groups shown.
+
+    Returns
+    -------
+
+    out : iterator over strings or lists
+        All possible unique polypeptide sequences resulting from
+        the specified modifications are yielded one by one.
+    """
+    def main(group):  # index of the residue (capital letter) in `group`
+        if group[-1][0] == '-':
+            i = -2
+        else:
+            i = -1
+        return len(group) + i, group[i]
+
+    def apply_mod(label, mod):
+        # `label` is assumed to be a tuple (see split option of `parse`)
+        # unmodified termini are assumed shown
+        # if the modification is not applicable, `None` is returned
+        group = list(label)
+        m = main(group)[0]
+        c = True  # whether the change is applied in the end
+        if m == 0 and not is_term_mod(mod):
+            group.insert(0, mod)
+        elif mod[0] == '-' and (group[-1] == std_cterm or (group[-1][0] == '-' and override)):
+            group[-1] = mod
+        elif mod[-1] == '-' and (group[0] == std_nterm or (group[0][-1] == '-' and override)):
+            group[0] = mod
+        elif not is_term_mod(mod):
+            if m and group[m - 1][-1] != '-':
+                if override:
+                    group[m - 1] = mod
+                else:
+                    c = False
+            else:
+                group.insert(m, mod)
+        else:
+            c = False
+        if c:
+            return tuple(group)
+
+    variable_mods = kwargs.get('variable_mods', {})
+    varmods_term, varmods_non_term = [], []
+    for m, r in sorted(variable_mods.items()):
+        if is_term_mod(m):
+            varmods_term.append((m, r))
+        else:
+            varmods_non_term.append((m, r))
+    fixed_mods = kwargs.get('fixed_mods', {})
+    parse_kw = {}
+    if 'labels' in kwargs:
+        parse_kw['labels'] = list(kwargs['labels']) + list(fixed_mods)
+    parsed = parse(sequence, True, True, **parse_kw)
+    override = kwargs.get('override', False)
+    show_unmodified_termini = kwargs.get('show_unmodified_termini', False)
+    max_mods = kwargs.get('max_mods')
+    format_ = kwargs.get('format', 'str')
+
+    # Apply fixed modifications
+    for cmod, res in fixed_mods.items():
+        for i, group in enumerate(parsed):
+            if res is True or main(group)[1] in res:
+                parsed[i] = apply_mod(group, cmod) or parsed[i]
+
+    # Create a list of possible states for each group
+    # Start with N-terminal mods and regular mods on the N-terminal residue
+    states = [[parsed[0]]]
+    m0 = main(parsed[0])[1]
+    for m, r in varmods_non_term:
+        if r is True or m0 in r or 'nterm' + m0 in r or len(parsed) == 1 and 'cterm' + m0 in r:
+            applied = apply_mod(parsed[0], m)
+            if applied is not None:
+                states[0].append(applied)
+    more_states = []
+    for m, r in varmods_term:
+        if r is True or m0 in r:
+            if m[-1] == '-' or len(parsed) == 1:
+                for group in states[0]:
+                    applied = apply_mod(group, m)
+                    if applied is not None:
+                        more_states.append(applied)
+    states[0].extend(more_states)
+
+    # Continue with regular mods
+    for group in parsed[1:-1]:
+        gstates = [group]
+        for m, r in varmods_non_term:
+            if r is True or group[-1] in r:
+                applied = apply_mod(group, m)
+                if applied is not None:
+                    gstates.append(applied)
+        states.append(gstates)
+
+    # Finally add C-terminal mods and regular mods on the C-terminal residue
+    if len(parsed) > 1:
+        states.append([parsed[-1]])
+        m1 = main(parsed[-1])[1]
+        for m, r in varmods_non_term:
+            if r is True or m1 in r or 'cterm' + m1 in r or len(parsed) == 1 and 'nterm' + m1 in r:
+                applied = apply_mod(parsed[-1], m)
+                if applied is not None:
+                    states[-1].append(applied)
+        more_states = []
+        for m, r in varmods_term:
+            if r is True or m1 in r:
+                if m[0] == '-' or len(parsed) == 1:
+                    for group in states[-1]:
+                        applied = apply_mod(group, m)
+                        if applied is not None:
+                            more_states.append(applied)
+        states[-1].extend(more_states)
+
+    sites = [s for s in enumerate(states) if len(s[1]) > 1]
+    if max_mods is None or max_mods > len(sites):
+        possible_states = it.product(*states)
+    else:
+        def state_lists():
+            for m in range(max_mods + 1):
+                for comb in it.combinations(sites, m):
+                    skel = [[s[0]] for s in states]
+                    for i, e in comb:
+                        skel[i] = e[1:]
+                    yield skel
+        possible_states = it.chain.from_iterable(it.product(*skel) for skel in state_lists())
+
+    if format_ == 'split':
+        def strip_std_terms():
+            for ps in possible_states:
+                ps = list(ps)
+                if not show_unmodified_termini:
+                    if ps[0][0] == std_nterm:
+                        ps[0] = ps[0][1:]
+                    if ps[-1][-1] == std_cterm:
+                        ps[-1] = ps[-1][:-1]
+                yield ps
+        return strip_std_terms()
+    elif format_ == 'str':
+        return (tostring(form, show_unmodified_termini)
+                for form in possible_states)
+    else:
+        raise PyteomicsError('Unsupported value of "format": {}'.format(format_))
+
+
+def coverage(protein, peptides):
+    """Calculate how much of `protein` is covered by `peptides`.
+    Peptides can overlap. If a peptide is found multiple times in `protein`,
+    it contributes more to the overall coverage.
+
+    Requires :py:mod:`numpy`.
+
+    .. note::
+        Modifications and terminal groups are discarded.
+
+    Parameters
+    ----------
+    protein : str
+        A protein sequence.
+    peptides : iterable
+        An iterable of peptide sequences.
+
+    Returns
+    -------
+    out : float
+        The sequence coverage, between 0 and 1.
+
+    Examples
+    --------
+    >>> coverage('PEPTIDES'*100, ['PEP', 'EPT'])
+    0.5
+    """
+    import numpy as np
+    protein = re.sub(r'[^A-Z]', '', protein)
+    mask = np.zeros(len(protein), dtype=np.int8)
+    for peptide in peptides:
+        indices = [m.start() for m in re.finditer(
+            '(?={})'.format(re.sub(r'[^A-Z]', '', peptide)), protein)]
+        for i in indices:
+            mask[i:i + len(peptide)] = 1
+    return mask.sum(dtype=float) / mask.size
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/pyteomics/peff.py b/pyteomics/peff.py
new file mode 100644
index 0000000..90ffcc5
--- /dev/null
+++ b/pyteomics/peff.py
@@ -0,0 +1,277 @@
+"""
+peff - PSI Extended FASTA Format
+================================
+
+PEFF is a forthcoming standard from PSI-HUPO formalizing and extending the
+encoding of protein features and annotations for building search spaces for
+proteomics. See `The PEFF specification <http://www.psidev.info/peff>`_ for
+more up-to-date information on the standard.
+
+Data manipulation
+-----------------
+
+Classes
+.......
+
+The PEFF parser inherits several properties from the implementation in the :mod:`~.fasta` module,
+building on top of the :class:`~.TwoLayerIndexedFASTA` reader.
+
+Available classes:
+
+    :py:class:`IndexedPEFF` - Parse a PEFF format file in binary mode, supporting
+    direct indexing by header string or by tag.
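+
+A minimal reading sketch (``db.peff`` is a hypothetical local file name)::
+
+    from pyteomics import peff
+
+    reader = peff.IndexedPEFF('db.peff')
+    for header, sequence in reader:
+        print(header.Prefix, header.Tag, len(sequence))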
+ +""" + +# Copyright 2018 Joshua Klein, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +try: + from collections.abc import Sequence as SequenceABC, Mapping +except ImportError: + from collections import Sequence as SequenceABC, Mapping +from collections import OrderedDict, defaultdict + +from .fasta import TwoLayerIndexedFASTA + + +class Header(Mapping): + """Hold parsed properties of a key-value pair like a sequence's + definition line. + + This object supports the :class:`Mapping` interface, and + keys may be accessed by attribute access notation. + """ + def __init__(self, mapping, original=None): + self._mapping = mapping + + def __getitem__(self, key): + return self._mapping[key] + + def __iter__(self): + return iter(self._mapping) + + def items(self): + return self._mapping.items() + + def keys(self): + return self._mapping.keys() + + def values(self): + return self._mapping.values() + + def __len__(self): + return len(self._mapping) + + def __contains__(self, key): + return key in self._mapping + + def __getattr__(self, key): + if key == "_mapping": + raise AttributeError(key) + try: + return self._mapping[key] + except KeyError: + raise AttributeError(key) + + def __repr__(self): + return "{self.__class__.__name__}({mapping})".format(self=self, mapping=dict(self._mapping)) + + def __hash__(self): + return hash(self.defline) + + def __eq__(self, other): + try: + return self._mapping == other._mapping + except AttributeError: + return str(self) == str(other) + + def __ne__(self, other): + return not (self == other) + + def __dir__(self): + base = set(dir(super(Header, self))) + keys = set(self._mapping.keys()) + return list(base | keys) + + +class IndexedPEFF(TwoLayerIndexedFASTA): + """Creates an :py:class:`IndexedPEFF` object. + + Parameters + ---------- + source : str or file + The file to read. If a file object, it needs to be in *rb* mode. + parse : bool, optional + Defines whether the descriptions should be parsed in the produced tuples. + Default is :py:const:`True`. + kwargs : passed to the :py:class:`TwoLayerIndexedFASTA` constructor. 
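+
+    Examples
+    --------
+    A metadata-access sketch (``db.peff`` is a hypothetical file name)::
+
+        reader = IndexedPEFF('db.peff')
+        print(reader.version, reader.number_of_entries)
+        for block in reader.header_blocks:
+            print(dict(block))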
+ """ + + kv_pattern = re.compile(r"\\(?P<key>\S+)=(?P<value>.+?)(?:\s(?=\\)|$)") + header_pattern = re.compile(r"^>?(\S+):(\S+)") + has_feature_index = re.compile(r"^\(?(\d+):") + header_group = 2 + + class _PEFFFeature(SequenceABC): + def __init__(self, *fields, **kwargs): + self.fields = tuple(fields) + self.id = kwargs.get('id') + self.feature_type = kwargs.get("feature_type") + + def __eq__(self, other): + return tuple(self) == tuple(other) + + def __ne__(self, other): + return not (self == other) + + def __getitem__(self, i): + return self.fields[i] + + def __len__(self): + return len(self.fields) + + def __repr__(self): + return repr(tuple(self)) + + def __str__(self): + return "(%s%s)" % ( + '%r:' % self.id if self.id is not None else '', + '|'.join(map(str, self)), ) + + def __init__(self, source, ignore_comments=False, **kwargs): + super(IndexedPEFF, self).__init__( + source, ignore_comments=ignore_comments, parser=self.parser, + header_pattern=self.header_pattern, **kwargs) + self.header_blocks = [] + self.comments = [] + self.version = None + self.number_of_entries = 0 + self._parse_header() + + def _parse_header(self): + self.seek(0) + line = self.readline().decode("ascii") + if not line.startswith("# PEFF"): + raise ValueError("Not a PEFF File") + self.version = tuple(map(int, line.strip()[7:].split("."))) + current_block = defaultdict(list) + in_header = True + while in_header: + line = self.readline().decode("ascii") + if not line.startswith("#"): + in_header = False + line = line.strip()[2:] + if '=' in line: + key, value = line.split("=", 1) + if key == "GeneralComment": + self.comments.append(value) + else: + current_block[key].append(value) + if line.startswith("//"): + if current_block: + self.header_blocks.append( + Header(OrderedDict((k, v if len(v) > 1 else v[0]) + for k, v in current_block.items()))) + current_block = defaultdict(list) + number_of_entries = 0 + for block in self.header_blocks: + try: + number_of_entries += int(block['NumberOfEntries']) + except KeyError: + pass + self.number_of_entries = number_of_entries + + def _extract_parenthesis_list(self, text): + chunks = [] + chunk = [] + paren_level = 0 + i = 0 + n = len(text) + while i < n: + c = text[i] + i += 1 + if c == "(": + if paren_level > 0: + chunk.append(c) + paren_level += 1 + elif c == ")": + if paren_level > 1: + chunk.append(c) + paren_level -= 1 + if paren_level == 0: + if chunk: + chunks.append(chunk) + chunk = [] + else: + chunk.append(c) + chunks = list(map(''.join, chunks)) + return chunks + + def _split_pipe_separated_tuple(self, text): + parts = text.split("|") + return parts + + def _coerce_types(self, key, value): + value = value.strip() + feature_id_match = self.has_feature_index.search(value) + if feature_id_match: + feature_id = int(feature_id_match.group(1)) + value = self.has_feature_index.sub('', value) + else: + feature_id = None + if "|" in value: + value = self._split_pipe_separated_tuple(value) + result = [] + for i, v in enumerate(value): + result.append(self._coerce_value(key, v, i)) + return self._PEFFFeature(*result, feature_type=key, id=feature_id) + else: + return self._coerce_value(key, value, 0) + + def _coerce_value(self, key, value, index): + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + pass + return str(value) + + def parser(self, line): + match = self.header_pattern.match(line) + if not match: + raise ValueError( + "Failed to parse {!r} using {!r}".format( + line, self)) + storage = OrderedDict() + 
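+        # Split off the ">prefix:tag " part of the defline first, then parse
+        # the remaining backslash-prefixed key=value pairs (e.g. "\Length=123").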
+        prefix = None
+        db_uid = None
+        if line.startswith(">"):
+            line = line[1:]
+        prefix, line = line.split(":", 1)
+        db_uid, line = line.split(" ", 1)
+        storage['Prefix'] = prefix
+        storage['Tag'] = db_uid
+        for key, value in self.kv_pattern.findall(line):
+            if not (value.startswith("(") or " (" in value):
+                storage[key] = self._coerce_types(key, value)
+            else:
+                # multi-value
+                storage[key] = [self._coerce_types(key, v) for v in self._extract_parenthesis_list(value)]
+        return Header(storage)
diff --git a/pyteomics/pepxml.py b/pyteomics/pepxml.py
new file mode 100644
index 0000000..813f574
--- /dev/null
+++ b/pyteomics/pepxml.py
@@ -0,0 +1,573 @@
+"""
+pepxml - pepXML file reader
+===========================
+
+Summary
+-------
+
+`pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_
+was the first widely accepted format for proteomics search engines' output.
+Even though it is to be replaced by a community standard
+`mzIdentML <http://www.psidev.info/index.php?q=node/454>`_, it is still
+commonly used.
+
+This module provides minimalistic infrastructure for access to data stored in
+pepXML files. The most important function is :py:func:`read`, which
+reads peptide-spectrum matches and related information and saves them into
+human-readable dicts. This function relies on the terminology of the underlying
+`lxml library <http://lxml.de/>`_.
+
+Data access
+-----------
+
+    :py:class:`PepXML` - a class representing a single pepXML file.
+    Other data access functions use this class internally.
+
+    :py:func:`read` - iterate through peptide-spectrum matches in a pepXML
+    file. Data for a single spectrum are converted to an easy-to-use dict.
+
+    :py:func:`chain` - read multiple files at once.
+
+    :py:func:`chain.from_iterable` - read multiple files at once, using an
+    iterable of files.
+
+    :py:func:`DataFrame` - read pepXML files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+    :py:func:`filter` - filter PSMs from a chain of pepXML files to a specific FDR
+    using TDA.
+
+    :py:func:`filter.chain` - chain a series of filters applied independently to
+    several files.
+
+    :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+    independently to an iterable of files.
+
+    :py:func:`filter_df` - filter pepXML files and return a :py:class:`pandas.DataFrame`.
+
+    :py:func:`fdr` - estimate the false discovery rate of a PSM set using the
+    target-decoy approach.
+
+    :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
+    set using the target-decoy approach.
+
+    :py:func:`is_decoy` - determine whether a PSM is decoy or not.
+
+Miscellaneous
+-------------
+
+    :py:func:`roc_curve` - get a receiver-operator curve (min PeptideProphet
+    probability in a sample vs. false discovery rate) of PeptideProphet analysis.
+
+Deprecated functions
+--------------------
+
+    :py:func:`iterfind` - iterate over elements in a pepXML file.
+    You can just call the corresponding method of the :py:class:`PepXML`
+    object.
+
+    :py:func:`version_info` - get information about pepXML version and schema.
+    You can just read the corresponding attribute of the :py:class:`PepXML`
+    object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
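+
+Example
+-------
+
+A quick-start sketch (``sample.pep.xml`` is a hypothetical file name)::
+
+    from pyteomics import pepxml
+
+    with pepxml.read('sample.pep.xml') as reader:
+        for psm in reader:
+            best_hit = psm['search_hit'][0]
+            print(psm['spectrum'], best_hit['peptide'])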
+ +------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from lxml import etree +from . import xml, auxiliary as aux, _schema_defaults + + +class PepXML(xml.MultiProcessingXML, xml.IndexSavingXML): + """Parser class for pepXML files.""" + file_format = 'pepXML' + _root_element = 'msms_pipeline_analysis' + _default_schema = _schema_defaults._pepxml_schema_defaults + _default_version = '1.15' + _default_iter_tag = 'spectrum_query' + _indexed_tags = {'spectrum_query'} + _indexed_tag_keys = {'spectrum_query': 'spectrum'} + _default_id_attr = 'spectrum' + _structures_to_flatten = {'search_score_summary', 'modification_info'} + # attributes which contain unconverted values + _convert_items = {'float': {'calc_neutral_pep_mass', 'massdiff', + 'probability', 'variable', 'static'}, + 'int': {'start_scan', 'end_scan', 'index', 'num_matched_peptides'}, + 'bool': {'is_rejected'}, + 'floatarray': {'all_ntt_prob'}}.items() + + def _get_info_smart(self, element, **kwargs): + """Extract the info in a smart way depending on the element type""" + try: + name = kwargs.pop('ename') + except KeyError: + name = xml._local_name(element) + rec = kwargs.pop('recursive', None) + if name == self._root_element: + info = self._get_info(element, ename=name, recursive=(rec if rec is not None else False), **kwargs) + else: + info = self._get_info(element, ename=name, recursive=(rec if rec is not None else True), **kwargs) + + def safe_float(s): + try: + return float(s) + except ValueError: + if s.startswith('+-0'): + return 0 + return s + + converters = {'float': safe_float, 'int': int, + 'bool': lambda x: x.lower() in {'1', 'true'}, + 'floatarray': lambda x: list(map(float, x[1:-1].split(',')))} + for k, v in dict(info).items(): + for t, s in self._convert_items: + if k in s: + del info[k] + info[k] = converters[t](v) + for k in {'search_score', 'parameter'}: + if k in info and isinstance(info[k], list) and all( + isinstance(x, dict) and len(x) == 1 for x in info[k]): + scores = {} + for score in info[k]: + name, value = score.popitem() + try: + scores[name] = float(value) + except ValueError: + scores[name] = value + info[k] = scores + if 'search_result' in info and len(info['search_result']) == 1: + info.update(info['search_result'][0]) + del info['search_result'] + if 'protein' in info and 'peptide' in info: + info['proteins'] = [{'protein': info.pop('protein'), + 'protein_descr': info.pop('protein_descr', None)}] + for add_key in {'peptide_prev_aa', 'peptide_next_aa', 'protein_mw'}: + if add_key in info: + info['proteins'][0][add_key] = info.pop(add_key) + info['proteins'][0]['num_tol_term'] = info.pop('num_tol_term', 0) + if 'alternative_protein' in info: + info['proteins'].extend(info['alternative_protein']) + del info['alternative_protein'] + if 'peptide' in info and not 'modified_peptide' in info: + info['modified_peptide'] = info['peptide'] + if 'peptide' in info: + 
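+            # Collect per-residue modification records; terminal modifications
+            # are encoded with position 0 (N-term) and len(peptide) + 1 (C-term).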
info['modifications'] = info.pop('mod_aminoacid_mass', []) + if 'mod_nterm_mass' in info: + info['modifications'].insert(0, {'position': 0, + 'mass': float(info.pop('mod_nterm_mass'))}) + if 'mod_cterm_mass' in info: + info['modifications'].append({'position': 1 + len(info['peptide']), + 'mass': float(info.pop('mod_cterm_mass'))}) + if 'modified_peptide' in info and info['modified_peptide'] == info.get( + 'peptide'): + if not info.get('modifications'): + info['modifications'] = [] + else: + mp = info['modified_peptide'] + for mod in sorted(info['modifications'], + key=lambda m: m['position'], + reverse=True): + if mod['position'] not in {0, 1+len(info['peptide'])}: + p = mod['position'] + mp = mp[:p] + '[{}]'.format(int(mod['mass'])) + mp[p:] + info['modified_peptide'] = mp + if 'search_hit' in info: + info['search_hit'].sort(key=lambda x: x['hit_rank']) + return info + + +def read(source, read_schema=False, iterative=True, **kwargs): + """Parse `source` and iterate through peptide-spectrum matches. + + Parameters + ---------- + source : str or file + A path to a target pepXML file or the file object itself. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the pepXML header. Otherwise, use default parameters. + Not recommended without Internet connection or + if you don't like to get the related warnings. + + iterative : bool, optional + Defines whether iterative parsing should be used. It helps reduce + memory usage at almost the same parsing speed. Default is + :py:const:`True`. + + Returns + ------- + out : PepXML + An iterator over dicts with PSM properties. + """ + + return PepXML(source, read_schema=read_schema, iterative=iterative) + + +def iterfind(source, path, **kwargs): + """Parse `source` and yield info on elements with specified local + name or by specified "XPath". + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`iterfind` calls on one file, you should + create an :py:class:`PepXML` object and use its + :py:meth:`!iterfind` method. + + Parameters + ---------- + source : str or file + File name or file-like object. + + path : str + Element name or XPath-like expression. Only local names separated + with slashes are accepted. An asterisk (`*`) means any element. + You can specify a single condition in the end, such as: + ``"/path/to/element[some_value>1.5]"`` + Note: you can do much more powerful filtering using plain Python. + The path can be absolute or "free". Please don't specify + namespaces. + + recursive : bool, keyword only, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + iterative : bool, keyword only, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. + + read_schema : bool, keyword only, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the mzIdentML header. Otherwise, use default parameters. + Not recommended without Internet connection or + if you don't like to get the related warnings. 
+ + Returns + ------- + out : iterator + """ + return PepXML(source, **kwargs).iterfind(path, **kwargs) + + +version_info = xml._make_version_info(PepXML) + + +def roc_curve(source): + """Parse source and return a ROC curve for peptideprophet analysis. + + Parameters + ---------- + source : str or file + A path to a target pepXML file or the file object itself. + + Returns + ------- + out : list + A list of ROC points. + """ + + parser = etree.XMLParser(remove_comments=True, ns_clean=True) + tree = etree.parse(source, parser=parser) + + roc_curve = [] + for roc_error_data in tree.xpath( + "/*[local-name()='msms_pipeline_analysis'] \ + //*[local-name()='analysis_summary' and @analysis='peptideprophet'] \ + //*[local-name()='peptideprophet_summary'] \ + //*[local-name()='roc_error_data']"): + for element in roc_error_data.xpath("*[local-name()='roc_data_point' or local-name()='error_point']"): + data_point = dict(element.attrib) + for key in data_point: + data_point[key] = float(data_point[key]) + data_point["charge"] = roc_error_data.attrib["charge"] + data_point["tag"] = etree.QName(element).localname + roc_curve.append(data_point) + + return roc_curve + + +# chain = aux._make_chain(read, 'read') +chain = aux.ChainBase._make_chain(read) + + +def _is_decoy_prefix(psm, prefix='DECOY_'): + """Given a PSM dict, return :py:const:`True` if all protein names for + the PSM start with ``prefix``, and :py:const:`False` otherwise. This + function might not work for some pepXML flavours. Use the source to get the + idea and suit it to your needs. + + Parameters + ---------- + psm : dict + A dict, as yielded by :py:func:`read`. + prefix : str, optional + A prefix used to mark decoy proteins. Default is `'DECOY_'`. + + Returns + ------- + out : bool + """ + return all(protein['protein'].startswith(prefix) + for protein in psm['search_hit'][0]['proteins']) + + +def _is_decoy_suffix(psm, suffix='_DECOY'): + return all(protein['protein'].endswith(suffix) + for protein in psm['search_hit'][0]['proteins']) + + +is_decoy = _is_decoy_prefix +fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix) +_key = lambda x: min(sh['search_score']['expect'] for sh in x['search_hit']) +qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, _key) +filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, _key, qvalues) +filter.chain = aux._make_chain(filter, 'filter', True) + + +def DataFrame(*args, **kwargs): + """Read pepXML output files into a :py:class:`pandas.DataFrame`. + + Requires :py:mod:`pandas`. + + Parameters + ---------- + *args + pepXML file names or objects. Passed to :py:func:`chain`. + + **kwargs + Passed to :py:func:`chain`. + + sep : str or None, keyword only, optional + Some values related to PSMs (such as protein information) are variable-length + lists. If `sep` is a :py:class:`str`, they will be packed into single string using + this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is + :py:const:`None`. + + recursive : bool, keyword only, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + iterative : bool, keyword only, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. 
+ + read_schema : bool, keyword only, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the mzIdentML header. Otherwise, use default parameters. + Not recommended without Internet connection or + if you don't like to get the related warnings. + + pd_kwargs : dict, optional + Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor. + + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + kwargs = kwargs.copy() + sep = kwargs.pop('sep', None) + pd_kwargs = kwargs.pop('pd_kwargs', {}) + def gen_items(): + with chain(*args, **kwargs) as f: + for item in f: + info = {} + for k, v in item.items(): + if isinstance(v, (str, int, float)): + info[k] = v + if 'search_hit' in item: + sh = item['search_hit'][0] + proteins = sh.pop('proteins') + prot_dict = {} + for p in proteins: + for k in p: + prot_dict[k] = [] + for p in proteins: + for k, v in prot_dict.items(): + v.append(p.get(k)) + if sep is None: + info.update(prot_dict) + else: + for k, v in prot_dict.items(): + info[k] = sep.join(str(val) if val is not None else '' for val in v) + info.update(sh.pop('search_score')) + mods = sh.pop('modifications', []) + formatted_mods = ['{0[mass]:.3f}@{0[position]}'.format(x) for x in mods] + if sep is not None: + info['modifications'] = sep.join(formatted_mods) + else: + info['modifications'] = formatted_mods + for k, v in sh.items(): + if isinstance(v, (str, int, float)): + info[k] = v + if 'analysis_result' in sh: + for ar in sh['analysis_result']: + if ar['analysis'] == 'peptideprophet': + try: + info.update(ar['peptideprophet_result']['parameter']) + except KeyError: + pass + info['peptideprophet_probability'] = ar['peptideprophet_result']['probability'] + info['peptideprophet_ntt_prob'] = ar['peptideprophet_result']['all_ntt_prob'] + elif ar['analysis'] == 'interprophet': + info.update(ar['interprophet_result']['parameter']) + info['interprophet_probability'] = ar['interprophet_result']['probability'] + info['interprophet_ntt_prob'] = ar['interprophet_result']['all_ntt_prob'] + yield info + return pd.DataFrame(gen_items(), **pd_kwargs) + + +def filter_df(*args, **kwargs): + """Read pepXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. + Positional arguments can be pepXML files or DataFrames. Keyword parameter `fdr` is also required. + Other parameters are optional. + + Requires :py:mod:`pandas`. + + Parameters + ---------- + positional args + pepXML file names, file objects, or DataFrames. Passed to :py:func:`DataFrame`. + fdr : float, keyword only, 0 <= fdr <= 1 + Desired FDR level. + key : str / iterable / callable, keyword only, optional + PSM score. Default is 'expect'. + is_decoy : str / iterable / callable, keyword only, optional + Default is to check if all strings in the "protein" column start with `'DECOY_'`. + sep : str or None, keyword only, optional + Some values related to PSMs (such as protein information) are variable-length + lists. If `sep` is a :py:class:`str`, they will be packed into single string using + this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is + :py:const:`None`. + reverse : bool, keyword only, optional + If :py:const:`True`, then PSMs are sorted in descending order, + i.e. the value of the key function is higher for better PSMs. + Default is :py:const:`False`. + decoy_prefix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name prefix to use to detect decoy matches. 
If you provide your own + `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. + Default is `"DECOY_"`. + decoy_suffix : str, optional + If the default `is_decoy` function works for you, this parameter specifies which + protein name suffix to use to detect decoy matches. If you provide your own + `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. + remove_decoy : bool, keyword only, optional + Defines whether decoy matches should be removed from the output. + Default is :py:const:`True`. + + .. note:: If set to :py:const:`False`, then by default the decoy + PSMs will be taken into account when estimating FDR. Refer to the + documentation of :py:func:`fdr` for math; basically, if + `remove_decoy` is :py:const:`True`, then formula 1 is used + to control output FDR, otherwise it's formula 2. This can be + changed by overriding the `formula` argument. + + formula : int, keyword only, optional + Can be either 1 or 2, defines which formula should be used for FDR + estimation. Default is 1 if `remove_decoy` is :py:const:`True`, + else 2 (see :py:func:`fdr` for definitions). + ratio : float, keyword only, optional + The size ratio between the decoy and target databases. Default is + 1. In theory, the "size" of the database is the number of + theoretical peptides eligible for assignment to spectra that are + produced by *in silico* cleavage of that database. + correction : int or float, keyword only, optional + Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. + + 0 (default): no correction; + + 1: enable "+1" correction. This accounts for the probability that a false + positive scores better than the first excluded decoy PSM; + + 2: this also corrects that probability for finite size of the sample, + so the correction will be slightly less than "+1". + + If a floating point number + is given, then instead of the expectation value for the number of false PSMs, + the confidence value is used. The value of `correction` is then interpreted as + desired confidence level. E.g., if correction=0.95, then the calculated q-values + do not exceed the "real" q-values with 95% probability. + + See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. + + pep : callable / array-like / iterable / str, keyword only, optional + If callable, a function used to determine the posterior error probability (PEP). + Should accept exactly one argument (PSM) and return a float. + If array-like, should contain float values for all given PSMs. + If string, it is used as a field name (PSMs must be in a record array + or a :py:class:`DataFrame`). + + .. note:: If this parameter is given, then PEP values will be used to calculate + q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with: + `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`. + `key` can still be provided. Without `key`, PSMs will be sorted by PEP. + + q_label : str, optional + Field name for q-value in the output. Default is ``'q'``. + + score_label : str, optional + Field name for score in the output. Default is ``'score'``. + + decoy_label : str, optional + Field name for the decoy flag in the output. Default is ``'is decoy'``. + + pep_label : str, optional + Field name for PEP in the output. Default is ``'PEP'``. 
+ + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + sep = kwargs.get('sep') + kwargs.setdefault('key', 'expect') + if all(isinstance(arg, pd.DataFrame) for arg in args): + if len(args) > 1: + df = pd.concat(args) + else: + df = args[0] + else: + read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs} + df = DataFrame(*args, **read_kw) + if 'is_decoy' not in kwargs: + if sep is not None: + if 'decoy_suffix' in kwargs: + kwargs['is_decoy'] = df['protein'].str.split(';').apply( + lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) + else: + kwargs['is_decoy'] = df['protein'].str.split(';').apply( + lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) + else: + if 'decoy_suffix' in kwargs: + kwargs['is_decoy'] = df['protein'].apply( + lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) + else: + kwargs['is_decoy'] = df['protein'].apply( + lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) + return aux.filter(df, **kwargs) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py new file mode 100644 index 0000000..c24792b --- /dev/null +++ b/pyteomics/proforma.py @@ -0,0 +1,2372 @@ +''' +proforma - Proteoform and Peptidoform Notation +============================================== + +ProForma is a notation for defining modified amino acid sequences using +a set of controlled vocabularies, as well as encoding uncertain or partial +information about localization. See `ProForma specification <https://www.psidev.info/proforma>`_ +for more up-to-date information. + +For more details, see the :mod:`pyteomics.proforma` online. +''' + +import re +import warnings +from collections import deque, namedtuple +from functools import partial +from array import array as _array + +try: + from enum import Enum +except ImportError: + # Python 2 doesn't have a builtin Enum type + Enum = object + +from .mass import Composition, std_aa_mass, Unimod, nist_mass, calculate_mass, std_ion_comp, mass_charge_ratio +from .auxiliary import PyteomicsError, BasicComposition +from .auxiliary.utils import add_metaclass + +try: + import numpy as np +except ImportError: + np = None + +try: + from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache, load_unimod) + _has_psims = True +except ImportError: + def _needs_psims(name): + raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims`" % name) + + load_psimod = partial(_needs_psims, 'PSIMOD') + load_xlmod = partial(_needs_psims, 'XLMOD') + load_gno = partial(_needs_psims, 'GNO') + load_unimod = partial(_needs_psims, 'UNIMOD') + obo_cache = None + _has_psims = False + +_WATER_MASS = calculate_mass(formula="H2O") + +std_aa_mass = std_aa_mass.copy() +std_aa_mass['X'] = 0 + +element_symbols = set(nist_mass) +element_symbols.remove("e*") +element_symbols.add('e') + + +class ProFormaError(PyteomicsError): + def __init__(self, message, index=None, parser_state=None, **kwargs): + super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state) + self.message = message + self.index = index + self.parser_state = parser_state + + +class PrefixSavingMeta(type): + '''A subclass-registering-metaclass that provides easy + lookup of subclasses by prefix attributes. 
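+
+    For example, looking a subclass up by its registered prefix (a sketch;
+    the modification tag classes are defined further below in this module):
+
+    >>> TagBase.find_by_tag('UNIMOD') is UnimodModification
+    True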
+ ''' + + def __new__(mcs, name, parents, attrs): + new_type = type.__new__(mcs, name, parents, attrs) + prefix = attrs.get("prefix_name") + if prefix: + new_type.prefix_map[prefix.lower()] = new_type + short = attrs.get("short_prefix") + if short: + new_type.prefix_map[short.lower()] = new_type + return new_type + + def find_by_tag(self, tag_name): + if tag_name is None: + raise ValueError("tag_name cannot be None!") + tag_name = tag_name.lower() + return self.prefix_map[tag_name] + + +class TagTypeEnum(Enum): + unimod = 0 + psimod = 1 + massmod = 2 + generic = 3 + info = 4 + gnome = 5 + xlmod = 6 + + formula = 7 + glycan = 8 + + localization_marker = 9 + position_label = 10 + group_placeholder = 999 + + +class ModificationTagStyle(Enum): + Unset = 0 + ShortId = 1 + LongId = 2 + ShortName = 3 + LongName = 4 + + +_sentinel = object() + + +class ModificationMassNotFoundError(ProFormaError): + pass + + +class UnknownMonosaccharideError(ProFormaError): + pass + + +@add_metaclass(PrefixSavingMeta) +class TagBase(object): + '''A base class for all tag types. + + Attributes + ---------- + type: Enum + An element of :class:`TagTypeEnum` saying what kind of tag this is. + value: object + The data stored in this tag, usually an externally controlled name + extra: list + Any extra tags that were nested within this tag. Usually limited to INFO + tags but may be other synonymous controlled vocabulary terms. + group_id: str or None + A short label denoting which group, if any, this tag belongs to + ''' + __slots__ = ("type", "value", "extra", "group_id") + + prefix_name = None + short_prefix = None + prefix_map = {} + + def __init__(self, type, value, extra=None, group_id=None): + self.type = type + self.value = value + self.extra = extra + self.group_id = group_id + + def __str__(self): + part = self._format_main() + had_marker = False + if self.extra: + rest = [] + for e in self.extra: + rest.append(str(e)) + had_marker |= isinstance(e, GroupLabelBase) and e.group_id == self.group_id + label = '|'.join([part] + rest) + else: + label = part + if self.group_id and not had_marker: + label = '%s%s' % (label, self.group_id) + return '%s' % label + + def __repr__(self): + template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})" + return template.format(self=self) + + def __eq__(self, other): + if other is None: + return False + if isinstance(other, str): + return str(self) == other + return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \ + and (self.group_id == other.group_id) + + def __ne__(self, other): + return not self == other + + def find_tag_type(self, tag_type): + '''Search this tag or tag collection for elements with a particular + tag type and return them. + + Parameters + ---------- + tag_type : TagTypeEnum + A label from :class:`TagTypeEnum`, or an equivalent type. + + Returns + ------- + matches : list + The list of all tags in this object which match the requested tag type. 
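+
+        A short sketch (the INFO annotation text is illustrative):
+
+        >>> tag = TagBase.parse('UNIMOD:4|INFO:assigned by hand')
+        >>> [t.value for t in tag.find_tag_type(TagTypeEnum.info)]
+        ['assigned by hand']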
+ ''' + out = [] + if self.type == tag_type: + out.append(self) + if not self.extra: + return out + for e in self.extra: + if e.type == tag_type: + out.append(e) + return out + + @classmethod + def parse(cls, buffer): + return process_tag_tokens(buffer) + + +class GroupLabelBase(TagBase): + __slots__ = () + + def __str__(self): + part = self._format_main() + if self.extra: + rest = [str(e) for e in self.extra] + label = '|'.join([part] + rest) + else: + label = part + return '%s' % label + + +class PositionLabelTag(GroupLabelBase): + '''A tag to mark that a position is involved in a group in some way, but does + not imply any specific semantics. + ''' + __slots__ = () + + def __init__(self, value=None, extra=None, group_id=None): + assert group_id is not None + value = group_id + super(PositionLabelTag, self).__init__( + TagTypeEnum.position_label, value, extra, group_id) + + def _format_main(self): + return "{self.group_id}".format(self=self) + + +class LocalizationMarker(GroupLabelBase): + '''A tag to mark a particular localization site + ''' + __slots__ = () + + def __init__(self, value, extra=None, group_id=None): + assert group_id is not None + super(LocalizationMarker, self).__init__( + TagTypeEnum.localization_marker, float(value), extra, group_id) + + def _format_main(self): + return "{self.group_id}({self.value:.4g})".format(self=self) + + +class InformationTag(TagBase): + '''A tag carrying free text describing the location + ''' + __slots__ = () + + prefix_name = "INFO" + + def __init__(self, value, extra=None, group_id=None): + super(InformationTag, self).__init__( + TagTypeEnum.info, str(value), extra, group_id) + + def _format_main(self): + return str(self.value) + + +class ModificationResolver(object): + def __init__(self, name, **kwargs): + self.name = name.lower() + self.symbol = self.name[0] + self._database = None + + def load_database(self): + raise NotImplementedError() + + @property + def database(self): + if not self._database: + self._database = self.load_database() + return self._database + + @database.setter + def database(self, database): + self._database = database + + def parse_identifier(self, identifier): + """Parse a string that is either a CV prefixed identifier or name. + + Parameters + ---------- + identifier : str + The identifier string to parse, removing CV prefix as needed. + + Returns + ------- + name : str, optional + A textual identifier embedded in the qualified identifier, if any, otherwise + :const:`None`. + id : int, optional + An integer ID embedded in the qualified identifier, if any, otherwise + :const:`None`. 
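+
+        A sketch using the Unimod subclass defined below (identifier parsing
+        does not touch the database):
+
+        >>> UnimodResolver().parse_identifier('UNIMOD:4')
+        (None, 4)
+        >>> UnimodResolver().parse_identifier('Oxidation')
+        ('Oxidation', None)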
+ """ + tokens = identifier.split(":", 1) + if len(tokens) > 1: + prefix = tokens[0].lower() + if prefix == self.name or prefix == self.symbol: + identifier = tokens[1] + + if identifier.isdigit(): + id = int(identifier) + name = None + else: + name = identifier + id = None + return name, id + + def resolve(self, name=None, id=None, **kwargs): + raise NotImplementedError() + + def __call__(self, name=None, id=None, **kwargs): + return self.resolve(name, id, **kwargs) + + def __eq__(self, other): + return self.name == other.name + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self.name) + + +class UnimodResolver(ModificationResolver): + def __init__(self, **kwargs): + super(UnimodResolver, self).__init__("unimod", **kwargs) + self._database = kwargs.get("database") + self.strict = kwargs.get("strict", True) + + def load_database(self): + if _has_psims: + return obo_cache.resolve("http://www.unimod.org/obo/unimod.obo") + return Unimod() + + def resolve(self, name=None, id=None, **kwargs): + strict = kwargs.get("strict", self.strict) + exhaustive = kwargs.get("exhaustive", True) + if name is not None: + defn = self.database.by_title(name, strict=strict) + if not defn: + defn = self.database.by_name(name, strict=strict) + if not defn and exhaustive and strict: + defn = self.database.by_title(name, strict=False) + if not defn: + defn = self.database.by_name(name, strict=False) + if defn and isinstance(defn, list): + warnings.warn( + "Multiple matches found for {!r} in Unimod, taking the first, {}.".format( + name, defn[0]['record_id'])) + defn = defn[0] + if not defn: + raise KeyError(name) + elif id is not None: + defn = self.database[id] + if not defn: + raise KeyError(id) + else: + raise ValueError("Must provide one of `name` or `id`") + if isinstance(defn, dict): + return { + 'composition': defn['composition'], + 'name': defn['title'], + 'id': defn['record_id'], + 'mass': defn['mono_mass'], + 'provider': self.name, + "source": self + } + else: + name = defn.ex_code_name + if not name: + name = defn.code_name + return { + "composition": defn.composition, + "name": name, + "id": defn.id, + "mass": defn.monoisotopic_mass, + "provider": self.name, + "source": self + } + + +class PSIModResolver(ModificationResolver): + def __init__(self, **kwargs): + super(PSIModResolver, self).__init__('psimod', **kwargs) + self._database = kwargs.get("database") + + def load_database(self): + return load_psimod() + + def resolve(self, name=None, id=None, **kwargs): + if name is not None: + defn = self.database[name] + elif id is not None: + defn = self.database['MOD:{:05d}'.format(id)] + else: + raise ValueError("Must provide one of `name` or `id`") + try: + mass = float(defn.DiffMono) + except (KeyError, TypeError, ValueError): + raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn)) + if defn.DiffFormula is not None: + composition = Composition() + diff_formula_tokens = defn.DiffFormula.strip().split(" ") + for i in range(0, len(diff_formula_tokens), 2): + element = diff_formula_tokens[i] + count = diff_formula_tokens[i + 1] + if count: + count = int(count) + if element.startswith("("): + j = element.index(")") + isotope = element[1:j] + element = "%s[%s]" % (element[j + 1:], isotope) + composition[element] += count + else: + composition = None + warnings.warn("No formula was found for %r in PSI-MOD, composition will be missing" % ((name, id), )) + return { + 'mass': mass, + 'composition': composition, + 'name': 
defn.name,
+            'id': defn.id,
+            'provider': self.name,
+            "source": self
+        }
+
+
+class XLMODResolver(ModificationResolver):
+    def __init__(self, **kwargs):
+        super(XLMODResolver, self).__init__('xlmod', **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_xlmod()
+
+    def resolve(self, name=None, id=None, **kwargs):
+        if name is not None:
+            defn = self.database[name]
+        elif id is not None:
+            defn = self.database['XLMOD:{:05d}'.format(id)]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        try:
+            mass = float(defn['monoIsotopicMass'])
+        except (KeyError, TypeError, ValueError):
+            raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn))
+        if 'deadEndFormula' in defn:
+            composition = Composition(defn['deadEndFormula'].replace(" ", '').replace("D", "H[2]"))
+        elif 'bridgeFormula' in defn:
+            composition = Composition(
+                defn['bridgeFormula'].replace(" ", '').replace("D", "H[2]"))
+        else:
+            # Neither formula property is present; leave the composition
+            # unknown instead of raising a NameError below.
+            composition = None
+        return {
+            'mass': mass,
+            'composition': composition,
+            'name': defn.name,
+            'id': defn.id,
+            'provider': self.name,
+            "source": self
+        }
+
+
+# TODO: Implement resolve walking up the graph to get the mass. Can't really
+# get any more information without glypy/glyspace interaction
+class GNOResolver(ModificationResolver):
+    mass_pattern = re.compile(r"(\d+(?:\.\d+)) Da")
+
+    def __init__(self, **kwargs):
+        super(GNOResolver, self).__init__('gnome', **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_gno()
+
+    def get_mass_from_glycan_composition(self, term):
+        '''Parse the Byonic-style glycan composition from property GNO:00000202
+        to get the counts of each monosaccharide and use that to calculate mass.
+
+        The mass computed here is exact and dehydrated, distinct from the rounded-off
+        mass that :meth:`get_mass_from_term` will produce by walking up the CV term
+        hierarchy. However, not all glycan compositions are representable in GNO:00000202
+        format, so this may silently be absent or incomplete, hence the double-check in
+        :meth:`get_mass_from_term`.
+
+        Parameters
+        ----------
+        term : psims.controlled_vocabulary.Entity
+            The CV entity being parsed.
+
+        Returns
+        -------
+        mass : float or :const:`None`
+        monosaccharides : BasicComposition or :const:`None`
+        composition : Composition or :const:`None`
+            If a glycan composition is found on the term, the computed mass,
+            monosaccharide counts and chemical composition are returned.
+            Otherwise, a triple of :const:`None` is returned.
+        '''
+        val = term.get('GNO:00000202')
+        monosaccharides = BasicComposition()
+        composition = Composition()
+        if val:
+            tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val)
+            mass = 0.0
+            for symbol, count in tokens:
+                count = int(count)
+                try:
+                    mono_mass, mono_comp, symbol = GlycanModification.valid_monosaccharides[symbol]
+                    mass += mono_mass * count
+                    composition += mono_comp * count
+                    monosaccharides[symbol] += count
+                except KeyError:
+                    continue
+            return mass, monosaccharides, composition
+        return None, None, None
+
+    def get_mass_from_term(self, term, raw_mass):
+        '''Walk up the term hierarchy and find the mass group
+        term near the root of the tree, and return the most accurate
+        mass available for the provided term.
+
+        The mass group term's mass is rounded to two decimal places, leading
+        to relatively large errors.
+
+        Parameters
+        ----------
+        term : psims.controlled_vocabulary.Entity
+            The CV entity being parsed.
+        raw_mass : float or :const:`None`
+            The exact mass computed from an explicit glycan composition, if one
+            was available, used to double-check the rounded-off mass group value.
+
+        Returns
+        -------
+        mass : float or :const:`None`
+            If a root node is found along the term's lineage, the computed
+            mass will be returned. Otherwise :const:`None` is returned.
+            The mass may be a rough approximation if only the coarse mass
+            group near the root of the hierarchy could be used.
+        '''
+        root_id = 'GNO:00000001'
+        parent = term.parent()
+        if isinstance(parent, list):
+            parent = parent[0]
+        while parent.id != root_id:
+            next_parent = parent.parent()
+            if isinstance(next_parent, list):
+                next_parent = next_parent[0]
+            if next_parent.id == root_id:
+                break
+            parent = next_parent
+        match = self.mass_pattern.search(parent.name)
+        if not match:
+            return None
+        # This will have a small mass error.
+        rough_mass = float(match.group(1)) - _WATER_MASS
+        if raw_mass is not None and abs(rough_mass - raw_mass) < 1:
+            return raw_mass
+        warnings.warn(
+            ("An accurate glycan composition could not be inferred from %s. "
+             "Only a rough approximation is available.") % (term, ))
+        return rough_mass
+
+    def resolve(self, name=None, id=None, **kwargs):
+        if name is not None:
+            term = self.database[name]
+        elif id is not None:
+            term = self.database[id]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        raw_mass, monosaccharides, composition = self.get_mass_from_glycan_composition(term)
+
+        rec = {
+            "name": term.name,
+            "id": term.id,
+            "provider": self.name,
+            "composition": composition,
+            "monosaccharides": monosaccharides,
+            "mass": self.get_mass_from_term(term, raw_mass),
+            "source": self
+        }
+        return rec
+
+
+class GenericResolver(ModificationResolver):
+
+    def __init__(self, resolvers, **kwargs):
+        super(GenericResolver, self).__init__('generic', **kwargs)
+        self.resolvers = list(resolvers)
+
+    def load_database(self):
+        return None
+
+    def parse_identifier(self, identifier):
+        """Parse a string that is either a CV prefixed identifier or name.
+
+        Does no parsing, as a :class:`GenericModification` is never qualified.
+
+        Parameters
+        ----------
+        identifier : str
+            The identifier string to parse, removing CV prefix as needed.
+
+        Returns
+        -------
+        name : str, optional
+            A textual identifier embedded in the qualified identifier, if any, otherwise
+            :const:`None`.
+        id : int, optional
+            An integer ID embedded in the qualified identifier, if any, otherwise
+            :const:`None`.
+        """
+        return identifier, None
+
+    def resolve(self, name=None, id=None, **kwargs):
+        defn = None
+        for resolver in self.resolvers:
+            try:
+                defn = resolver(name=name, id=id, **kwargs)
+                break
+            except KeyError:
+                continue
+            except ModificationMassNotFoundError:
+                warnings.warn("Could not resolve the mass for %r in %r" % ((name, id), resolver))
+                continue
+        if defn is None:
+            if name is None:
+                raise KeyError(id)
+            elif id is None:
+                raise KeyError(name)
+            else:
+                raise ValueError("Must provide one of `name` or `id`")
+        return defn
+
+
+class ModificationBase(TagBase):
+    '''A base class for all modification tags with marked prefixes.
+
+    While :class:`ModificationBase` is hashable, its equality testing
+    brings in additional tag-related information. For pure modification
+    identity comparison, use :attr:`key` to get a :class:`ModificationToken`
+    free of these concerns.
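+
+    A sketch of the intended pattern, assuming the Unimod database can be
+    loaded (e.g. through :mod:`psims` and its cache):
+
+    >>> tag = UnimodModification('UNIMOD:4')
+    >>> tag.key == GenericModification('Carbamidomethyl').key
+    True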
+    '''
+
+    _tag_type = None
+    __slots__ = ('_definition', 'style')
+
+    def __init__(self, value, extra=None, group_id=None, style=None):
+        if style is None:
+            style = ModificationTagStyle.Unset
+        super(ModificationBase, self).__init__(
+            self._tag_type, value, extra, group_id)
+        self._definition = None
+        self.style = style
+
+    def __eq__(self, other):
+        if isinstance(other, ModificationToken):
+            return other == self
+        return super(ModificationBase, self).__eq__(other)
+
+    def __hash__(self):
+        return hash((self.id, self.provider))
+
+    @property
+    def key(self):
+        '''Get a safe-to-hash-and-compare :class:`ModificationToken`
+        representing this modification without tag-like properties.
+
+        Returns
+        -------
+        ModificationToken
+        '''
+        return ModificationToken(self.value, self.id, self.provider, self.__class__)
+
+    @property
+    def definition(self):
+        '''A :class:`dict` of properties describing this modification, given
+        by the providing controlled vocabulary. This value is cached, and
+        should not be modified.
+
+        Returns
+        -------
+        dict
+        '''
+        if self._definition is None:
+            self._definition = self.resolve()
+        return self._definition
+
+    @property
+    def mass(self):
+        '''The monoisotopic mass shift this modification applies.
+
+        Returns
+        -------
+        float
+        '''
+        return self.definition['mass']
+
+    @property
+    def composition(self):
+        '''The chemical composition shift this modification applies.'''
+        return self.definition.get('composition')
+
+    @property
+    def id(self):
+        '''The unique identifier given to this modification by its provider.
+
+        Returns
+        -------
+        str or int
+        '''
+        return self.definition.get('id')
+
+    @property
+    def name(self):
+        '''The primary name of this modification from its provider.
+
+        Returns
+        -------
+        str
+        '''
+        return self.definition.get('name')
+
+    @property
+    def provider(self):
+        '''The name of the controlled vocabulary that provided this
+        modification.
+
+        Returns
+        -------
+        str
+        '''
+        return self.definition.get('provider')
+
+    def _populate_from_definition(self, definition):
+        self._definition = definition
+
+    def _format_main(self):
+        if self.style == ModificationTagStyle.Unset or self.style is None:
+            return "{self.prefix_name}:{self.value}".format(self=self)
+        elif self.style == ModificationTagStyle.LongId:
+            return "{self.prefix_name}:{self.id}".format(self=self)
+        elif self.style == ModificationTagStyle.ShortId:
+            return "{self.short_prefix}:{self.id}".format(self=self)
+        elif self.style == ModificationTagStyle.LongName:
+            return "{self.prefix_name}:{self.name}".format(self=self)
+        elif self.style == ModificationTagStyle.ShortName:
+            return "{self.short_prefix}:{self.name}".format(self=self)
+        else:
+            warnings.warn("Unknown formatting style {!r}".format(self.style))
+            return "{self.prefix_name}:{self.value}".format(self=self)
+
+    def resolve(self):
+        '''Find the term and return its properties.
+        '''
+        keys = self.resolver.parse_identifier(self.value)
+        return self.resolver(*keys)
+
+
+class MassModification(TagBase):
+    '''A modification defined purely by a signed mass shift in Daltons.
+
+    The value of a :class:`MassModification` is always a :class:`float`.
+    '''
+    __slots__ = ('_significant_figures', )
+
+    prefix_name = "Obs"
+
+    def __init__(self, value, extra=None, group_id=None):
+        if isinstance(value, str):
+            sigfigs = len(value.split('.')[-1].rstrip('0'))
+        else:
+            sigfigs = 4
+        self._significant_figures = sigfigs
+        super(MassModification, self).__init__(
+            TagTypeEnum.massmod, float(value), extra, group_id)
+
+    def _format_main(self):
+        if self.value >= 0:
+            return ('+{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.')
+        else:
+            return ('{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.')
+
+    @property
+    def provider(self):
+        return None
+
+    @property
+    def id(self):
+        return self._format_main()
+
+    @property
+    def key(self):
+        '''Get a safe-to-hash-and-compare :class:`ModificationToken`
+        representing this modification without tag-like properties.
+
+        Returns
+        -------
+        ModificationToken
+        '''
+        return ModificationToken(self.value, self.id, self.provider, self.__class__)
+
+    @property
+    def mass(self):
+        return self.value
+
+    def __eq__(self, other):
+        if isinstance(other, ModificationToken):
+            return other == self
+        return super(MassModification, self).__eq__(other)
+
+    def __hash__(self):
+        return hash((self.id, self.provider))
+
+
+class FormulaModification(ModificationBase):
+    prefix_name = "Formula"
+
+    isotope_pattern = re.compile(r'\[(?P<isotope>\d+)(?P<element>[A-Z][a-z]*)(?P<quantity>[\-+]?\d+)\]')
+    _tag_type = TagTypeEnum.formula
+
+    def _normalize_isotope_notation(self, match):
+        '''Rewrite ProForma isotope notation to Pyteomics-compatible
+        isotope notation.
+
+        Parameters
+        ----------
+        match : Match
+            The matched isotope notation string parsed by the regular expression.
+
+        Returns
+        -------
+        reformatted : str
+            The re-written isotope notation
+        '''
+        parts = match.groupdict()
+        return "{element}[{isotope}]{quantity}".format(**parts)
+
+    def resolve(self):
+        normalized = self.value.replace(' ', '')
+        # If there is a [ character in the formula, we know there are isotopes which
+        # need to be normalized.
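+        # e.g. "[13C2][12C-2]H2N" is rewritten to "C[13]2C[12]-2H2N", the
+        # isotope notation understood by :class:`Composition`.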
+ if '[' in normalized: + normalized = self.isotope_pattern.sub(self._normalize_isotope_notation, normalized) + composition = Composition(formula=normalized) + return { + "mass": composition.mass(), + "composition": composition, + "name": self.value + } + + +monosaccharide_description = namedtuple('monosaccharide_description', ('mass', 'composition', "symbol")) + + +class GlycanModification(ModificationBase): + prefix_name = "Glycan" + + _tag_type = TagTypeEnum.glycan + + valid_monosaccharides = { + "Hex": monosaccharide_description(162.0528, Composition("C6H10O5"), 'Hex'), + "HexNAc": monosaccharide_description(203.0793, Composition("C8H13N1O5"), 'HexNAc'), + "HexS": monosaccharide_description(242.009, Composition("C6H10O8S1"), 'HexS'), + "HexP": monosaccharide_description(242.0191, Composition("C6H11O8P1"), 'HexP'), + "HexNAcS": monosaccharide_description(283.0361, Composition("C8H13N1O8S1"), 'HexNAcS'), + "dHex": monosaccharide_description(146.0579, Composition("C6H10O4"), 'dHex'), + "NeuAc": monosaccharide_description(291.0954, Composition("C11H17N1O8"), 'NeuAc'), + "NeuGc": monosaccharide_description(307.0903, Composition("C11H17N1O9"), 'NeuGc'), + "Pen": monosaccharide_description(132.0422, Composition("C5H8O4"), 'Pen'), + "Fuc": monosaccharide_description(146.0579, Composition("C6H10O4"), 'Fuc') + } + + valid_monosaccharides['Neu5Ac'] = valid_monosaccharides['NeuAc'] + valid_monosaccharides['Neu5Gc'] = valid_monosaccharides['NeuGc'] + valid_monosaccharides['Pent'] = valid_monosaccharides['Pen'] + valid_monosaccharides['d-Hex'] = valid_monosaccharides['dHex'] + + monomer_tokenizer = re.compile( + r"|".join(sorted(valid_monosaccharides.keys(), key=len, reverse=True))) + tokenizer = re.compile(r"(%s|[A-Za-z]+)\s*(\d*)\s*" % monomer_tokenizer.pattern) + + @property + def monosaccharides(self): + return self.definition.get('monosaccharides') + + def resolve(self): + composite = BasicComposition() + for tok, cnt in self.tokenizer.findall(self.value): + if cnt: + cnt = int(cnt) + else: + cnt = 1 + if tok not in self.valid_monosaccharides: + parts = self.monomer_tokenizer.findall(tok) + t = 0 + for p in parts: + if p not in self.valid_monosaccharides: + break + t += len(p) + if t != len(tok): + raise ValueError("{tok!r} is not a valid monosaccharide name".format(tok=tok)) + else: + for p in parts[:-1]: + sym = self.valid_monosaccharides[p].symbol + composite[sym] += 1 + sym = self.valid_monosaccharides[parts[-1]].symbol + composite[sym] += cnt + else: + sym = self.valid_monosaccharides[tok].symbol + composite[sym] += cnt + mass = 0 + chemcomp = Composition() + for key, cnt in composite.items(): + try: + m, c, sym = self.valid_monosaccharides[key] + except KeyError: + raise UnknownMonosaccharideError(key) + mass += m * cnt + chemcomp += c * cnt + return { + "mass": mass, + "composition": chemcomp, + "name": self.value, + "monosaccharides": composite + } + + +class UnimodModification(ModificationBase): + __slots__ = () + + resolver = UnimodResolver() + + prefix_name = "UNIMOD" + short_prefix = "U" + _tag_type = TagTypeEnum.unimod + + +class PSIModModification(ModificationBase): + __slots__ = () + + resolver = PSIModResolver() + + prefix_name = "MOD" + short_prefix = 'M' + _tag_type = TagTypeEnum.psimod + + +class GNOmeModification(ModificationBase): + __slots__ = () + + resolver = GNOResolver() + + prefix_name = "GNO" + short_prefix = 'G' + _tag_type = TagTypeEnum.gnome + + @property + def monosaccharides(self): + return self.definition.get('monosaccharides') + + +class 
XLMODModification(ModificationBase): + __slots__ = () + + resolver = XLMODResolver() + + prefix_name = "XLMOD" + # short_prefix = 'XL' + _tag_type = TagTypeEnum.xlmod + + +class GenericModification(ModificationBase): + __slots__ = () + _tag_type = TagTypeEnum.generic + resolver = GenericResolver([ + # Do exact matching here first. Then default to non-strict matching as a final + # correction effort. + partial(UnimodModification.resolver, exhaustive=False), + PSIModModification.resolver, + XLMODModification.resolver, + GNOmeModification.resolver, + # Some really common names aren't actually found in the XML exactly, so default + # to non-strict matching now to avoid masking other sources here. + partial(UnimodModification.resolver, strict=False) + ]) + + def __init__(self, value, extra=None, group_id=None): + super(GenericModification, self).__init__( + value, extra, group_id) + + def _format_main(self): + return self.value + + def resolve(self): + '''Find the term, searching through all available vocabularies and + return the first match's properties + ''' + keys = self.resolver.parse_identifier(self.value) + defn = self.resolver(*keys) + if defn is not None: + return defn + raise KeyError(keys) + + +def set_unimod_path(path): + '''Set the path to load the Unimod database from for resolving + ProForma Unimod modifications. + + .. note:: + + This method ensures that the Unimod modification database loads + quickly from a local database file instead of downloading a new + copy from the internet. + + Parameters + ---------- + path : str or file-like object + A path to or file-like object for the "unimod.xml" file. + + Returns + ------- + :class:`~pyteomics.mass.mass.Unimod` + ''' + db = Unimod(path) + UnimodModification.resolver.database = db + return db + + +class ModificationToken(object): + '''Describes a particular modification from a particular provider, independent + of a :class:`TagBase`'s state. + + This class is meant to be used in place of a :class:`ModificationBase` object + when equality testing and hashing is desired, but do not want extra properties + to be involved. + + :class:`ModificationToken` is comparable and hashable, and can be compared with + :class:`ModificationBase` subclass instances safely. It can be called to create + a new instance of the :class:`ModificationBase` it is equal to. + + Attributes + ---------- + name : str + The name of the modification being represented, as the user specified it. + id : int or str + Whatever unique identifier the providing controlled vocabulary gave to this + modification + provider : str + The name of the providing controlled vocabulary. + source_cls : type + A sub-class of :class:`ModificationBase` that will be used to fulfill this + token if requested, providing it a resolver. + ''' + __slots__ = ('name', 'id', 'provider', 'source_cls') + + def __init__(self, name, id, provider, source_cls): + self.name = name + self.id = id + self.provider = provider + self.source_cls = source_cls + + def __eq__(self, other): + if other is None: + return False + if isinstance(other, (ModificationToken, ModificationBase, MassModification)): + return self.id == other.id and self.provider == other.provider + return False + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash((self.id, self.provider)) + + def __call__(self): + '''Create a new :class:`ModificationBase` + instance from the provided :attr:`name` + against :attr:`source_cls`'s resolver. 
+ + Returns + ------- + ModificationBase + ''' + return self.source_cls(self.name) + + def __repr__(self): + template = "{self.__class__.__name__}({self.name!r}, {self.id!r}, {self.provider!r}, {self.source_cls})" + return template.format(self=self) + + +def split_tags(tokens): + '''Split a token array into discrete sets of tag + tokens. + + Parameters + ---------- + tokens: list + The characters of the tag token buffer + + Returns + ------- + list of list: + The tokens for each contained tag + ''' + starts = [0] + ends = [] + for i, c in enumerate(tokens): + if c == '|': + ends.append(i) + starts.append(i + 1) + elif (i != 0 and c == '#'): + ends.append(i) + starts.append(i) + ends.append(len(tokens)) + out = [] + for i, start in enumerate(starts): + end = ends[i] + tag = tokens[start:end] + if len(tag) == 0: + continue + # Short circuit on INFO tags which can't be broken + # if (tag[0] == 'i' and tag[:5] == ['i', 'n', 'f', 'o', ':']) or (tag[0] == 'I' and tag[:5] == ['I', 'N', 'F', 'O', ':']): + # tag = tokens[start:] + # out.append(tag) + # break + out.append(tag) + return out + + +def find_prefix(tokens): + '''Find the prefix, if any of the tag defined by `tokens` + delimited by ":". + + Parameters + ---------- + tokens: list + The tag tokens to search + + Returns + ------- + prefix: str or None + The prefix string, if found + rest: str + The rest of the tokens, merged as a string + ''' + for i, c in enumerate(tokens): + if c == ':': + return ''.join(tokens[:i]), ''.join(tokens[i + 1:]) + return None, ''.join(tokens) + + +def process_marker(tokens): + '''Process a marker, which is a tag whose value starts with #. + + Parameters + ---------- + tokens: list + The tag tokens to parse + + Returns + ------- + PositionLabelTag or LocalizationMarker + ''' + if tokens[1:3] == 'XL': + return PositionLabelTag(None, group_id=''.join(tokens)) + else: + group_id = None + value = None + for i, c in enumerate(tokens): + if c == '(': + group_id = ''.join(tokens[:i]) + if tokens[-1] != ')': + raise Exception( + "Localization marker with score missing closing parenthesis") + value = float(''.join(tokens[i + 1:-1])) + return LocalizationMarker(value, group_id=group_id) + else: + group_id = ''.join(tokens) + return PositionLabelTag(group_id=group_id) + + +def process_tag_tokens(tokens): + '''Convert a tag token buffer into a parsed :class:`TagBase` instance + of the appropriate sub-type with zero or more sub-tags. 
+ + Parameters + ---------- + tokens: list + The tokens to parse + + Returns + ------- + TagBase: + The parsed tag + ''' + parts = split_tags(tokens) + main_tag = parts[0] + if main_tag[0] in ('+', '-'): + main_tag = ''.join(main_tag) + main_tag = MassModification(main_tag) + elif main_tag[0] == '#': + main_tag = process_marker(main_tag) + else: + prefix, value = find_prefix(main_tag) + if prefix is None: + main_tag = GenericModification(''.join(value)) + else: + try: + tag_type = TagBase.find_by_tag(prefix) + main_tag = tag_type(value) + except KeyError: + main_tag_str = ''.join(main_tag) + main_tag = GenericModification(main_tag_str) + + if len(parts) > 1: + extras = [] + for part in parts[1:]: + prefix, value = find_prefix(part) + if prefix is None: + if value[0] == "#": + marker = process_marker(value) + if isinstance(marker, PositionLabelTag): + main_tag.group_id = ''.join(value) + else: + main_tag.group_id = marker.group_id + extras.append(marker) + else: + extras.append(GenericModification(''.join(value))) + else: + try: + tag_type = TagBase.find_by_tag(prefix) + extra_tag = tag_type(value) + except KeyError: + part_str = ''.join(part) + extra_tag = GenericModification(part_str) + extras.append(extra_tag) + main_tag.extra = extras + return main_tag + + +class ModificationRule(object): + '''Define a fixed modification rule which dictates a modification tag is + always applied at one or more amino acid residues. + + Attributes + ---------- + modification_tag: TagBase + The modification to apply + targets: list + The list of amino acids this applies to + ''' + __slots__ = ('modification_tag', 'targets') + + def __init__(self, modification_tag, targets=None): + self.modification_tag = modification_tag + self.targets = targets + + def __eq__(self, other): + if other is None: + return False + return self.modification_tag == other.modification_tag and self.targets == other.targets + + def __ne__(self, other): + return not self == other + + def __str__(self): + targets = ','.join(self.targets) + return "<[{self.modification_tag}]@{targets}>".format(self=self, targets=targets) + + def __repr__(self): + return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self) + + +class StableIsotope(object): + '''Define a fixed isotope that is applied globally to all amino acids. + + Attributes + ---------- + isotope: str + The stable isotope string, of the form [<isotope-number>]<element> or a special + isotopoform's name. + ''' + __slots__ = ('isotope', ) + + def __init__(self, isotope): + self.isotope = isotope + + def __eq__(self, other): + if other is None: + return False + return self.isotope == other.isotope + + def __ne__(self, other): + return not self == other + + def __str__(self): + return "<{self.isotope}>".format(self=self) + + def __repr__(self): + return "{self.__class__.__name__}({self.isotope})".format(self=self) + + +class IntersectionEnum(Enum): + no_overlap = 0 + full_contains_interval = 1 + full_contained_in_interval = 2 + start_overlap = 3 + end_overlap = 4 + + +class TaggedInterval(object): + '''Define a fixed interval over the associated sequence which contains the localization + of the associated tag or denotes a region of general sequence order ambiguity. 
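+
+    For example, in ``PRT(ESFRMS)[+19.0523]ISK`` the mass tag is localized to
+    the parenthesized range, giving an interval with :attr:`start` 3 and
+    :attr:`end` 9.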
+ + Attributes + ---------- + start: int + The starting position (inclusive) of the interval along the primary sequence + end: int + The ending position (exclusive) of the interval along the primary sequence + tags: list[TagBase] + The tags being localized + ambiguous : bool + Whether the interval is ambiguous or not + ''' + __slots__ = ('start', 'end', 'tags', 'ambiguous') + + def __init__(self, start, end=None, tags=None, ambiguous=False): + self.start = start + self.end = end + self.tags = tags + self.ambiguous = ambiguous + + def __eq__(self, other): + if other is None: + return False + return self.start == other.start and self.end == other.end and self.tags == other.tags + + def __ne__(self, other): + return not self == other + + def __str__(self): + return "({self.start}-{self.end}){self.tags!r}".format(self=self) + + def __repr__(self): + return "{self.__class__.__name__}({self.start}, {self.end}, {self.tags})".format(self=self) + + def as_slice(self): + return slice(self.start, self.end) + + def contains(self, i): + return self.start <= i < self.end + + def __contains__(self, i): + return self.contains(i) + + def copy(self): + return self.__class__(self.start, self.end, self.tags) + + def _check_slice(self, qstart, qend, warn_ambiguous): + # Fully contained interval + valid = qstart <= self.start and qend >= self.end + case = IntersectionEnum.full_contained_in_interval if valid else IntersectionEnum.no_overlap + if not valid: + # Spans the beginning but not the end + valid = qstart <= self.start and qend > self.start + if valid: + case = IntersectionEnum.start_overlap + if warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + + if not valid: + # Spans the end but not the beginning + valid = qstart < self.end and qend > self.end + if valid: + case = IntersectionEnum.end_overlap + if warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + + if not valid: + # Contained interval + valid = qstart >= self.start and qend < self.end + if valid: + case = IntersectionEnum.full_contains_interval + if warn_ambiguous: + warnings.warn("Slice bisecting interval %s" % (self, )) + return valid, case + + def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True): + if end is None: + qend = self.end + 1 + else: + qend = end + if start is None: + qstart = self.start - 1 + else: + qstart = start + + valid, intersection_type = self._check_slice(qstart, qend, warn_ambiguous) + if self.ambiguous and intersection_type not in (IntersectionEnum.full_contained_in_interval, IntersectionEnum.no_overlap): + raise ValueError("Cannot bisect an ambiguous interval") + if not valid: + return None + new = self.copy() + if start is not None: + diff = self.start - start + if diff < 0: + diff = 0 + new.start = diff + if end is not None: + width = min(new.end, end) - self.start + else: + width = self.end - max(start, self.start) + new.end = new.start + width + return new + + +class ChargeState(object): + '''Describes the charge and adduct types of the structure. + + Attributes + ---------- + charge : int + The total charge state as a signed number. + adducts : list[str] + Each charge carrier associated with the molecule. 
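+
+    Examples
+    --------
+    A sketch of the textual form (the adduct labels are illustrative):
+
+    >>> str(ChargeState(2, ['+2Na+', '+H+']))
+    '2[+2Na+,+H+]'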
+ ''' + __slots__ = ("charge", "adducts") + + def __init__(self, charge, adducts=None): + if adducts is None: + adducts = [] + self.charge = charge + self.adducts = adducts + + def __str__(self): + tokens = [str(self.charge)] + if self.adducts: + tokens.append("[") + tokens.append(','.join(str(adduct) for adduct in self.adducts)) + tokens.append("]") + return ''.join(tokens) + + def __repr__(self): + template = "{self.__class__.__name__}({self.charge}, {self.adducts})" + return template.format(self=self) + + +class TokenBuffer(object): + '''A token buffer that wraps the accumulation and reset logic + of a list of :class:`str` objects. + + Implements a subset of the Sequence protocol. + + Attributes + ---------- + buffer: list + The list of tokens accumulated since the last parsing. + ''' + def __init__(self, initial=None): + self.buffer = list(initial or []) + self.boundaries = [] + + def append(self, c): + '''Append a new character to the buffer. + + Parameters + ---------- + c: str + The character appended + ''' + self.buffer.append(c) + + def reset(self): + '''Discard the content of the current buffer. + ''' + if self.buffer: + self.buffer = [] + if self.boundaries: + self.boundaries = [] + + def __bool__(self): + return bool(self.buffer) + + def __iter__(self): + return iter(self.buffer) + + def __getitem__(self, i): + return self.buffer[i] + + def __len__(self): + return len(self.buffer) + + def tokenize(self): + i = 0 + pieces = [] + for k in self.boundaries + [len(self)]: + piece = self.buffer[i:k] + i = k + pieces.append(piece) + return pieces + + def _transform(self, value): + return value + + def process(self): + if self.boundaries: + value = [self._transform(v) for v in self.tokenize()] + else: + value = self._transform(self.buffer) + self.reset() + return value + + def bound(self): + k = len(self) + self.boundaries.append(k) + return k + + def __call__(self): + return self.process() + + +class NumberParser(TokenBuffer): + '''A buffer which accumulates tokens until it is asked to parse them into + :class:`int` instances. + ''' + + def _transform(self, value): + return int(''.join(value)) + + +class StringParser(TokenBuffer): + '''A buffer which accumulates tokens until it is asked to parse them into + :class:`str` instances. + ''' + + def _transform(self, value): + return ''.join(value) + + +class TagParser(TokenBuffer): + '''A buffer which accumulates tokens until it is asked to parse them into + :class:`TagBase` instances. + + Implements a subset of the Sequence protocol. + + Attributes + ---------- + buffer: list + The list of tokens accumulated since the last parsing. + group_ids: set + The set of all group IDs that have been produced so far. 
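+
+    A minimal sketch of the accumulate-then-parse cycle:
+
+    >>> buffer = TagParser('Oxidation')
+    >>> [tag.value for tag in buffer.process()]
+    ['Oxidation']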
+ ''' + + def __init__(self, initial=None, group_ids=None): + super(TagParser, self).__init__(initial) + if group_ids: + self.group_ids = set(group_ids) + else: + self.group_ids = set() + + def _transform(self, value): + tag = process_tag_tokens(value) + if tag.group_id: + self.group_ids.add(tag.group_id) + return tag + + def process(self): + value = super(TagParser, self).process() + if not isinstance(value, list): + value = [value] + return value + + +class ParserStateEnum(Enum): + before_sequence = 0 + tag_before_sequence = 1 + global_tag = 2 + fixed_spec = 3 + labile_tag = 4 + sequence = 5 + tag_in_sequence = 6 + interval_tag = 7 + tag_after_sequence = 8 + stable_isotope = 9 + post_tag_before = 10 + unlocalized_count = 11 + post_global = 12 + post_global_aa = 13 + post_interval_tag = 14 + post_tag_after = 15 + charge_state_start = 16 + charge_state_number = 17 + charge_state_adduct_start = 18 + charge_state_adduct_end = 19 + inter_chain_cross_link_start = 20 + chimeric_start = 21 + interval_initial = 22 + done = 999 + + +BEFORE = ParserStateEnum.before_sequence +TAG_BEFORE = ParserStateEnum.tag_before_sequence +FIXED = ParserStateEnum.fixed_spec +GLOBAL = ParserStateEnum.global_tag +ISOTOPE = ParserStateEnum.stable_isotope +LABILE = ParserStateEnum.labile_tag +SEQ = ParserStateEnum.sequence +TAG = ParserStateEnum.tag_in_sequence +INTERVAL_TAG = ParserStateEnum.interval_tag +INTERVAL_INIT = ParserStateEnum.interval_initial +TAG_AFTER = ParserStateEnum.tag_after_sequence +POST_TAG_BEFORE = ParserStateEnum.post_tag_before +POST_TAG_AFTER = ParserStateEnum.post_tag_after +UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count +POST_GLOBAL = ParserStateEnum.post_global +POST_GLOBAL_AA = ParserStateEnum.post_global_aa +POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag +CHARGE_START = ParserStateEnum.charge_state_start +CHARGE_NUMBER = ParserStateEnum.charge_state_number +ADDUCT_START = ParserStateEnum.charge_state_adduct_start +ADDUCT_END = ParserStateEnum.charge_state_adduct_end +DONE = ParserStateEnum.done + +VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB") + +def parse(sequence): + '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a + mapping of sequence-spanning modifiers. + + .. note:: + This is a state machine parser, but with certain sub-state paths + unrolled to avoid an explosion of formal intermediary states. + + Parameters + ---------- + sequence: str + The sequence to parse + + Returns + ------- + parsed_sequence: list[tuple[str, list[TagBase]]] + The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence + modifiers: dict + A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized + modifications, tagged intervals, and group IDs + ''' + labile_modifications = [] + fixed_modifications = [] + unlocalized_modifications = [] + intervals = [] + isotopes = [] + + n_term = None + c_term = None + + i = 0 + n = len(sequence) + + positions = [] + state = BEFORE + depth = 0 + + current_aa = None + current_tag = TagParser() + current_interval = None + current_unlocalized_count = NumberParser() + current_aa_targets = TokenBuffer() + + charge_buffer = None + adduct_buffer = None + + # A mostly context free finite state machine unrolled + # by hand. 
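+    # For example, "EM[U:Oxidation]EVEES[U:21]PEK" should walk BEFORE -> SEQ,
+    # detour through TAG at each '[' ... ']' pair, and yield ten (aa, tags)
+    # pairs with tags attached at positions 1 and 6.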
+    while i < n:
+        c = sequence[i]
+        i += 1
+        # Initial state prior to sequence content
+        if state == BEFORE:
+            if c == '[':
+                state = TAG_BEFORE
+                depth = 1
+            elif c == '{':
+                state = LABILE
+                depth = 1
+            elif c == '<':
+                state = FIXED
+            elif c in VALID_AA:
+                current_aa = c
+                state = SEQ
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # The body of the amino acid sequence.
+        elif state == SEQ or state == INTERVAL_INIT:
+            if state == INTERVAL_INIT:
+                state = SEQ
+                if c == '?':
+                    if current_interval is not None:
+                        current_interval.ambiguous = True
+                    continue
+            if c in VALID_AA:
+                if current_aa is not None:
+                    positions.append((current_aa, current_tag() if current_tag else None))
+                current_aa = c
+            elif c == '[':
+                state = TAG
+                if current_tag:
+                    current_tag.bound()
+                depth = 1
+            elif c == '(':
+                if current_interval is not None:
+                    raise ProFormaError(
+                        ("Error In State {state}, nested range found at index {i}. "
+                         "Nested ranges are not yet supported by ProForma.").format(
+                            **locals()), i, state)
+                current_interval = TaggedInterval(len(positions) + 1)
+                state = INTERVAL_INIT
+            elif c == ')':
+                positions.append(
+                    (current_aa, current_tag() if current_tag else None))
+                current_aa = None
+                if current_interval is None:
+                    raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+                else:
+                    current_interval.end = len(positions)
+                    if i < n and sequence[i] == '[':
+                        i += 1
+                        depth = 1
+                        state = INTERVAL_TAG
+                    else:
+                        intervals.append(current_interval)
+                        current_interval = None
+            elif c == '-':
+                if current_aa:
+                    positions.append((current_aa, current_tag() if current_tag else None))
+                    current_aa = None
+                state = TAG_AFTER
+                if i >= n or sequence[i] != '[':
+                    raise ProFormaError("Missing Closing Tag", i, state)
+                i += 1
+                depth = 1
+            elif c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+            else:
+                raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # Tag parsing which relies on `current_tag` to buffer tokens.
+        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL or state == INTERVAL_TAG:
+            if c == '[':
+                depth += 1
+                current_tag.append(c)
+            elif c == ']':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    if state == TAG:
+                        state = SEQ
+                    elif state == TAG_BEFORE:
+                        state = POST_TAG_BEFORE
+                    elif state == TAG_AFTER:
+                        c_term = current_tag()
+                        state = POST_TAG_AFTER
+                    elif state == GLOBAL:
+                        state = POST_GLOBAL
+                    elif state == INTERVAL_TAG:
+                        state = POST_INTERVAL_TAG
+                        depth = 0
+                else:
+                    current_tag.append(c)
+            else:
+                current_tag.append(c)
+        # Handle transition to fixed modifications or isotope labeling from opening signal.
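+        # Illustrative (added commentary): "<13C>PEPTIDE" takes FIXED -> ISOTOPE,
+        # while "<[Carbamidomethyl]@C>PEPTIDE" takes FIXED -> GLOBAL because the
+        # character after '<' is '['.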
+        elif state == FIXED:
+            if c == '[':
+                state = GLOBAL
+            else:
+                # Do validation here
+                state = ISOTOPE
+                current_tag.reset()
+                current_tag.append(c)
+        # Handle stable isotope rules, which rely on `current_tag` to buffer tokens
+        elif state == ISOTOPE:
+            if c != '>':
+                current_tag.append(c)
+            else:
+                # Not technically a tag, but exploits the current buffer
+                isotopes.append(StableIsotope(''.join(current_tag)))
+                current_tag.reset()
+                state = BEFORE
+        # Handle labile modifications, which rely on `current_tag` to buffer tokens
+        elif state == LABILE:
+            if c == '{':
+                depth += 1
+            elif c == '}':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    labile_modifications.append(current_tag()[0])
+                    state = BEFORE
+            else:
+                current_tag.append(c)
+        # The intermediate state between an interval tag and returning to sequence parsing.
+        # A new tag may start immediately, leading to it being appended to the interval
+        # instead of returning to the primary sequence. Because this state may also occur at the
+        # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags,
+        # charge states, and the like.
+        elif state == POST_INTERVAL_TAG:
+            if c == '[':
+                current_tag.bound()
+                state = INTERVAL_TAG
+            elif c in VALID_AA:
+                current_aa = c
+                current_interval.tags = current_tag()
+                intervals.append(current_interval)
+                current_interval = None
+                state = SEQ
+            elif c == '-':
+                state = TAG_AFTER
+                if i >= n or sequence[i] != '[':
+                    raise ProFormaError("Missing Closing Tag", i, state)
+                i += 1
+                depth = 1
+            elif c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # An intermediate state for discriminating which type of tag-before-sequence
+        # we just finished parsing.
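+        # Illustrative (added commentary): after a leading "[iTRAQ4plex]", the next
+        # character disambiguates: '-' marks an N-terminal tag, '?' an unlocalized
+        # modification, and '^' introduces a multiplicity count.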
+        elif state == POST_TAG_BEFORE:
+            if c == '?':
+                unlocalized_modifications.append(current_tag()[0])
+                state = BEFORE
+            elif c == '-':
+                n_term = current_tag()
+                state = BEFORE
+            elif c == '^':
+                state = UNLOCALIZED_COUNT
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == UNLOCALIZED_COUNT:
+            if c.isdigit():
+                current_unlocalized_count.append(c)
+            elif c == '[':
+                state = TAG_BEFORE
+                depth = 1
+                tag = current_tag()[0]
+                multiplicity = current_unlocalized_count()
+                # A throwaway loop variable is required here: reusing `i`
+                # would clobber the parser's position in `sequence`.
+                for _ in range(multiplicity):
+                    unlocalized_modifications.append(tag)
+            elif c == '?':
+                state = BEFORE
+                tag = current_tag()[0]
+                multiplicity = current_unlocalized_count()
+                for _ in range(multiplicity):
+                    unlocalized_modifications.append(tag)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == POST_GLOBAL:
+            if c == '@':
+                state = POST_GLOBAL_AA
+            else:
+                raise ProFormaError(
+                    ("Error In State {state}, fixed modification detected without "
+                     "target amino acids found at index {i}").format(**locals()), i, state)
+        elif state == POST_GLOBAL_AA:
+            if c in VALID_AA:
+                current_aa_targets.append(c)
+            elif c == ',':
+                # the next character should be another amino acid
+                pass
+            elif c == '>':
+                fixed_modifications.append(
+                    ModificationRule(current_tag()[0], current_aa_targets()))
+                state = BEFORE
+            else:
+                raise ProFormaError(
+                    ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state)
+        elif state == POST_TAG_AFTER:
+            if c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+        elif state == CHARGE_START:
+            if c in '+-':
+                charge_buffer.append(c)
+                state = CHARGE_NUMBER
+            elif c.isdigit():
+                charge_buffer.append(c)
+                state = CHARGE_NUMBER
+            elif c == '/':
+                state = ParserStateEnum.inter_chain_cross_link_start
+                raise ProFormaError("Inter-chain cross-linked peptides are not yet supported", i, state)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == CHARGE_NUMBER:
+            if c.isdigit():
+                charge_buffer.append(c)
+            elif c == "[":
+                state = ADDUCT_START
+                adduct_buffer = StringParser()
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == ADDUCT_START:
+            if c.isdigit() or c in "+-" or c in element_symbols:
+                adduct_buffer.append(c)
+            elif c == ',':
+                adduct_buffer.bound()
+            elif c == ']':
+                state = ADDUCT_END
+        elif state == ADDUCT_END:
+            if c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. 
Chimeric representation not supported".format(**locals()), i, state) + else: + raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) + if charge_buffer: + charge_number = charge_buffer() + if adduct_buffer: + adducts = adduct_buffer() + else: + adducts = None + charge_state = ChargeState(charge_number, adducts) + else: + charge_state = None + if current_aa: + positions.append((current_aa, current_tag() if current_tag else None)) + if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ): + raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state) + return positions, { + 'n_term': n_term, + 'c_term': c_term, + 'unlocalized_modifications': unlocalized_modifications, + 'labile_modifications': labile_modifications, + 'fixed_modifications': fixed_modifications, + 'intervals': intervals, + 'isotopes': isotopes, + 'group_ids': sorted(current_tag.group_ids), + 'charge_state': charge_state, + } + + +def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None, + labile_modifications=None, fixed_modifications=None, intervals=None, + isotopes=None, charge_state=None, group_ids=None): + '''Convert a sequence plus modifiers into formatted text following the + ProForma specification. + + Parameters + ---------- + sequence : list[tuple[str, TagBase]] + The primary sequence of the peptidoform/proteoform to render + n_term : Optional[TagBase] + The N-terminal modification, if any. + c_term : Optional[TagBase] + The C-terminal modification, if any. + unlocalized_modifications : Optional[list[TagBase]] + Any modifications which aren't assigned to a specific location. + labile_modifications : Optional[list[TagBase]] + Any labile modifications + fixed_modifications : Optional[list[ModificationRule]] + Any fixed modifications + intervals : Optional[list[TaggedInterval]] + A list of modified intervals, if any + isotopes : Optional[list[StableIsotope]] + Any global stable isotope labels applied + charge_state : Optional[ChargeState] + An optional charge state value + group_ids : Optional[list[str]] + Any group identifiers. This parameter is currently not used. + + Returns + ------- + str + ''' + primary = deque() + for aa, tags in sequence: + if not tags: + primary.append(str(aa)) + else: + primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags])) + if intervals: + for iv in sorted(intervals, key=lambda x: x.start): + if iv.ambiguous: + primary[iv.start] = '(?' 
+ primary[iv.start]
+            else:
+                primary[iv.start] = '(' + primary[iv.start]
+
+            terminator = '{0!s})'.format(primary[iv.end - 1])
+            if iv.tags:
+                terminator += ''.join('[{!s}]'.format(t) for t in iv.tags)
+            primary[iv.end - 1] = terminator
+    if n_term:
+        primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-')
+    if c_term:
+        primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term))
+    if charge_state:
+        primary.append("/{!s}".format(charge_state))
+    if labile_modifications:
+        primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications])
+    if unlocalized_modifications:
+        primary.appendleft("?")
+        primary.extendleft(['[{!s}]'.format(m) for m in unlocalized_modifications])
+    if isotopes:
+        primary.extendleft(['{!s}'.format(m) for m in isotopes])
+    if fixed_modifications:
+        primary.extendleft(['{!s}'.format(m) for m in fixed_modifications])
+    return ''.join(primary)
+
+
+class _ProFormaProperty(object):
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, cls):
+        return obj.properties[self.name]
+
+    def __set__(self, obj, value):
+        obj.properties[self.name] = value
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.name!r})"
+        return template.format(self=self)
+
+
+class ProForma(object):
+    '''Represent a parsed ProForma sequence.
+
+    The preferred way to instantiate this class is via the :meth:`parse`
+    method.
+
+    Attributes
+    ----------
+    sequence : list[tuple[str, List[TagBase]]]
+        The list of (amino acid, tag collection) pairs making up the primary sequence of the
+        peptide.
+    isotopes : list[StableIsotope]
+        A list of any stable isotope rules that apply to this peptide
+    charge_state : int, optional
+        An optional charge state that may have been provided
+    intervals : list[Interval]
+        Any annotated intervals that contain either sequence ambiguity or a
+        tag over that interval.
+    labile_modifications : list[ModificationBase]
+        Any modifications that were parsed as labile, and may not appear at
+        any location on the peptide primary sequence.
+    unlocalized_modifications : list[ModificationBase]
+        Any modifications that were not localized but may be attached to peptide
+        sequence evidence.
+    n_term : list[ModificationBase]
+        Any modifications on the N-terminus of the peptide
+    c_term : list[ModificationBase]
+        Any modifications on the C-terminus of the peptide
+    group_ids : set
+        The collection of all group identifiers on this sequence.
+    mass : float
+        The computed mass for the fully modified peptide, including labile
+        and unlocalized modifications. 
**Does not include stable isotopes at this time** + ''' + + def __init__(self, sequence, properties): + self.sequence = sequence + self.properties = properties + + isotopes = _ProFormaProperty('isotopes') + charge_state = _ProFormaProperty('charge_state') + + intervals = _ProFormaProperty('intervals') + fixed_modifications = _ProFormaProperty('fixed_modifications') + labile_modifications = _ProFormaProperty('labile_modifications') + unlocalized_modifications = _ProFormaProperty('unlocalized_modifications') + + n_term = _ProFormaProperty('n_term') + c_term = _ProFormaProperty('c_term') + + group_ids = _ProFormaProperty('group_ids') + + def __str__(self): + return to_proforma(self.sequence, **self.properties) + + def __repr__(self): + return "{self.__class__.__name__}({self.sequence}, {self.properties})".format(self=self) + + def __len__(self): + return len(self.sequence) + + def __getitem__(self, i): + if isinstance(i, slice): + props = self.properties.copy() + ivs = [] + for iv in props['intervals']: + iv = iv._update_coordinates_sliced( + i.start, i.stop) + if iv is None: + continue + ivs.append(iv) + props['intervals'] = ivs + + if not (i.start is None or i.start == 0): + props['n_term'] = None + n = len(self) + if not (i.stop is None or i.stop >= n): + props['c_term'] = None + + return self.__class__(self.sequence[i], props) + else: + return self.sequence[i] + + def __eq__(self, other): + if isinstance(other, str): + return str(self) == other + elif other is None: + return False + else: + return self.sequence == other.sequence and self.properties == other.properties + + def __ne__(self, other): + return not self == other + + @classmethod + def parse(cls, string): + '''Parse a ProForma string. + + Parameters + ---------- + string : str + The string to parse + + Returns + ------- + ProForma + ''' + return cls(*parse(string)) + + @property + def mass(self): + mass = 0.0 + + fixed_modifications = self.properties['fixed_modifications'] + fixed_rules = {} + for rule in fixed_modifications: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.mass + + for position in self.sequence: + aa = position[0] + try: + mass += std_aa_mass[aa] + except KeyError: + warnings.warn("%r does not have an exact mass" % (aa, )) + if aa in fixed_rules: + mass += fixed_rules[aa] + tags = position[1] + if tags: + for tag in tags: + try: + mass += tag.mass + except (AttributeError, KeyError): + continue + for mod in self.properties['labile_modifications']: + mass += mod.mass + for mod in self.properties['unlocalized_modifications']: + mass += mod.mass + if self.properties.get('n_term'): + for mod in self.properties['n_term']: + try: + mass += mod.mass + except (AttributeError, KeyError): + continue + mass += calculate_mass(formula="H") + if self.properties.get('c_term'): + for mod in self.properties['c_term']: + try: + mass += mod.mass + except (AttributeError, KeyError): + continue + + mass += calculate_mass(formula="OH") + for iv in self.properties['intervals']: + try: + mass += iv.tag.mass + except (AttributeError, KeyError): + continue + return mass + + def fragments(self, ion_shift, charge=1, reverse=None, include_labile=True, include_unlocalized=True): + """ + The function generates all possible fragments of the requested + series type. + + Parameters + ---------- + ion_shift : float or str + The mass shift of the ion series, or the name of the ion series + charge : int + The charge state of the theoretical fragment masses to generate. + Defaults to 1+. 
If 0 is passed, neutral masses will be returned. + reverse : bool, optional + Whether to fragment from the N-terminus (``False``) or C-terminus (``True``). + If ``ion_shift`` is a :class:`str`, the terminal will be inferred from + the series name. Otherwise, defaults to ``False``. + include_labile : bool, optional + Whether or not to include dissociated modification masses. + Defaults to ``True`` + include_unlocalized : bool, optional + Whether or not to include unlocalized modification masses. + Defaults to ``True`` + + Returns + ------- + np.ndarray + + Examples + -------- + + >>> p = proforma.ProForma.parse("PEPTIDE") + >>> p.fragments('b', charge=1) + array([ 98.06004032, 227.1026334 , 324.15539725, 425.20307572, + 538.2871397 , 653.31408272]) + >>> p.fragments('y', charge=1) + array([148.06043424, 263.08737726, 376.17144124, 477.21911971, + 574.27188356, 703.31447664]) + + """ + if isinstance(ion_shift, str): + if ion_shift[0] in 'xyz': + reverse = True + ion_shift = std_ion_comp[ion_shift].mass(absolute=False) + + n = len(self.sequence) + masses = _array('d') + + mass = 0 + mass += ion_shift + + fixed_modifications = self.properties['fixed_modifications'] + fixed_rules = {} + for rule in fixed_modifications: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.mass + + intervals = self.intervals + if intervals: + intervals = sorted(intervals, key=lambda x: x.start) + intervals = deque(intervals) + + if not include_labile: + for mod in self.properties['labile_modifications']: + mass += mod.mass + + if not reverse: + if self.properties.get('n_term'): + for mod in self.properties['n_term']: + try: + mass += mod.mass + except (AttributeError, KeyError): + continue + else: + if self.properties.get('c_term'): + for mod in self.properties['c_term']: + try: + mass += mod.mass + except (AttributeError, KeyError): + continue + + if include_unlocalized: + for mod in self.properties['unlocalized_modifications']: + mass += mod.mass + + mass += _WATER_MASS + + if not reverse: + iterator = (iter(range(0, n - 1))) + else: + iterator = (reversed(range(1, n))) + + for i in iterator: + position = self.sequence[i] + + aa = position[0] + try: + mass += std_aa_mass[aa] + except KeyError: + warnings.warn("%r does not have an exact mass" % (aa, )) + + if aa in fixed_rules: + mass += fixed_rules[aa] + + tags = position[1] + if tags: + for tag in tags: + try: + mass += tag.mass + except (AttributeError, KeyError): + continue + + while intervals and intervals[0].contains(i): + iv = intervals.popleft() + + try: + mass += iv.tag.mass + except (AttributeError, KeyError): + continue + + masses.append(mass) + + if np is not None: + masses = np.asarray(masses) + if charge != 0: + return mass_charge_ratio(masses, charge) + return masses + if charge != 0: + for i, mass in enumerate(masses): + masses[i] = mass_charge_ratio(mass, charge) + return masses + + def find_tags_by_id(self, tag_id, include_position=True): + '''Find all occurrences of a particular tag ID + + Parameters + ---------- + tag_id : str + The tag ID to search for + include_position : bool + Whether or not to return the locations for matched + tag positions + + Returns + ------- + list[tuple[Any, TagBase]] or list[TagBase] + ''' + if not tag_id.startswith("#"): + tag_id = "#" + tag_id + matches = [] + for i, (_token, tags) in enumerate(self.sequence): + if tags: + for tag in tags: + if tag.group_id == tag_id: + if include_position: + matches.append((i, tag)) + else: + matches.append(tag) + for iv in self.properties['intervals']: + if 
iv.tag.group_id == tag_id:
+                matches.append((iv, iv.tag) if include_position else iv.tag)
+        for ulmod in self.properties['unlocalized_modifications']:
+            if ulmod.group_id == tag_id:
+                matches.append(('unlocalized_modifications', ulmod)
+                               if include_position else ulmod)
+        for lamod in self.properties['labile_modifications']:
+            if lamod.group_id == tag_id:
+                matches.append(('labile_modifications', lamod)
+                               if include_position else lamod)
+        return matches
+
+    @property
+    def tags(self):
+        return [tag for tags_at in [pos[1] for pos in self if pos[1]] for tag in tags_at]
diff --git a/pyteomics/protxml.py b/pyteomics/protxml.py
new file mode 100644
index 0000000..51dea03
--- /dev/null
+++ b/pyteomics/protxml.py
@@ -0,0 +1,309 @@
+"""
+protxml - parsing of ProteinProphet output files
+================================================
+
+Summary
+-------
+
+**protXML** is the output format of the `ProteinProphet software <http://proteinprophet.sourceforge.net/>`_.
+It contains information about identified proteins and their statistical significance.
+
+This module provides minimalistic infrastructure for access to data stored in
+protXML files. The central class is :py:class:`ProtXML`, which
+reads protein entries and related information and saves them into
+Python dicts.
+
+Data access
+-----------
+
+  :py:class:`ProtXML` - a class representing a single protXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through protein groups in a protXML
+  file. Calling the function is synonymous to instantiating the :py:class:`ProtXML` class.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`DataFrame` - read protXML files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+  :py:func:`filter` - filter protein groups from a chain of protXML files to a specific FDR
+  using TDA.
+
+  :py:func:`filter.chain` - chain a series of filters applied independently to
+  several files.
+
+  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+  independently to an iterable of files.
+
+  :py:func:`filter_df` - filter protXML files and return a :py:class:`pandas.DataFrame`.
+
+  :py:func:`fdr` - estimate the false discovery rate of a set of protein groups using the
+  target-decoy approach.
+
+  :py:func:`qvalues` - get an array of scores and *q* values for protein groups using the target-decoy approach.
+
+  :py:func:`is_decoy` - determine whether a protein group is decoy or not. This function may not suit your use case.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+--------------------------------------------------------------------------------
+"""
+
+# Copyright 2018 Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . 
import xml, auxiliary as aux, _schema_defaults +import operator as op + +class ProtXML(xml.MultiProcessingXML): + """Parser class for protXML files.""" + file_format = 'protXML' + _root_element = 'protein_summary' + _default_schema = _schema_defaults._protxml_schema_defaults + # _default_version = None + _default_iter_tag = 'protein_group' + _indexed_tag_keys = {'protein_group': 'group_number'} + _default_id_attr = 'group_number' + _indexed_tags = {'protein_group'} + _structures_to_flatten = {'annotation'} + # attributes which contain unconverted values + _convert_items = {'float': {'pct_spectrum_ids'}, + 'int': {'group_number', 'prot_length'}, + 'bool': {'is_contributing_evidence', 'is_nondegenerate_evidence'} + }.items() + + def _get_info_smart(self, element, **kwargs): + """Extract the info in a smart way depending on the element type""" + try: + name = kwargs.pop('ename') + except KeyError: + name = xml._local_name(element) + rec = kwargs.pop('recursive', None) + if name == self._root_element: + info = self._get_info(element, ename=name, + recursive=(rec if rec is not None else False), + **kwargs) + else: + info = self._get_info(element, ename=name, + recursive=(rec if rec is not None else True), + **kwargs) + + converters = {'float': float, 'int': int, + 'bool': lambda x: x.lower() in {'1', 'true', 'y'}} + for k, v in dict(info).items(): + for t, s in self._convert_items: + if k in s: + del info[k] + info[k] = converters[t](v) + p = info.get('parameter') + if isinstance(p, list) and len(p) == 1 and isinstance(p[0], dict): + info.update(info.pop('parameter')[0]) + + if 'modification_info' in info: + # this is a list with one element + info.update(info.pop('modification_info')[0]) + + if 'unique_stripped_peptides' in info: + info['unique_stripped_peptides'] = info['unique_stripped_peptides'].split('+') + return info + +def read(source, read_schema=False, iterative=True, **kwargs): + """Parse `source` and iterate through protein groups. + + Parameters + ---------- + source : str or file + A path to a target protXML file or the file object itself. + + read_schema : bool, optional + If :py:const:`True`, attempt to extract information from the XML schema + mentioned in the protXML header. Otherwise, use default parameters. + Not recommended without Internet connection or + if you don't like to get the related warnings. + + iterative : bool, optional + Defines whether iterative parsing should be used. It helps reduce + memory usage at almost the same parsing speed. Default is + :py:const:`True`. + + Returns + ------- + out : ProtXML + An iterator over dicts with protein group properties. + """ + + return ProtXML(source, read_schema=read_schema, iterative=iterative) + + +# chain = aux._make_chain(read, 'read') +chain = aux.ChainBase._make_chain(ProtXML) + + +def _is_decoy_prefix(pg, prefix='DECOY_'): + """Determine if a protein group should be considered decoy. + + This function checks that all protein names in a group start with `prefix`. + You may need to provide your own function for correct filtering and FDR estimation. + + Parameters + ---------- + + pg : dict + A protein group dict produced by the :py:class:`ProtXML` parser. + prefix : str, optional + A prefix used to mark decoy proteins. Default is `'DECOY_'`. + + Returns + ------- + + out : bool + """ + return all(p['protein_name'].startswith(prefix) for p in pg['protein']) + +def _is_decoy_suffix(pg, suffix='_DECOY'): + """Determine if a protein group should be considered decoy. 
+
+    This function checks that all protein names in a group end with `suffix`.
+    You may need to provide your own function for correct filtering and FDR estimation.
+
+    Parameters
+    ----------
+
+    pg : dict
+        A protein group dict produced by the :py:class:`ProtXML` parser.
+    suffix : str, optional
+        A suffix used to mark decoy proteins. Default is `'_DECOY'`.
+
+    Returns
+    -------
+
+    out : bool
+    """
+    return all(p['protein_name'].endswith(suffix) for p in pg['protein'])
+
+is_decoy = _is_decoy_prefix
+
+fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix)
+_key = op.itemgetter('probability')
+qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, _key)
+filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, _key, qvalues)
+filter.chain = aux._make_chain(filter, 'filter', True)
+
+def DataFrame(*args, **kwargs):
+    """Read protXML output files into a :py:class:`pandas.DataFrame`.
+
+    .. note :: Rows in the DataFrame correspond to individual proteins, not protein groups.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    sep : str or None, keyword only, optional
+        Some values related to protein groups are variable-length lists.
+        If `sep` is a :py:class:`str`, they will be packed into a single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+
+    pd_kwargs : dict, optional
+        Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor.
+
+    *args
+        Passed to :py:func:`chain`.
+
+    **kwargs
+        Passed to :py:func:`chain`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
+    """
+    import pandas as pd
+    kwargs = kwargs.copy()
+    sep = kwargs.pop('sep', None)
+    pd_kwargs = kwargs.pop('pd_kwargs', {})
+    def gen_items():
+        with chain(*args, **kwargs) as f:
+            for item in f:
+                info = {}
+                for k, v in item.items():
+                    if isinstance(v, (str, int, float)):
+                        info[k] = v
+                if 'protein' in item:
+                    for prot in item['protein']:
+                        out = dict(info)
+                        out.update(prot)
+                        if 'unique_stripped_peptides' in out:
+                            if sep is not None:
+                                out['unique_stripped_peptides'] = sep.join(out['unique_stripped_peptides'])
+                        if 'indistinguishable_protein' in out:
+                            if sep is None:
+                                out['indistinguishable_protein'] = [p['protein_name'] for p in out['indistinguishable_protein']]
+                            else:
+                                out['indistinguishable_protein'] = sep.join(p['protein_name'] for p in out['indistinguishable_protein'])
+                        yield out
+    return pd.DataFrame(gen_items(), **pd_kwargs)
+
+
+def filter_df(*args, **kwargs):
+    """Read protXML files or DataFrames and return a :py:class:`DataFrame` with filtered proteins.
+    Positional arguments can be protXML files or DataFrames.
+
+    .. note :: Rows in the DataFrame correspond to individual proteins, not protein groups.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    key : str / iterable / callable, keyword only, optional
+        Default is 'probability'.
+    is_decoy : str / iterable / callable, keyword only, optional
+        Default is to check that "protein_name" starts with `'DECOY_'`.
+    reverse : bool, keyword only, optional
+        Should be :py:const:`True` if higher score is better.
+        Default is :py:const:`True` (because the default key is 'probability').
+    *args
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+    **kwargs
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. 
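+
+    Example (an illustrative sketch; the file name is hypothetical)::
+
+        df = filter_df('interact.prot.xml', fdr=0.01)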
+ + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + kwargs.setdefault('key', 'probability') + kwargs.setdefault('reverse', True) + if all(isinstance(arg, pd.DataFrame) for arg in args): + if len(args) > 1: + df = pd.concat(args) + else: + df = args[0] + else: + read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs} + df = DataFrame(*args, **read_kw) + if 'is_decoy' not in kwargs: + if 'decoy_suffix' in kwargs: + kwargs['is_decoy'] = df['protein_name'].str.endswith(kwargs['decoy_suffix']) + else: + kwargs['is_decoy'] = df['protein_name'].str.startswith(kwargs.get('decoy_prefix', 'DECOY_')) + return aux.filter(df, **kwargs) diff --git a/pyteomics/pylab_aux.py b/pyteomics/pylab_aux.py new file mode 100644 index 0000000..c52e17e --- /dev/null +++ b/pyteomics/pylab_aux.py @@ -0,0 +1,831 @@ +""" +pylab_aux - auxiliary functions for plotting with pylab +======================================================= + +This module serves as a collection of useful routines for data plotting with +matplotlib. + +Generic plotting +---------------- + + :py:func:`plot_line` - plot a line. + + :py:func:`scatter_trend` - plot a scatter plot with a regression line. + + :py:func:`plot_function_3d` - plot a 3D graph of a function of two variables. + + :py:func:`plot_function_contour` - plot a contour graph of a function of + two variables. + +Spectrum visualization +---------------------- + + :py:func:`plot_spectrum` - plot a single spectrum (m/z vs intensity). + + :py:func:`annotate_spectrum` - plot and annotate peaks in MS/MS spectrum. + + :py:func:`mirror` - create a mirror plot of two spectra (using :py:mod:`spectrum_utils`). + +FDR control +----------- + + :py:func:`plot_qvalue_curve` - plot the dependence of q-value on the amount of PSMs + (similar to a ROC curve). + +See also +-------- + + - `Matplotlib cookbook <http://www.scipy.org/Cookbook/Matplotlib/>`_ + - `Matplotlib tutorial + <http://matplotlib.sourceforge.net/mpl_toolkits/mplot3d/tutorial.html>`_ + +Dependencies +------------ + +This module requires :py:mod:`matplotlib`. Optional dependencies: :py:mod:`adjustText`, :py:mod:`spectrum_utils`. + +------------------------------------------------------------------------------- + +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pylab +import numpy as np +from .auxiliary import linear_regression, PyteomicsError +from .version import VersionInfo +from . import parser, mass, mgf, proforma + +try: + import spectrum_utils + if VersionInfo(spectrum_utils.__version__) < VersionInfo('0.4'): + raise ImportError("Supported spectrum_utils version is 0.4.0 or newer.") + import spectrum_utils.spectrum as sus + import spectrum_utils.plot as sup +except ImportError: + sus = sup = None + + +def plot_line(a, b, xlim=None, *args, **kwargs): + """Plot a line y = a * x + b. + + Parameters + ---------- + a : float + The slope of the line. + b : float + The intercept of the line. 
+    xlim : tuple, optional
+        Minimal and maximal values of `x`. If not given, :py:func:`pylab.xlim` will be called.
+    *args
+        Passed to :py:func:`pylab.plot` after `x` and `y` values.
+    **kwargs
+        Passed to :py:func:`pylab.plot`.
+
+    Returns
+    -------
+    out : matplotlib.lines.Line2D
+        The line object.
+    """
+    if xlim is None:
+        xlim = pylab.xlim()
+    return pylab.plot([xlim[0], xlim[1]], [a * xlim[0] + b, a * xlim[1] + b], *args, **kwargs)
+
+
+def scatter_trend(x, y=None, **kwargs):
+    """Make a scatter plot with a linear regression.
+
+    Parameters
+    ----------
+    x : array_like of float
+        1-D array of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
+    y : array_like of float, optional
+        1-D array of floats. If `y` is omitted or :py:const:`None`, `x` must be a 2-D array of shape (N, 2).
+    plot_trend : bool, optional
+        If :py:const:`True` then plot a trendline (default).
+    plot_sigmas : bool, optional
+        If :py:const:`True` then plot confidence intervals of the linear fit.
+        :py:const:`False` by default.
+    show_legend : bool, optional
+        If :py:const:`True`, a legend will be shown with linear fit equation,
+        correlation coefficient, and standard deviation from the fit. Default is
+        :py:const:`True`.
+    title : str, optional
+        The title. Empty by default.
+    xlabel, ylabel : str, optional
+        The axes labels. Empty by default.
+    alpha_legend : float, optional
+        Legend box transparency. 1.0 by default.
+    scatter_kwargs : dict, optional
+        Keyword arguments for :py:func:`pylab.scatter`.
+        Empty by default.
+    plot_kwargs : dict, optional
+        Keyword arguments for :py:func:`plot_line`.
+        By default, sets `xlim` and `label`.
+    legend_kwargs : dict, optional
+        Keyword arguments for :py:func:`pylab.legend`.
+        Default is :py:const:`{'loc': 'upper left'}`.
+    sigma_kwargs : dict, optional
+        Keyword arguments for :py:func:`pylab.plot` used for sigma lines.
+        Default is :py:const:`{'color': 'red', 'linestyle': 'dashed'}`.
+    sigma_values : iterable, optional
+        Each value will be multiplied with standard error of the fit, and the line
+        shifted by the resulting value will be plotted. Default is :py:const:`range(-3, 4)`.
+    regression : callable, optional
+        Function to perform linear regression. Will be given ``x`` and ``y`` as arguments.
+        Must return a 4-tuple: (a, b, r, stderr).
+        Default is :py:func:`pyteomics.auxiliary.linear_regression`.
+
+    Returns
+    -------
+    out : tuple
+        A (scatter_plot, trend_line, sigma_lines, legend) tuple.
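+
+    Examples
+    --------
+    A minimal sketch (synthetic data; assumes default settings)::
+
+        import numpy as np
+        x = np.linspace(0, 10, 50)
+        y = 2 * x + 1 + np.random.normal(scale=0.5, size=50)
+        sc, line, s_lines, legend = scatter_trend(x, y, title='fit demo')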
+    """
+    regression = kwargs.get('regression', linear_regression)
+    a, b, r, stderr = regression(x, y)
+    pylab.title(kwargs.get('title', ''))
+    pylab.xlabel(kwargs.get('xlabel', ''))
+    pylab.ylabel(kwargs.get('ylabel', ''))
+
+    # Backslashes are escaped so that the LaTeX markup does not produce
+    # invalid escape sequences in the string literals.
+    equation = (
+        '$y\\,=\\,{:.3f}x\\,{}\\,{:.3f}$, '
+        '$R^2=\\,{:.3f}$ \n$\\sigma\\,=\\,{:.3f}$'.format(
+            a, '-' if b < 0 else '+', abs(b), r*r, stderr))
+
+    if y is None:
+        x = np.array(x, copy=False)
+        y = x[:, 1]
+        x = x[:, 0]
+    else:
+        x = np.array(x)
+        y = np.array(y)
+    sc = pylab.scatter(x, y, **kwargs.get('scatter_kwargs', {}))
+    xlim = (x.min(), x.max())
+    plkw = kwargs.get('plot_kwargs', {}).copy()
+    plkw.setdefault('xlim', xlim)
+    plkw.setdefault('label', equation)
+    if kwargs.get('plot_trend', True):
+        line = plot_line(a, b, **plkw)
+    else:
+        line = None
+
+    if kwargs.get('plot_sigmas', False):
+        s_lines = []
+        sigma_kwargs = kwargs.get('sigma_kwargs', {'color': 'red', 'linestyle': 'dashed'})
+        for i in kwargs.get('sigma_values', range(-3, 4)):
+            s_lines.append(plot_line(a, b + i * stderr, xlim, **sigma_kwargs))
+    else:
+        s_lines = None
+
+    if kwargs.get('show_legend', True):
+        legend = pylab.legend(**kwargs.get('legend_kwargs', {'loc': 'upper left'}))
+        legend_frame = legend.get_frame()
+        legend_frame.set_alpha(kwargs.get('alpha_legend', 1.0))
+    else:
+        legend = None
+    return sc, line, s_lines, legend
+
+
+def plot_function_3d(x, y, function, **kwargs):
+    """Plot values of a function of two variables in 3D.
+
+    More on 3D plotting in pylab:
+
+    http://www.scipy.org/Cookbook/Matplotlib/mplot3D
+
+    Parameters
+    ----------
+    x : array_like of float
+        The plotting range on X axis.
+    y : array_like of float
+        The plotting range on Y axis.
+    function : function
+        The function to plot.
+    plot_type : {'surface', 'wireframe', 'scatter', 'contour', 'contourf'}, keyword only, optional
+        The type of a plot, see
+        `scipy cookbook <http://www.scipy.org/Cookbook/Matplotlib/mplot3D>`_
+        for examples. The default value is 'surface'.
+    num_contours : int
+        The number of contours to plot, 50 by default.
+    xlabel : str, keyword only, optional
+        The X axis label. Empty by default.
+    ylabel : str, keyword only, optional
+        The Y axis label. Empty by default.
+    zlabel : str, keyword only, optional
+        The Z axis label. Empty by default.
+    title : str, keyword only, optional
+        The title. Empty by default.
+    **kwargs
+        Passed to the respective plotting function.
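+
+    Examples
+    --------
+    A minimal sketch (illustrative ranges and function)::
+
+        import numpy as np
+        x = y = np.linspace(-2, 2, 40)
+        plot_function_3d(x, y, lambda a, b: a * a - b * b)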
+ """ + import mpl_toolkits.mplot3d.axes3d as pylab3d + ax = pylab3d.Axes3D(pylab.gcf()) + ax.set_xlabel(kwargs.pop('xlabel', '')) + ax.set_ylabel(kwargs.pop('ylabel', '')) + ax.set_zlabel(kwargs.pop('zlabel', '')) + ax.set_title(kwargs.pop('title', '')) + X, Y = np.meshgrid(x, y) + Z = [] + for y_value in y: + Z.append([]) + for x_value in x: + Z[-1].append(function(x_value, y_value)) + Z = np.array(Z) + plot_type = kwargs.pop('plot_type', 'surface') + if plot_type == 'surface': + ax.plot_surface(X, Y, Z, + rstride=kwargs.pop('rstride', 1), + cstride=kwargs.pop('cstride', 1), + cmap=kwargs.pop('cmap', pylab.cm.jet), + **kwargs) + elif plot_type == 'wireframe': + ax.plot_wireframe(X, Y, Z, + cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) + elif plot_type == 'scatter': + ax.scatter3D(np.ravel(X), np.ravel(Y), np.ravel(Z), **kwargs) + elif plot_type == 'contour': + num_contours = kwargs.pop('num_contours', 50) + ax.contour3D(X, Y, Z, num_contours, + cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) + elif plot_type == 'contourf': + num_contours = kwargs.pop('num_contours', 50) + ax.contourf3D(X, Y, Z, num_contours, + cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) + else: + raise PyteomicsError('Unknown plot type: {}'.format(plot_type)) + + +def plot_function_contour(x, y, function, **kwargs): + """Make a contour plot of a function of two variables. + + Parameters + ---------- + x, y : array_like of float + The positions of the nodes of a plotting grid. + function : function + The function to plot. + filling : bool + Fill contours if True (default). + num_contours : int + The number of contours to plot, 50 by default. + xlabel, ylabel : str, optional + The axes labels. Empty by default. + title : str, optional + The title. Empty by default. + **kwargs + Passed to :py:func:`pylab.contour` or :py:func:`pylab.contourf`. + """ + pylab.xlabel(kwargs.pop('xlabel', '')) + pylab.ylabel(kwargs.pop('ylabel', '')) + pylab.title(kwargs.pop('title', '')) + X, Y = np.meshgrid(x, y) + Z = [] + for y_value in y: + Z.append([]) + for x_value in x: + Z[-1].append(function(x_value, y_value)) + Z = np.array(Z) + num_contours = kwargs.pop('num_contours', 50) + if kwargs.pop('filling', True): + pylab.contourf(X, Y, Z, num_contours, + cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) + else: + pylab.contour(X, Y, Z, num_contours, + cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) + + +def plot_qvalue_curve(qvalues, *args, **kwargs): + """ + Plot a curve with q-values on the X axis and corresponding PSM number + (starting with ``1``) on the Y axis. + + Parameters + ---------- + qvalues : array-like + An array of q-values for sorted PSMs. + xlabel : str, keyword only, optional + Label for the X axis. Default is "q-value". + ylabel : str, keyword only, optional + Label for the Y axis. Default is "# of PSMs". + title : str, keyword only, optional + The title. Empty by default. + *args + Given to :py:func:`pylab.plot` after `x` and `y`. + **kwargs + Given to :py:func:`pylab.plot`. 
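+
+    Example (an illustrative sketch; `qv` is assumed to be a sorted 1-D
+    :py:class:`numpy.ndarray` of q-values, e.g. from :py:func:`qvalues`)::
+
+        plot_qvalue_curve(qv, linewidth=2)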
+ + Returns + ------- + out : matplotlib.lines.Line2D + """ + pylab.xlabel(kwargs.pop('xlabel', 'q-value')) + pylab.ylabel(kwargs.pop('ylabel', '# of PSMs')) + pylab.title(kwargs.pop('title', '')) + return pylab.plot(qvalues, 1 + np.arange(qvalues.size), *args, **kwargs) + + +def _default_plot_spectrum(spectrum, *args, **kwargs): + ax = kwargs.pop('ax', None) or pylab.gca() + if kwargs.pop('centroided', True): + kwargs.setdefault('align', 'center') + kwargs.setdefault('width', 0) + kwargs.setdefault('linewidth', 1) + kwargs.setdefault('edgecolor', 'k') + ax.bar(spectrum['m/z array'], spectrum['intensity array'], *args, **kwargs) + else: + ax.plot(spectrum['m/z array'], spectrum['intensity array'], *args, **kwargs) + return ax + + +def _spectrum_utils_plot(spectrum, *args, **kwargs): + + with SpectrumUtilsColorScheme(kwargs.pop('colors', None)): + spectrum = _spectrum_utils_create_spectrum(spectrum, None, *args, **kwargs) + return sup.spectrum(spectrum) + + +def _spectrum_utils_iplot(spectrum, *args, **kwargs): + import spectrum_utils.iplot as supi + with SpectrumUtilsColorScheme(kwargs.pop('colors', None)): + spectrum = _spectrum_utils_create_spectrum(spectrum, None, *args, **kwargs) + return supi.spectrum(spectrum) + + +_plot_backends = { + 'default': _default_plot_spectrum, + 'spectrum_utils': _spectrum_utils_plot, + 'spectrum_utils.iplot': _spectrum_utils_iplot, +} + + +def plot_spectrum(spectrum, *args, **kwargs): + """ + Plot a spectrum, assuming it is a dictionary containing "m/z array" and "intensity array". + + Parameters + ---------- + spectrum : dict + A dictionary, as returned by pyteomics MS data parsers. + Must contain "m/z array" and "intensity array" keys with decoded arrays. + backend : str, keyword only, optional + One of `{'default', 'spectrum_utils', 'spectrum_utils.iplot'}`. + The `spectrum_utils` backend requires installing :py:mod:`spectrum_utils`. + The `spectrum_utils.iplot` backend requires installing :py:mod:`spectrum_utils[iplot]`. + xlabel : str, keyword only, optional + Label for the X axis. Default is "m/z". + ylabel : str, keyword only, optional + Label for the Y axis. Default is "intensity". + title : str, keyword only, optional + The title. Empty by default. + + centroided : bool, keyword only, optional + Works only for the `default` backend. + If :py:const:`True` (default), peaks of the spectrum are plotted using :py:func:`pylab.bar`. + If :py:const:`False`, the arrays are simply plotted using :py:func:`pylab.plot`. + *args + When using `default` backend: given to :py:func:`pylab.plot` or :py:func:`pylab.bar` (depending on `centroided`). + **kwargs + When using `default` backend: given to :py:func:`pylab.plot` or :py:func:`pylab.bar` (depending on `centroided`). + + min_intensity : float, keyword only, optional + Remove low-intensity peaks; this is a factor of maximum peak intensity. Default is 0 (no filtering). + Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + max_num_peaks : int or None, keyword only, optional + Remove low-intensity peaks; this is the number of peaks to keep. Default is :py:const:`None` (no filtering). + Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + scaling : one of `{'root', 'log', 'rank'}` or None, keyword only, optional + Scaling to apply to peak intensities. Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. 
+ max_intensity : float or None, keyword only, optional + Intensity of the most intense peak relative to which the peaks will be scaled + (the default is :py:const:`None`, which means that no scaling + relative to the most intense peak will be performed). + Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + + Returns + ------- + out : matplotlib.pyplot.Axes + """ + bname = kwargs.pop('backend', 'default') + backend = _plot_backends.get(bname) + if backend is None: + raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format( + bname, '; '.join(_plot_backends))) + + pylab.xlabel(kwargs.pop('xlabel', 'm/z')) + pylab.ylabel(kwargs.pop('ylabel', 'intensity')) + if 'title' in kwargs: + pylab.title(kwargs.pop('title')) + return backend(spectrum, *args, **kwargs) + + +def _default_annotate_spectrum(spectrum, peptide, *args, **kwargs): + + # common kwargs + types = kwargs.pop('ion_types', ('b', 'y')) + aa_mass = kwargs.pop('aa_mass', mass.std_aa_mass) + mass_data = kwargs.pop('mass_data', mass.nist_mass) + ion_comp = kwargs.pop('ion_comp', mass.std_ion_comp) + colors = { + 'a': '#388E3C', + 'b': '#1976D2', + 'c': '#00796B', + 'x': '#7B1FA2', + 'y': '#D32F2F', + 'z': '#F57C00', + } + colors.update(kwargs.pop('colors', {})) + ftol = kwargs.pop('ftol', None) + if ftol is None: + rtol = kwargs.pop('rtol', 1e-5) + text_kw = kwargs.pop('text_kw', dict(ha='center', clip_on=True, backgroundcolor='#ffffff99')) + precursor_charge = kwargs.pop('precursor_charge', None) + if precursor_charge is None: + precursor_charge = _get_precursor_charge(spectrum) + if precursor_charge is None: + raise PyteomicsError('Could not extract precursor charge from spectrum. Please specify `precursor_charge` kwarg.') + maxcharge = kwargs.pop('maxcharge', max(1, precursor_charge - 1)) + ax = kwargs.get('ax', None) + # end of common kwargs + + # backend-specific kwargs + centroided = kwargs.pop('centroided', True) + adjust = kwargs.pop('adjust_text', None) + if adjust or adjust is None: + try: + from adjustText import adjust_text + adjust_kw = kwargs.pop('adjust_kw', dict( + only_move={'text': 'y', 'points': 'y', 'objects': 'y'}, autoalign=False, force_text=(1, 1))) + except ImportError: + if adjust: + raise PyteomicsError('Install adjustText for text adjustment') + adjust = False + else: + if adjust is None: + adjust = True + # end of backend-specific kwargs + + parsed = parser.parse(peptide, True, labels=list(aa_mass) + [parser.std_cterm, parser.std_nterm]) + n = len(parsed) + maxpeak = spectrum['intensity array'].max() + mz, names = {}, {} + for ion in types: + for charge in range(1, maxcharge + 1): + if ion[0] in 'abc': + for i in range(2, n): + mz.setdefault(ion, []).append(mass.fast_mass2(parsed[:i] + [parser.std_cterm], + aa_mass=aa_mass, charge=charge, ion_type=ion, mass_data=mass_data, ion_comp=ion_comp)) + names.setdefault(ion, []).append(ion[0] + str(i - 1) + ion[1:]) + else: + for i in range(1, n - 1): + mz.setdefault(ion, []).append(mass.fast_mass2([parser.std_nterm] + parsed[n - (i + 1):], + aa_mass=aa_mass, charge=charge, ion_type=ion, mass_data=mass_data, ion_comp=ion_comp)) + names.setdefault(ion, []).append(ion[0] + str(i) + ion[1:]) + texts = [] + for ion in types: + c = colors.get(ion, colors.get(ion[0], 'blue')) + matrix = np.abs(spectrum['m/z array'] - np.array(mz[ion]).reshape(-1, 1)) + if ftol is not None: + match = np.where(matrix < ftol) + else: + match = np.where(matrix / spectrum['m/z array'] < rtol) + pseudo_spec = {'m/z array': spectrum['m/z array'][match[1]], 
'intensity array': spectrum['intensity array'][match[1]]} + plot_spectrum(pseudo_spec, centroided=True, edgecolor=c, ax=ax) + for j, i in zip(*match): + x = spectrum['m/z array'][i] + y = spectrum['intensity array'][i] + maxpeak * 0.02 + name = names[ion][j] + texts.append(pylab.text(x, y, name, color=c, **text_kw)) + if adjust: + adjust_text(texts, **adjust_kw) + kwargs.setdefault('zorder', -1) + return plot_spectrum(spectrum, *args, centroided=centroided, **kwargs) + + +def _get_precursor_charge(spectrum): + try: + return mgf.MGFBase.parse_precursor_charge(spectrum['params']['charge'], list_only=True)[0] + except (PyteomicsError, KeyError): + pass + try: + return int(spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state']) + except KeyError: + pass + return None + + +def _get_precursor_mz(spectrum): + try: + return spectrum['params']['pepmass'][0] + except KeyError: + pass + try: + return spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z'] + except KeyError: + pass + if 'attributes' in spectrum: + for attr in spectrum['attributes']: + if attr in {"MS:1000827", "MS:1000744", "MS:1002234"}: + return spectrum['attributes'][attr] + return None + + +def _spectrum_utils_create_spectrum(spectrum, *args, **kwargs): + if sus is None: + raise PyteomicsError('This backend requires `spectrum_utils>=0.4`.') + + # backend-specific parameters + mz_range = kwargs.pop('mz_range', None) + + min_intensity = kwargs.pop('min_intensity', 0.0) + max_num_peaks = kwargs.pop('max_num_peaks', None) + scaling = kwargs.pop('scaling', None) + max_intensity = kwargs.pop('max_intensity', None) + spectrum = sus.MsmsSpectrum( + 'None', kwargs.pop('precursor_mz', None), kwargs.pop('precursor_charge', None), + spectrum['m/z array'], spectrum['intensity array']) + if mz_range: + spectrum = spectrum.set_mz_range(*mz_range) + + spectrum = spectrum.filter_intensity(min_intensity=min_intensity, max_num_peaks=max_num_peaks + ).scale_intensity(scaling, max_intensity) + return spectrum + + +def _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs): + + # common kwargs + aa_mass = kwargs.pop('aa_mass', mass.std_aa_mass) + types = kwargs.pop('ion_types', ('b', 'y')) + tol = kwargs.pop('ftol', None) + if tol is None: + tol = kwargs.pop('rtol', 1e-5) * 1e6 + tol_mode = 'ppm' + else: + tol_mode = 'Da' + + # kwargs.pop('text_kw', None) # not used + + precursor_charge = kwargs.pop('precursor_charge', None) + if precursor_charge is None: + precursor_charge = _get_precursor_charge(spectrum) + if precursor_charge is None: + raise PyteomicsError('Could not extract precursor charge from spectrum. ' + 'Please specify `precursor_charge` keyword argument.') + + maxcharge = kwargs.pop('maxcharge', max(1, precursor_charge - 1)) + # end of common kwargs + + # backend-specific parameters + remove_precursor_peak = kwargs.pop('remove_precursor_peak', False) + + # peptide can be modX or proforma. 
spectrum_utils supports proforma only + aa_comp = kwargs.get('aa_comp') + mod_names = kwargs.get('mod_names') + prefix = kwargs.get('prefix') + + try: + parsed_proforma = proforma.ProForma.parse(peptide) + peptide_pro = peptide + except Exception: + parsed_proforma = None + try: + peptide_pro = parser.to_proforma(peptide, aa_mass=aa_mass, aa_comp=aa_comp, mod_names=mod_names, prefix=prefix) + except Exception: + raise PyteomicsError("Cannot parse {} as ProForma or convert from modX".format(peptide)) + + precursor_mz = kwargs.pop('precursor_mz', None) + if precursor_mz is None: + precursor_mz = _get_precursor_mz(spectrum) + if precursor_mz is None: + try: + if aa_comp: + precursor_mz = mass.calculate_mass(peptide, aa_comp=aa_comp, charge=precursor_charge) + elif not parsed_proforma: + precursor_mz = mass.fast_mass2(peptide, aa_mass=aa_mass, charge=precursor_charge) + else: + precursor_mz = mass.mass_charge_ratio(parsed_proforma.mass, precursor_charge) + except PyteomicsError: + raise PyteomicsError('Cannot obtain precursor m/z, please specify `precursor_mz` argument.') + + spectrum = _spectrum_utils_create_spectrum(spectrum, *args, + precursor_mz=precursor_mz, precursor_charge=precursor_charge, **kwargs) + if remove_precursor_peak: + spectrum = spectrum.remove_precursor_peak(tol, tol_mode) + spectrum = spectrum.annotate_proforma(peptide_pro, tol, tol_mode, types, maxcharge) + + return spectrum + + +class SpectrumUtilsColorScheme: + """Context manager that temporarily changes `spectrum_utils.plot.colors`.""" + def __init__(self, colors): + self.colors = colors + self.previous_colors = sup.colors.copy() + + def __enter__(self): + if self.colors: + sup.colors.update(self.colors) + + def __exit__(self, *args, **kwargs): + sup.colors = self.previous_colors + + +def _spectrum_utils_annotate_plot(spectrum, peptide, *args, **kwargs): + + with SpectrumUtilsColorScheme(kwargs.pop('colors', None)): + spectrum = _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs) + return sup.spectrum(spectrum, annot_kws=kwargs.pop('text_kw', None), ax=kwargs.pop('ax', None)) + + +def _spectrum_utils_annotate_iplot(spectrum, peptide, *args, **kwargs): + import spectrum_utils.iplot as supi + with SpectrumUtilsColorScheme(kwargs.pop('colors', None)): + spectrum = _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs) + return supi.spectrum(spectrum, annot_kws=kwargs.pop('text_kw', None)) + + +_annotation_backends = { + 'default': _default_annotate_spectrum, + 'spectrum_utils': _spectrum_utils_annotate_plot, + 'spectrum_utils.iplot': _spectrum_utils_annotate_iplot, +} + + +def annotate_spectrum(spectrum, peptide, *args, **kwargs): + """Plot a spectrum and annotate matching fragment peaks. + + Parameters + ---------- + spectrum : dict + A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys. + peptide : str + A modX sequence. + backend : str, keyword only, optional + One of `{'default', 'spectrum_utils', 'spectrum_utils.iplot'}`. + The `spectrum_utils` backend requires installing :py:mod:`spectrum_utils`. + The `spectrum_utils.iplot` backend requires installing :py:mod:`spectrum_utils[iplot]`. + ion_types : Container, keyword only, optional + Ion types to be considered for annotation. Default is `('b', 'y')`. + precursor_charge : int, keyword only, optional + If not specified, an attempt is made to extract it from `spectrum`. + maxcharge : int, keyword only, optional + Maximum charge state for fragment ions to be considered. 
Default is `precursor_charge - 1`. + colors : dict, keyword only, optional + Keys are ion types, values are colors to plot the annotated peaks with. Default depends on backend. + ftol : float, keyword only, optional + A fixed m/z tolerance value for peak matching. Alternative to `rtol`. + rtol : float, keyword only, optional + A relative m/z error for peak matching. Default is 10 ppm. + aa_mass : dict, keyword only, optional + A dictionary of amino acid residue masses. + text_kw : dict, keyword only, optional + Keyword arguments for :py:func:`pylab.text`. + xlabel : str, keyword only, optional + Label for the X axis. Default is "m/z". Does not work with `spectrum_utils.iplot` backend. + ylabel : str, keyword only, optional + Label for the Y axis. Default is "intensity". Does not work with `spectrum_utils.iplot` backend. + title : str, keyword only, optional + The title. Empty by default. Does not work with `spectrum_utils.iplot` backend. + ax : matplotlib.pyplot.Axes, keyword only, optional + Axes to draw the spectrum. Does not work with `spectrum_utils.iplot` backend. + + *args + Passed to the plotting backend. + **kwargs + Passed to the plotting backend. + + centroided : bool, keyword only, optional + Passed to :py:func:`plot_spectrum`. Only works with `default` backend. + ion_comp : dict, keyword only, optional + A dictionary defining ion compositions to override :py:const:`pyteomics.mass.std_ion_comp`. + Only works with `default` backend. + mass_data : dict, keyword only, optional + A dictionary of element masses to override :py:const:`pyteomics.mass.nist_mass`. + Only works with `default` backend. + + adjust_text : bool, keyword only, optional + Adjust the overlapping text annotations using :py:mod:`adjustText`. Only works with `default` backend. + adjust_kw : dict, keyword only, optional + Keyword arguments for :py:func:`adjust_text`. Only works with `default` backend. + + remove_precursor_peak : bool, keyword only, optional + Remove precursor peak from spectrum before annotation. Default is :py:const:`False`. + Only works with `spectrum_utils` backend. + min_intensity : float, keyword only, optional + Remove low-intensity peaks; this is a factor of maximum peak intensity. Default is 0 (no filtering). + Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + max_num_peaks : int or None, keyword only, optional + Remove low-intensity peaks; this is the number of peaks to keep. Default is :py:const:`None` (no filtering). + Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + scaling : one of `{'root', 'log', 'rank'}` or None, keyword only, optional + Scaling to apply to peak intensities. Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + max_intensity : float or None, keyword only, optional + Intensity of the most intense peak relative to which the peaks will be scaled + (the default is :py:const:`None`, which means that no scaling + relative to the most intense peak will be performed). + Only works with `spectrum_utils` and `spectrum_utils.iplot` backends. + aa_comp : dict, keyword only, optional + Amino acid compositions, including modified ones. If given, will be used for conversion from *modX* to ProForma. + mod_names : dict or callable, keyword only, optional + If given, will be used for conversion from *modX* to ProForma. + prefix : str, keyword only, optional + If given, will be used for conversion from *modX* to ProForma. 
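+
+    Example (an illustrative sketch; assumes `spec` is a spectrum dict read by
+    :py:func:`pyteomics.mgf.read` and the default backend)::
+
+        annotate_spectrum(spec, 'PEPTIDE', precursor_charge=2, title='PEPTIDE')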
+
+    Returns
+    -------
+    out : matplotlib.pyplot.Axes
+    """
+    bname = kwargs.pop('backend', 'default')
+    backend = _annotation_backends.get(bname)
+    if backend is None:
+        raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format(
+            bname, '; '.join(_annotation_backends)))
+
+    pylab.xlabel(kwargs.pop('xlabel', 'm/z'))
+    pylab.ylabel(kwargs.pop('ylabel', 'intensity'))
+    pylab.title(kwargs.pop('title', ''))
+    return backend(spectrum, peptide, *args, **kwargs)
+
+
+def _spectrum_utils_mirror(spec_top, spec_bottom, spectrum_kws=None, ax=None, **kwargs):
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        ax = sup.mirror(spec_top, spec_bottom, spectrum_kws=spectrum_kws, ax=ax)
+        ax.set_xlabel(kwargs.pop('xlabel', 'm/z'))
+        ax.set_ylabel(kwargs.pop('ylabel', 'intensity'))
+        ax.set_title(kwargs.pop('title', ''))
+        return ax
+
+
+def _spectrum_utils_iplot_mirror(spec_top, spec_bottom, spectrum_kws=None, **kwargs):
+    import spectrum_utils.iplot as supi
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        return supi.mirror(spec_top, spec_bottom, spectrum_kws=spectrum_kws)
+
+
+_mirror_backends = {
+    'spectrum_utils': _spectrum_utils_mirror,
+    'spectrum_utils.iplot': _spectrum_utils_iplot_mirror,
+}
+
+
+def mirror(spec_top, spec_bottom, peptide=None, spectrum_kws=None, ax=None, **kwargs):
+    """Create a mirror plot of two (possibly annotated) spectra using `spectrum_utils`.
+
+    Parameters
+    ----------
+    spec_top : dict
+        A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys.
+    spec_bottom : dict
+        A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys.
+    peptide : str or None, optional
+        A modX sequence or ProForma. If provided, the peaks will be annotated as peptide fragments.
+    spectrum_kws : dict or None, optional
+        Passed to :py:func:`spectrum_utils.plot.mirror`.
+    backend : str, keyword only, optional
+        One of {'spectrum_utils', 'spectrum_utils.iplot'}. Default is 'spectrum_utils'.
+
+        .. note ::
+            Requires :py:mod:`spectrum_utils` or :py:mod:`spectrum_utils[iplot]`, respectively.
+
+    ax : matplotlib.pyplot.Axes or None, optional
+        Passed to :py:func:`spectrum_utils.plot.mirror`. Works only for the 'spectrum_utils' backend.
+    xlabel : str, keyword only, optional
+        Label for the X axis. Default is "m/z". Works only for the 'spectrum_utils' backend.
+    ylabel : str, keyword only, optional
+        Label for the Y axis. Default is "intensity". Works only for the 'spectrum_utils' backend.
+    title : str, keyword only, optional
+        The title. Empty by default. Works only for the 'spectrum_utils' backend.
+
+    **kwargs : same as for :py:func:`annotate_spectrum` for `spectrum_utils` backends.
+
+    Returns
+    -------
+    out : matplotlib.pyplot.Axes
+    """
+
+    spec_gen = _spectrum_utils_create_spectrum if peptide is None else _spectrum_utils_annotate_spectrum
+    spec_top = spec_gen(spec_top, peptide, **kwargs)
+    spec_bottom = spec_gen(spec_bottom, peptide, **kwargs)
+
+    bname = kwargs.pop('backend', 'spectrum_utils')
+    backend = _mirror_backends.get(bname)
+    if backend is None:
+        raise PyteomicsError('Unknown backend name: {}.
Should be one of: {}.'.format( + bname, '; '.join(_mirror_backends))) + backend_kw = {'spectrum_kws': spectrum_kws} + if bname == 'spectrum_utils': + backend_kw['ax'] = ax + backend_kw.update(kwargs) + return backend(spec_top, spec_bottom, **backend_kw) diff --git a/pyteomics/tandem.py b/pyteomics/tandem.py new file mode 100644 index 0000000..ba08b43 --- /dev/null +++ b/pyteomics/tandem.py @@ -0,0 +1,384 @@ +""" +tandem - X!Tandem output file reader +==================================== + +Summary +------- + +`X!Tandem <http://thegpm.org/tandem/>`_ is an open-source proteomic search +engine with a very simple, sophisticated application programming interface +(API): it simply takes an XML file of instructions on its command line, +and outputs the results into an XML file, which has been specified in the input +XML file. The output format is described +`here (PDF) <http://www.thegpm.org/docs/X_series_output_form.pdf>`_. + +This module provides a minimalistic way to extract information from X!Tandem +output files. You can use the old functional interface (:py:func:`read`) or the +new object-oriented interface (:py:class:`TandemXML`) to iterate over entries in +`<group>` elements, i.e. identifications for a certain spectrum. + +Data access +----------- + + :py:class:`TandemXML` - a class representing a single X!Tandem output file. + Other data access functions use this class internally. + + :py:func:`read` - iterate through peptide-spectrum matches in an X!Tandem + output file. Data from a single PSM are converted to a human-readable dict. + + :py:func:`chain` - read multiple files at once. + + :py:func:`chain.from_iterable` - read multiple files at once, using an + iterable of files. + + :py:func:`DataFrame` - read X!Tandem output files into a :py:class:`pandas.DataFrame`. + +Target-decoy approach +--------------------- + + :py:func:`filter` - iterate through peptide-spectrum matches in a chain of + X!Tandem output files, yielding only top PSMs and keeping false discovery rate + (FDR) at the desired level. The FDR is estimated using the target-decoy + approach (TDA). + + :py:func:`filter.chain` - chain a series of filters applied independently to + several files. + + :py:func:`filter.chain.from_iterable` - chain a series of filters applied + independently to an iterable of files. + + :py:func:`filter_df` - filter X!Tandem output files and return a :py:class:`pandas.DataFrame`. + + + :py:func:`is_decoy` - determine if a PSM is from the decoy database. + + :py:func:`fdr` - estimate the FDR in a data set using TDA. + + :py:func:`qvalues` - get an array of scores and local FDR values for a PSM + set using the target-decoy approach. + +Deprecated functions +-------------------- + + :py:func:`iterfind` - iterate over elements in an X!Tandem file. + You can just call the corresponding method of the :py:class:`TandemXML` + object. + +Dependencies +------------ + +This module requires :py:mod:`lxml` and :py:mod:`numpy`. + +------------------------------------------------------------------------------- +""" + +# Copyright 2012 Anton Goloborodko, Lev Levitsky +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from . import xml, auxiliary as aux, _schema_defaults + + +class TandemXML(xml.XML): + """Parser class for TandemXML files.""" + file_format = "TandemXML" + _root_element = "bioml" + _default_schema = _schema_defaults._tandem_schema_defaults + _default_iter_path = 'group[@type="model"]' + _structures_to_flatten = {'domain'} + + def __init__(self, *args, **kwargs): + if 'recursive' not in kwargs: + super(TandemXML, self).__init__(*args, recursive=True, **kwargs) + else: + super(TandemXML, self).__init__(*args, **kwargs) + + __init__.__doc__ = xml.XML.__init__.__doc__ + + def _get_info_smart(self, element, **kw): + info = self._get_info(element, **kw) + # handy simplifications below + if isinstance(info.get('note'), list) and len(info['note']) == 1 and set(info['note'][0]) == {'label', 'note'}: + info['note'] = info['note'][0]['note'] + if 'protein' in info and 'label' in info: + del info['label'] + if 'group' in info: + for g in info['group']: + label = g.pop('label') + type_ = g.pop('type') + info.setdefault(type_, {})[label] = g + del info['group'] + if 'trace' in info: + for t in info['trace']: + info[t.pop('type')] = t + del info['trace'] + if isinstance(info.get('values'), dict): + info['values'] = info['values']['values'] + if isinstance(info.get('attribute'), list): + for a in info.pop('attribute'): + info[a['type']] = float(a['attribute']) + if 'support' in info: + for d in info['support'].get('supporting data', {}).values(): + for label in ['Xdata', 'Ydata']: + d[label]['values'] = d[label]['values'].astype(int) + del d[label]['label'] + if 'fragment ion mass spectrum' in info['support']: + fims = info['support']['fragment ion mass spectrum'] + fims.update(fims.pop('tandem mass spectrum')) + for label in ['Xdata', 'Ydata']: + del info['support']['fragment ion mass spectrum'][label]['label'] + if 'charge' in info: + info['charge'] = int(info['charge']) + if info.get('rt') == '': + info['rt'] = None + + return info + + def _get_schema_info(self, read_schema): + return self._default_schema + + def __next__(self): + n = super(TandemXML, self).__next__() + del n['type'] + return n + + next = __next__ + + +def read(source, iterative=True, **kwargs): + """Parse `source` and iterate through peptide-spectrum matches. + + Parameters + ---------- + source : str or file + A path to a target X!Tandem output file or the file object itself. + + iterative : bool, optional + Defines whether iterative parsing should be used. It helps reduce + memory usage at almost the same parsing speed. Default is + :py:const:`True`. + + Returns + ------- + out : iterator + An iterator over dicts with PSM properties. + """ + return TandemXML(source, read_schema=False, recursive=True, iterative=iterative) + + +def iterfind(source, path, **kwargs): + """Parse `source` and yield info on elements with specified local + name or by specified "XPath". + + .. note:: This function is provided for backward compatibility only. + If you do multiple :py:func:`iterfind` calls on one file, you should + create a :py:class:`TandemXML` object and use its + :py:meth:`!iterfind` method. + + Parameters + ---------- + source : str or file + File name or file-like object. + + path : str + Element name or XPath-like expression. Only local names separated + with slashes are accepted. An asterisk (`*`) means any element. 
+ You can specify a single condition in the end, such as: + ``"/path/to/element[some_value>1.5]"`` + Note: you can do much more powerful filtering using plain Python. + The path can be absolute or "free". Please don't specify + namespaces. + + recursive : bool, optional + If :py:const:`False`, subelements will not be processed when + extracting info from elements. Default is :py:const:`True`. + + iterative : bool, optional + Specifies whether iterative XML parsing should be used. Iterative + parsing significantly reduces memory usage and may be just a little + slower. When `retrieve_refs` is :py:const:`True`, however, it is + highly recommended to disable iterative parsing if possible. + Default value is :py:const:`True`. + + Returns + ------- + out : iterator + """ + return TandemXML(source, **kwargs).iterfind(path, **kwargs) + + +# chain = aux._make_chain(read, 'read') +chain = aux.ChainBase._make_chain(TandemXML) + + +def _is_decoy_prefix(psm, prefix='DECOY_'): + """Given a PSM dict, return :py:const:`True` if all protein names for + the PSM start with `prefix`, and :py:const:`False` otherwise. + + Parameters + ---------- + psm : dict + A dict, as yielded by :py:func:`read`. + prefix : str, optional + A prefix used to mark decoy proteins. Default is `'DECOY_'`. + + Returns + ------- + out : bool + """ + return all(prot['label'].startswith(prefix) for prot in psm['protein']) + + +def _is_decoy_suffix(psm, suffix='_DECOY'): + """Given a PSM dict, return :py:const:`True` if all protein names for + the PSM end with `suffix`, and :py:const:`False` otherwise. + + Parameters + ---------- + psm : dict + A dict, as yielded by :py:func:`read`. + suffix : str, optional + A suffix used to mark decoy proteins. Default is `'_DECOY'`. + + Returns + ------- + out : bool + """ + return all(prot['label'].endswith(suffix) for prot in psm['protein']) + + +is_decoy = _is_decoy_prefix +qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, operator.itemgetter('expect')) +filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, operator.itemgetter('expect'), qvalues) +fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix) +filter.chain = aux._make_chain(filter, 'filter', True) + + +def DataFrame(*args, **kwargs): + """Read X!Tandem output files into a :py:class:`pandas.DataFrame`. + + Requires :py:mod:`pandas`. + + Parameters + ---------- + + sep : str or None, optional + Some values related to PSMs (such as protein information) are variable-length + lists. If `sep` is a :py:class:`str`, they will be packed into single string using + this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is + :py:const:`None`. + + pd_kwargs : dict, optional + Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor. + + *args + Passed to :py:func:`chain`. + + **kwargs + Passed to :py:func:`chain`. 
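+
+    Example (a sketch; ``results.t.xml`` is a hypothetical X!Tandem output file)::
+
+        >>> from pyteomics import tandem
+        >>> df = tandem.DataFrame('results.t.xml', sep=';')
+        >>> df['expect'].describe()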
+ + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + data = [] + prot_keys = ['id', 'uid', 'label', 'expect'] + pep_keys = ['id', 'pre', 'post', 'start', 'end'] + sep = kwargs.pop('sep', None) + pd_kwargs = kwargs.pop('pd_kwargs', {}) + with chain(*args, **kwargs) as f: + for item in f: + info = {} + for k, v in item.items(): + if isinstance(v, (str, int, float)): + info[k] = v + protein = item['protein'][0] + + for key in prot_keys: + vals = [prot.get(key) for prot in item['protein']] + if sep is not None: + vals = sep.join(str(val) if val is not None else '' for val in vals) + info['protein_' + key] = vals + for key in pep_keys: + vals = [prot['peptide'].get(key) for prot in item['protein']] + if sep is not None: + vals = sep.join(str(val) if val is not None else '' for val in vals) + info['peptide_' + key] = vals + aa = protein['peptide'].pop('aa', []) + info['modifications'] = ','.join('{0[modified]:.3f}@{0[type]}'.format(x) for x in aa) + for k in prot_keys: + protein.pop(k, None) + for k in pep_keys: + protein['peptide'].pop(k, None) + info.update(protein['peptide']) + fims = item['support']['fragment ion mass spectrum'] + try: + info['scan'] = fims['note'] + except KeyError: + info['scan'] = fims['id'] + data.append(info) + return pd.DataFrame(data, **pd_kwargs) + + +def filter_df(*args, **kwargs): + """Read X!Tandem output files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. + Positional arguments can be X!Tandem output files or DataFrames. + + Requires :py:mod:`pandas`. + + Parameters + ---------- + key : str / iterable / callable, optional + Default is 'expect'. + is_decoy : str / iterable / callable, optional + Default is to check if all strings in the "protein" column start with `'DECOY_'` + *args + Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. + **kwargs + Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. + + Returns + ------- + out : pandas.DataFrame + """ + import pandas as pd + sep = kwargs.get('sep') + kwargs.setdefault('key', 'expect') + if all(isinstance(arg, pd.DataFrame) for arg in args): + if len(args) > 1: + df = pd.concat(args) + else: + df = args[0] + else: + read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs} + df = DataFrame(*args, **read_kw) + + if 'is_decoy' not in kwargs: + if sep is not None: + if 'decoy_suffix' in kwargs: + kwargs['is_decoy'] = df['protein_label'].str.split(sep).apply( + lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) + else: + kwargs['is_decoy'] = df['protein_label'].str.split(sep).apply( + lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) + else: + if 'decoy_suffix' in kwargs: + kwargs['is_decoy'] = df['protein_label'].apply( + lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) + else: + kwargs['is_decoy'] = df['protein_label'].apply( + lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) + + return aux.filter(df, **kwargs) diff --git a/pyteomics/traml.py b/pyteomics/traml.py new file mode 100644 index 0000000..66ed27e --- /dev/null +++ b/pyteomics/traml.py @@ -0,0 +1,235 @@ +""" +traml - targeted MS transition data in TraML format +=================================================== + +Summary +------- + +TraML is a standard rich XML-format for targeted mass spectrometry method definitions. +Please refer to `psidev.info <http://www.psidev.info/traml>`_ +for the detailed specification of the format and structure of TraML files. 
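+
+A quick taste of the interface (a sketch; ``transitions.traML`` is a hypothetical
+file name)::
+
+    >>> from pyteomics import traml
+    >>> with traml.TraML('transitions.traML') as reader:
+    ...     for transition in reader:
+    ...         print(transition['id'])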
+
+This module provides a minimalistic way to extract information from TraML
+files. You can use the object-oriented interface (:class:`TraML` instances) to
+access target definitions and transitions. :class:`TraML` objects also support
+indexing with entity IDs directly.
+
+Data access
+-----------
+
+  :py:class:`TraML` - a class representing a single TraML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through transitions in TraML format.
+
+  :py:func:`chain` - read multiple TraML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+Controlled Vocabularies
+~~~~~~~~~~~~~~~~~~~~~~~
+TraML relies on controlled vocabularies to describe its contents extensibly. See
+`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_
+for more details on how they are used.
+
+Handling Time Units and Other Qualified Quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TraML contains information which may be described as using a variety of different time units.
+See `Unit Handling <../data.html#unit-handling>`_ for more information.
+
+Deprecated functions
+--------------------
+
+  :py:func:`version_info` - get version information about the TraML file.
+  You can just read the corresponding attribute of the :py:class:`TraML` object.
+
+  :py:func:`iterfind` - iterate over elements in a TraML file.
+  You can just call the corresponding method of the :py:class:`TraML` object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+-------------------------------------------------------------------------------
+"""
+
+# Copyright 2018 Joshua Klein, Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+from . import xml, _schema_defaults, auxiliary as aux
+
+
+class TraML(xml.MultiProcessingXML, xml.IndexSavingXML):
+    """Parser class for TraML files."""
+    file_format = 'TraML'
+    _root_element = 'TraML'
+    _default_schema = _schema_defaults._traml_schema_defaults
+    _default_version = '1.0.0'
+
+    _default_iter_tag = 'Transition'
+    _indexed_tags = {
+        'Transition',
+        'Peptide',
+        'Compound',
+        'Target',
+        'Protein',
+    }
+
+    _element_handlers = xml.XML._element_handlers.copy()
+    _element_handlers.update({
+        'Modification': xml.XML._promote_empty_parameter_to_name,
+        'Interpretation': xml.XML._promote_empty_parameter_to_name,
+        'Software': xml.XML._promote_empty_parameter_to_name,
+    })
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('retrieve_refs', True)
+        super(TraML, self).__init__(*args, **kwargs)
+
+    def _get_info_smart(self, element, **kw):
+        kwargs = dict(kw)
+        rec = kwargs.pop('recursive', None)
+        info = self._get_info(
+            element,
+            recursive=(rec if rec is not None else True),
+            **kwargs)
+        return info
+
+    def _retrieve_refs(self, info, **kwargs):
+        """Retrieves and embeds the data for each attribute in `info` that
+        ends in `Ref`. Removes the id attribute from `info`."""
+        for k, v in dict(info).items():
+            if k[-3:] in {'Ref', 'ref'}:
+                if isinstance(v, str):
+                    key = v
+                elif isinstance(v, dict):
+                    key = v['ref']
+                else:
+                    if k != 'ref':
+                        info[k[:-3]] = info.pop(k)
+                    continue
+                try:
+                    by_id = self.get_by_id(key, retrieve_refs=True)
+                except KeyError:
+                    warnings.warn('Ignoring unresolved reference: ' + key)
+                else:
+                    if k == 'ref':
+                        info.update(by_id)
+                    else:
+                        # by_id.pop('id', None)
+                        info[k[:-3]] = by_id
+                    del info[k]
+
+
+def read(source, retrieve_refs=True, read_schema=False, iterative=True, use_index=False, huge_tree=False):
+    """Parse `source` and iterate through transitions.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target TraML file or the file object itself.
+
+    retrieve_refs : bool, optional
+        If :py:const:`True`, additional information from references will be
+        automatically added to the results. The file processing time will
+        increase. Default is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the TraML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you don't like to get the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        spectrum elements. Default is :py:const:`False`.
+
+    huge_tree : bool, optional
+        This option is passed to the `lxml` parser and defines whether
+        security checks for XML tree depth and node size should be disabled.
+        Default is :py:const:`False`.
+        Enable this option for trusted files to avoid XMLSyntaxError exceptions
+        (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`).
+
+    Returns
+    -------
+    out : TraML
+        A :py:class:`TraML` object, suitable for iteration and possibly random access.
+    """
+
+    return TraML(source, retrieve_refs=retrieve_refs, read_schema=read_schema, iterative=iterative,
+                 use_index=use_index, huge_tree=huge_tree)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create a :py:class:`TraML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the TraML header. Otherwise, use default
+        parameters. Not recommended without an Internet connection or
+        if you don't like to get the related warnings.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return TraML(source, **kwargs).iterfind(path, **kwargs)
+
+
+version_info = xml._make_version_info(TraML)
+
+chain = aux.ChainBase._make_chain(TraML)
diff --git a/pyteomics/usi.py b/pyteomics/usi.py
new file mode 100644
index 0000000..57a265b
--- /dev/null
+++ b/pyteomics/usi.py
@@ -0,0 +1,527 @@
+"""
+usi - Universal Spectrum Identifier (USI) parser and minimal PROXI client
+=========================================================================
+
+Summary
+-------
+`USI <http://www.psidev.info/usi>`_ is a standardized method of referencing a specific
+spectrum in a dataset, possibly attached to an interpretation. This module includes a
+:class:`USI` type which can represent these constructs, :meth:`~USI.parse` them and
+reconstruct them.
+
+One use-case for USI is to request spectrum information from a `PROXI <http://www.psidev.info/proxi>`_
+service host. PROXI services are available from several of the major national proteomics data hosts,
+including MassIVE, PeptideAtlas, PRIDE, and jPOST.
+
+.. seealso::
+    LeDuc, Richard D., Eric W. Deutsch, Pierre-Alain Binz, Ryan T. Fellers, Anthony J. Cesnik,
+    Joshua A. Klein, Tim Van Den Bossche, et al.
+    "Proteomics Standards Initiative's ProForma 2.0: Unifying the Encoding of Proteoforms and Peptidoforms."
+    ArXiv:2109.11352 [q-Bio], September 23, 2021. http://arxiv.org/abs/2109.11352.
+
+
+Data access
+-----------
+
+    :py:class:`USI` for representing Universal Spectrum Identifiers. Call :meth:`USI.parse` to parse a USI
+    string.
+
+    :py:func:`proxi` to request a USI from a remote service. Provides access to the PeptideAtlas, MassIVE,
+    PRIDE and jPOST hosts.
+
+"""
+import json
+import warnings
+import threading
+import multiprocessing
+
+from collections import namedtuple, defaultdict
+
+try:
+    from multiprocessing.dummy import Pool as ThreadPool
+except ImportError:
+    ThreadPool = None
+
+try:
+    from urllib2 import Request, urlopen
+except ImportError:
+    from urllib.request import Request, urlopen
+
+try:
+    import numpy as np
+
+    def coerce_array(array_data):
+        return np.array([float(v) for v in array_data])
+
+except ImportError:
+
+    def coerce_array(array_data):
+        return [float(v) for v in array_data]
+
+from .auxiliary import PyteomicsError
+
+
+class USI(namedtuple("USI", ['protocol', 'dataset', 'datafile', 'scan_identifier_type', 'scan_identifier', 'interpretation'])):
+    '''Represent a Universal Spectrum Identifier (USI).
+
+    .. note::
+        This implementation will capture the interpretation component but will not interpret it at this time.
+
+    Attributes
+    ----------
+    protocol: str
+        The protocol to use to access the data (usually mzspec)
+    dataset: str
+        The name or accession number for the dataset the spectrum resides in
+    datafile: str
+        The basename of the data file from :attr:`dataset` to retrieve the spectrum from
+    scan_identifier_type: str
+        The format of the scan identifier, one of (scan, index, nativeId, trace)
+    scan_identifier: str
+        A usually numerical but potentially comma separated value encoded as a string to uniquely
+        identify the spectrum to be recovered from :attr:`datafile` in :attr:`dataset`.
+ interpretation: str + The trailing material of the USI, such as the ProForma peptide sequence and charge + ''' + def __str__(self): + return ':'.join(filter(lambda x: x is not None, self)) + + @classmethod + def parse(cls, usi): + '''Parse a USI string into a :class:`USI` object. + + Parameters + ---------- + usi: str + The USI string to parse + + Returns + ------- + USI + ''' + return cls(*_usi_parser(str(usi))) + + +def cast_numeric(value): + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + return value + + +def _usi_parser(usi): + tokens = usi.split(":", 5) + protocol = tokens[0] + dataset = tokens[1] + datafile = tokens[2] + scan_identifier_type = tokens[3] + scan_identifier = tokens[4] + try: + interpretation = tokens[5] + except IndexError: + interpretation = None + return (protocol, dataset, datafile, scan_identifier_type, scan_identifier, interpretation) + + +class _PROXIBackend(object): + '''A base class for all PROXI backends to implement the gory details of HTTP requests + and protocol parsing. + + If special processing needs to be done to interpret the spectrum returned from the service + provider, override the :meth:`_coerce` method. + + If extra information needs to be provided to the service provider for them to fulfill the + request not passed through the URL, override the :meth:`_request` method. + + Attributes + ---------- + name: str + The name of the backend service + url_template: str + The URL with {} fields to populate with the USI and any other relevant options, like protocol version + or the like. + options: dict + Additional options to be used when preparing the request URL. + ''' + def __init__(self, name, url_template, **kwargs): + kwargs.setdefault('version', '0.1') + self.name = name + self.url_template = url_template + self.options = kwargs + + def __repr__(self): + return "{self.__class__.__name__}({self.options})".format(self=self) + + def _request(self, usi): + url = self.url_template.format(usi=usi, **self.options) + req = Request(url) + response = urlopen(req) + if response.getcode() != 200: + raise ValueError("PROXI Service Response Code %r" % (response.getcode())) + data = response.read().decode("utf-8") + data = json.loads(data) + return data + + def get(self, usi): + '''Retrieve a ``USI`` from the host PROXI service over the network. + + Parameters + ---------- + usi : str or :class:`USI` + The universal spectrum identifier to retrieve. + + Returns + ------- + dict: + The spectrum as represented by the requested PROXI host. + ''' + data = self._request(usi) + result = self._coerce(data) + return result + + def _coerce(self, data): + '''Override and extend this method to change how the spectrum information is refined. + + This implementation just deals with properly formatting the peak arrays and doing minor + cosmetic name normalization. 
+ + Parameters + ---------- + data: dict + The raw mzSpecML representation parsed from JSON + + Returns + ------- + dict: + The coerced spectrum data of appropriate types + ''' + if isinstance(data, list): + data_collection = data + data = data_collection[0] + result = {} + result['attributes'] = data.pop('attributes', []) + for attrib in result['attributes']: + if 'value' in attrib and isinstance(attrib['value'], str) and attrib['value'][0].isdigit(): + try: + attrib['value'] = cast_numeric(attrib['value']) + except TypeError: + continue + result['m/z array'] = coerce_array(data.pop('mzs', [])) + result['intensity array'] = coerce_array(data.pop('intensities', [])) + for key, value in data.items(): + if key in result: + raise ValueError( + "Attempting to set explicit value for {key!r}".format(key=key)) + result[key] = value + return result + + def __call__(self, usi): + return self.get(usi) + + +class PeptideAtlasBackend(_PROXIBackend): + _url_template = "http://www.peptideatlas.org/api/proxi/v{version}/spectra?resultType=full&usi={usi!s}" + + def __init__(self, **kwargs): + + super(PeptideAtlasBackend, self).__init__( + 'PeptideAtlas', self._url_template, **kwargs) + + +class MassIVEBackend(_PROXIBackend): + + _url_template = "http://massive.ucsd.edu/ProteoSAFe/proxi/v{version}/spectra?resultType=full&usi={usi}" + + def __init__(self, **kwargs): + super(MassIVEBackend, self).__init__( + 'MassIVE', self._url_template, **kwargs) + + +class PRIDEBackend(_PROXIBackend): + _url_template = "http://wwwdev.ebi.ac.uk/pride/proxi/archive/v{version}/spectra?resultType=full&usi={usi}" + + def __init__(self, **kwargs): + super(PRIDEBackend, self).__init__( + 'PRIDE', self._url_template, **kwargs) + + +class JPOSTBackend(_PROXIBackend): + _url_template = 'https://repository.jpostdb.org/proxi/spectra?resultType=full&usi={usi}' + + def __init__(self, **kwargs): + super(JPOSTBackend, self).__init__('jPOST', self._url_template, **kwargs) + kwargs.pop("version", None) + + +class ProteomeExchangeBackend(_PROXIBackend): + _url_template = 'http://proteomecentral.proteomexchange.org/api/proxi/v{version}/spectra?resultType=full&usi={usi!s}' + + def __init__(self, **kwargs): + + super(ProteomeExchangeBackend, self).__init__( + 'ProteomeExchange', self._url_template, **kwargs) + + +class PROXIAggregator(object): + '''Aggregate across requests across multiple PROXI servers. + + Will attempt to coalesce responses from responding servers into a single spectrum + representation. + + Attributes + ---------- + backends : :class:`dict` mapping :class:`str` to :class:`_PROXIBackend` + The backend servers to query. Defaults to the set of all available backends. + n_threads : int + The number of threads to run concurrently to while making requests. Defaults + to the number of servers to query. + timeout : float + The number of seconds to wait for a response. + ephemeral_pool : bool + Whether or not to tear down the thread pool between requests. 
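+
+    Example (a sketch; requires network access, using a USI from the USI
+    specification examples)::
+
+        >>> from pyteomics import usi
+        >>> aggregator = usi.PROXIAggregator(timeout=30)
+        >>> result = aggregator.get(
+        ...     'mzspec:PXD004732:01650b_BC2-TUM_first_pool_53_01_01-3xHCD-1h-R2:scan:41840')
+        >>> sorted(result['responders'])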
+    '''
+
+    _coalesce_resolution_methods = ("first", )
+
+    def __init__(self, backends=None, n_threads=None, timeout=15, merge=True, ephemeral_pool=True, **kwargs):
+        if backends is None:
+            backends = {k: v() for k, v in _proxies.items()}
+        if n_threads is None:
+            n_threads = len(backends)
+
+        self.lock = threading.RLock()
+
+        self.timeout = timeout
+        self.backends = backends
+        self.n_threads = n_threads
+        self.ephemeral_pool = ephemeral_pool
+        self.pool = None
+        self.merge = merge
+
+    def _init_pool(self):
+        if ThreadPool is None:
+            return False
+        if self.pool is not None:
+            return True
+        with self.lock:
+            if self.pool is None:
+                self.pool = ThreadPool(self.n_threads)
+        return True
+
+    def _clean_up_pool(self):
+        if self.pool:
+            self.pool.close()
+            self.pool.terminate()
+            self.pool = None
+
+    def _fetch_usi(self, usi):
+        use_pool = self._init_pool()
+        agg = []
+        if use_pool:
+            with self.lock:
+                for backend in self.backends.values():
+                    result = self.pool.apply_async(backend.get, (usi, ))
+                    agg.append((backend, result))
+                tmp = []
+                for backend, res in agg:
+                    try:
+                        res = res.get(self.timeout)
+                        tmp.append((backend, res))
+                    except (multiprocessing.TimeoutError, Exception) as err:
+                        tmp.append((backend, err))
+                agg = tmp
+            if self.ephemeral_pool:
+                self._clean_up_pool()
+        else:
+            for backend in self.backends.values():
+                try:
+                    agg.append((backend, backend.get(usi)))
+                except Exception as err:
+                    agg.append((backend, err))
+                    continue
+        return agg
+
+    def coalesce(self, responses, method='first'):
+        '''Merge responses from disparate servers into a single spectrum representation.
+
+        The merging process will use the first of every array encountered, and all unique
+        attributes.
+
+        Parameters
+        ----------
+        responses : list
+            A list of response values, pairs (:class:`_PROXIBackend` and either
+            :class:`dict` or :class:`Exception`).
+        method : str
+            The name of the coalescence technique to use. Currently only "first" is
+            supported.
+
+        Returns
+        -------
+        result : :class:`dict`
+            The coalesced spectrum
+        '''
+        if method not in self._coalesce_resolution_methods:
+            raise ValueError("Coalescence method %r not recognized" % (method, ))
+
+        def collapse_attribute(values):
+            try:
+                acc = list(set(v['value'] for v in values))
+            except TypeError:
+                acc = []
+                for v in values:
+                    if v['value'] not in acc:
+                        acc.append(v['value'])
+
+            result = []
+            template = values[0].copy()
+            for v in acc:
+                t = template.copy()
+                t['value'] = v
+                result.append(t)
+            return result
+
+        arrays = {}
+        attributes = defaultdict(list)
+
+        found = []
+        error = []
+
+        for backend, response in responses:
+            if isinstance(response, Exception):
+                error.append((backend.name, response))
+                continue
+            else:
+                found.append(backend.name)
+            for array_name in ('m/z array', 'intensity array'):
+                if array_name not in arrays:
+                    arrays[array_name] = response[array_name]
+                else:
+                    array = response[array_name]
+                    if len(array) != len(arrays[array_name]):
+                        warnings.warn("Length mismatch from %s for %s" %
+                                      (backend.name, array_name))
+                        arrays[array_name] = max((array, arrays[array_name]), key=len)
+                    elif not np.allclose(array, arrays[array_name]):
+                        warnings.warn("Value mismatch from %s for %s" %
+                                      (backend.name, array_name))
+            for attr in response['attributes']:
+                attributes[attr.get('accession', attr.get('name'))].append(attr)
+
+        finalized_attributes = []
+        for k, v in attributes.items():
+            finalized_attributes.extend(collapse_attribute(v))
+
+        result = {"responders": found, 'errors': error, 'attributes': finalized_attributes}
+        result.update(arrays)
+        if 'm/z array' not in result:
+            raise ValueError("No valid responses found")
+        return result
+
+    def tag_with_source(self, responses):
+        '''Mark each response with its source.
+
+        Parameters
+        ----------
+        responses : list
+            A list of response values, pairs (:class:`_PROXIBackend` and either
+            :class:`dict` or :class:`Exception`).
+
+        Returns
+        -------
+        result : list[dict]
+            The tagged :class:`dict` for each response.
+        '''
+        output = []
+        for backend, response in responses:
+            if isinstance(response, dict):
+                response['source'] = backend
+            else:
+                response = {
+                    "source": backend,
+                    "error": response
+                }
+            output.append(response)
+        return output
+
+    def get(self, usi):
+        '''Retrieve a ``USI`` from each PROXI service over the network.
+
+        Parameters
+        ----------
+        usi : str or :class:`USI`
+            The universal spectrum identifier to retrieve.
+
+        Returns
+        -------
+        result : dict or list[dict]
+            The spectrum coalesced from all responding PROXI hosts if :attr:`merge` is :const:`True`,
+            or a list of responses marked by host.
+        '''
+        agg = self._fetch_usi(usi)
+        if self.merge:
+            return self.coalesce(agg)
+        else:
+            return self.tag_with_source(agg)
+
+    def __call__(self, usi):
+        return self.get(usi)
+
+    def __del__(self):
+        self._clean_up_pool()
+
+
+_proxies = {
+    "peptide_atlas": PeptideAtlasBackend,
+    "massive": MassIVEBackend,
+    "pride": PRIDEBackend,
+    "jpost": JPOSTBackend,
+    "proteome_exchange": ProteomeExchangeBackend,
+}
+
+default_backend = 'peptide_atlas'
+
+AGGREGATOR_KEY = "aggregator"
+AGGREGATOR = PROXIAggregator()
+
+
+def proxi(usi, backend=default_backend, **kwargs):
+    '''Retrieve a ``USI`` from a `PROXI <http://www.psidev.info/proxi>`_ service.
+
+    Parameters
+    ----------
+    usi : str or :class:`USI`
+        The universal spectrum identifier to request.
+ backend : str or :class:`Callable` + Either the name of a PROXI host (peptide_atlas, massive, pride, jpost, or aggregator), + or a callable object (which :class:`_PROXIBackend` instances are) which will be used + to resolve the USI. The "aggregator" backend will use a :class:`PROXIAggregator` instance + which will request the same USI from all the registered servers and attempt to merge their + responses into a single whole. See :meth:`PROXIAggregator.coalesce` for more details on the + merging process. + **kwargs: + extra arguments passed when constructing the backend by name. + + Returns + ------- + dict : + The spectrum as represented by the requested PROXI host. + ''' + if isinstance(backend, str): + if backend == AGGREGATOR_KEY: + backend = AGGREGATOR + elif backend in _proxies: + backend = _proxies[backend](**kwargs) + else: + raise PyteomicsError("Unknown PROXI backend name: {}.".format(backend)) + elif isinstance(backend, type) and issubclass(backend, (_PROXIBackend, PROXIAggregator)): + backend = backend(**kwargs) + elif callable(backend): + backend = backend + else: + raise TypeError("Unrecognized backend type: {0.__name__}".format(type(backend))) + return backend(usi) diff --git a/pyteomics/version.py b/pyteomics/version.py new file mode 100644 index 0000000..66aca50 --- /dev/null +++ b/pyteomics/version.py @@ -0,0 +1,66 @@ +""" +version - Pyteomics version information +======================================= + +This module is provided for convenience and captures information about the current version number of Pyteomics. + +Classes +------- + + :py:class:`VersionInfo` - a namedtuple for version numbers that supports comparisons and can be initialized + from a version string. + +Constants +--------- + + :py:const:`version` - a string with the current version. + + :py:const:`version_info` - a tuple with structured information about the current version. 
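+
+Example (a brief sketch; the expected outputs below assume the version string
+defined in this module)::
+
+    >>> from pyteomics import version
+    >>> version.version_info >= '4.0'
+    True
+    >>> version.version_info.major
+    '4'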
+
+"""
+
+__version__ = '4.6.4b3'
+
+from collections import namedtuple
+import re
+
+
+class VersionInfo(namedtuple('VersionInfo', ('major', 'minor', 'micro', 'releaselevel', 'serial'))):
+    """Tuple mimicking :py:const:`sys.version_info`"""
+    def __new__(cls, version_str):
+        if isinstance(version_str, str):
+            groups = re.match(r'(\d+)\.(\d+)(?:\.)?(\d+)?([a-zA-Z]+)?(\d+)?', version_str).groups()
+            inst = super(VersionInfo, cls).__new__(cls, *groups)
+        else:
+            inst = super(VersionInfo, cls).__new__(cls, *(str(x) if x is not None else x for x in version_str))
+        inst._version_str = version_str
+        inst._version_ints = tuple(int(x) if isinstance(x, str) and x.isdigit() else 0 for x in inst)
+        return inst
+
+    def __str__(self):
+        return 'Version {}'.format(self._version_str)
+
+    def __lt__(self, other):
+        if not isinstance(other, VersionInfo):
+            other = VersionInfo(other)
+        return self._version_ints < other._version_ints
+
+    def __gt__(self, other):
+        if not isinstance(other, VersionInfo):
+            other = VersionInfo(other)
+        return self._version_ints > other._version_ints
+
+    def __le__(self, other):
+        return self == other or self < other
+
+    def __ge__(self, other):
+        return self == other or self > other
+
+    def __eq__(self, other):
+        if not isinstance(other, VersionInfo):
+            other = VersionInfo(other)
+        return super(VersionInfo, self).__eq__(other)
+
+    # defining __eq__ disables inherited hashing in Python 3; restore it explicitly
+    __hash__ = tuple.__hash__
+
+
+version_info = VersionInfo(__version__)
+version = __version__
diff --git a/pyteomics/xml.py b/pyteomics/xml.py
new file mode 100644
index 0000000..db960c7
--- /dev/null
+++ b/pyteomics/xml.py
@@ -0,0 +1,1335 @@
+"""
+xml - utilities for XML parsing
+===============================
+
+This module is not intended for end users. It implements the abstract classes
+for all XML parsers, :py:class:`XML` and :py:class:`IndexedXML`, and some utility functions.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml` and :py:mod:`numpy`.
+
+--------------------------------------------------------------------------------
+
+"""
+
+# Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import re +import socket +from traceback import format_exc +import warnings +from collections import OrderedDict, namedtuple +from itertools import islice +from lxml import etree +import numpy as np + +from .auxiliary import FileReader, PyteomicsError, basestring, _file_obj, HierarchicalOffsetIndex +from .auxiliary import unitint, unitfloat, unitstr, cvstr +from .auxiliary import _keepstate_method as _keepstate +from .auxiliary import BinaryDataArrayTransformer +from .auxiliary import TaskMappingMixin, IndexedReaderMixin, IndexSavingMixin + +try: # Python 2.7 + from urllib2 import urlopen, URLError +except ImportError: # Python 3.x + from urllib.request import urlopen, URLError + + +def _local_name(element): + """Strip namespace from the XML element's name""" + tag = element.tag + if tag and tag[0] == '{': + return tag.rpartition('}')[2] + return tag + + +def xsd_parser(schema_url): + """Parse an XSD file from the specified URL into a schema dictionary + that can be used by :class:`XML` parsers to automatically cast data to + the appropriate type. + + Parameters + ---------- + schema_url : str + The URL to retrieve the schema from + + Returns + ------- + dict + """ + ret = {} + if not (schema_url.startswith('http://') or + schema_url.startswith('https://') or + schema_url.startswith('file://')): + schema_url = 'file://' + schema_url + schema_file = urlopen(schema_url) + p = etree.XMLParser(remove_comments=True) + schema_tree = etree.parse(schema_file, parser=p) + types = {'ints': {'int', 'long', 'nonNegativeInteger', 'positiveInt', + 'integer', 'unsignedInt'}, + 'floats': {'float', 'double'}, + 'bools': {'boolean'}, + 'intlists': {'listOfIntegers'}, + 'floatlists': {'listOfFloats'}, + 'charlists': {'listOfChars', 'listOfCharsOrAny'}} + for k, val in types.items(): + tuples = set() + for elem in schema_tree.iter(): + if _local_name(elem) == 'attribute' and elem.attrib.get( + 'type', '').split(':')[-1] in val: + anc = elem.getparent() + anc_name = _local_name(anc) + while not ( + (anc_name == 'complexType' and 'name' in anc.attrib) or anc_name == 'element'): + anc = anc.getparent() + anc_name = _local_name(anc) + if anc is None: + break + else: + if anc_name == 'complexType': + elnames = [x.attrib['name'] for x in + schema_tree.iter() + if x.attrib.get('type', '').split(':')[-1] == anc.attrib['name']] + else: + elnames = (anc.attrib['name'],) + for elname in elnames: + tuples.add( + (elname, elem.attrib['name'])) + ret[k] = tuples + ret['lists'] = set(elem.attrib['name'] for elem in schema_tree.xpath( + '//*[local-name()="element"]') if 'name' in elem.attrib and + elem.attrib.get('maxOccurs', '1') != '1') + return ret + + +class XMLValueConverter(object): + # Adapted from http://stackoverflow.com/questions/2764269/parsing-an-xsduration-datatype-into-a-python-datetime-timedelta-object + _duration_parser = re.compile( + (r'(?P<sign>-?)P(?:(?P<years>\d+\.?\d*)Y)?(?:(?P<months>\d+\.?\d*)M)?(?:(?P<days>\d+\.?\d*)D)?(?:T(?:(?P<hours>\d+\.?\d*)H)?(?:(?P<minutes>\d+\.?\d*)M)?(?:(?P<seconds>\d+\.?\d*)S)?)?')) + + @classmethod + def duration_str_to_float(cls, s): + # Not a duration, so pass along + if not s.startswith('P'): + try: + return unitfloat(s, 'duration') + except ValueError: + return unitstr(s, 'duration') + match = cls._duration_parser.search(s) + if match: + matchdict = match.groupdict() + hours = float(matchdict.get('hours', 0) or 0) + minutes = float(matchdict.get('minutes', 0) or 0) + seconds = float(matchdict.get('seconds', 0) or 0) + minutes += hours * 60. 
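+            # NB: the regular expression above also captures years/months/days,
+            # but only the H/M/S components are folded into the returned total,
+            # expressed in minutes (hours converted here, seconds added below)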
+ minutes += (seconds / 60.) + return unitfloat(minutes, 'minute') + else: + return unitstr(s, 'duration') + + @classmethod + def str_to_bool(cls, s): + if s.lower() in {'true', '1', 'y'}: + return True + if s.lower() in {'false', '0', 'n'}: + return False + raise PyteomicsError('Cannot convert string to bool: ' + s) + + @classmethod + def str_to_num(cls, s, numtype): + return numtype(s) if s else None + + @classmethod + def to(cls, t): + def convert_from(s): + return cls.str_to_num(s, t) + return convert_from + + @classmethod + def converters(cls): + return { + 'ints': cls.to(unitint), 'floats': cls.to(unitfloat), 'bools': cls.str_to_bool, + 'intlists': lambda x: np.fromstring(x.replace('\n', ' '), dtype=int, sep=' '), + 'floatlists': lambda x: np.fromstring(x.replace('\n', ' '), sep=' '), + 'charlists': list, + 'duration': cls.duration_str_to_float + } + + +class _XMLParam(namedtuple("XMLParam", ("name", "value", "type"))): + '''A holder for semantic parameters used in several common XML formats + + Attributes + ---------- + name: :class:`~.cvstr` + The name of the attribute, carrying the accession and unit information + value: :class:`~.unitfloat`, :class:`~.unitint` or :class:`~.unitstr` + The value of the parameter + type: :class:`str` + The parameter's local XML tag name. + ''' + __slots__ = () + + def is_empty(self): + value = self.value + return value == "" or value is None + + +class XML(FileReader): + """Base class for all format-specific XML parsers. The instances can be used + as context managers and as iterators. + """ + # Configurable data + file_format = 'XML' + _root_element = None + _default_schema = {} + _read_schema = False + _default_version = 0 + _default_iter_tag = None + _default_iter_path = None + _structures_to_flatten = [] + _schema_location_param = 'schemaLocation' + _default_id_attr = 'id' + _huge_tree = False + _retrieve_refs_enabled = None # only some subclasses implement this + _iterative = True + + # Configurable plugin logic + _converters = XMLValueConverter.converters() + _element_handlers = {} + + # Must be implemented by subclasses + def _get_info_smart(self, element, **kwargs): + raise NotImplementedError + + def __init__(self, source, read_schema=None, iterative=None, build_id_cache=False, **kwargs): + """Create an XML parser object. + + Parameters + ---------- + source : str or file + File name or file-like object corresponding to an XML file. + read_schema : bool, optional + Defines whether schema file referenced in the file header + should be used to extract information about value conversion. + Default is :py:const:`False`. + iterative : bool, optional + Defines whether an :py:class:`ElementTree` object should be + constructed and stored on the instance or if iterative parsing + should be used instead. Iterative parsing keeps the memory usage + low for large XML files. Default is :py:const:`True`. + build_id_cache : bool, optional + Defines whether a dictionary mapping IDs to XML tree elements + should be built and stored on the instance. It is used in + :py:meth:`XML.get_by_id`, e.g. when using + :py:class:`pyteomics.mzid.MzIdentML` with ``retrieve_refs=True``. + huge_tree : bool, optional + This option is passed to the `lxml` parser and defines whether + security checks for XML tree depth and node size should be disabled. + Default is :py:const:`False`. + Enable this option for trusted files to avoid XMLSyntaxError exceptions + (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`). 
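+
+        Example (a sketch; this base class is normally used through a concrete
+        subclass such as :py:class:`pyteomics.mzml.MzML`, which accepts the same
+        keyword arguments; ``spectra.mzML`` is a hypothetical file)::
+
+            >>> from pyteomics import mzml
+            >>> reader = mzml.MzML('spectra.mzML', read_schema=False, iterative=True)
+            >>> spectrum = next(reader)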
+ """ + + super(XML, self).__init__(source, mode='rb', parser_func=self.iterfind, pass_file=False, + args=(self._default_iter_path or self._default_iter_tag,), kwargs=kwargs) + if iterative is None: + iterative = self._iterative + if iterative: + self._tree = None + else: + self.build_tree() + if build_id_cache: + self.build_id_cache() + else: + self._id_dict = None + + self.version_info = self._get_version_info() + if read_schema is not None: + self._read_schema = read_schema + self.schema_info = self._get_schema_info(read_schema) + + self._converters_items = self._converters.items() + self._huge_tree = kwargs.get('huge_tree', self._huge_tree) + self._retrieve_refs_enabled = kwargs.get('retrieve_refs') + + def __reduce_ex__(self, protocol): + return self.__class__, ( + self._source_init, self._read_schema, self._tree is None, + False, + ), self.__getstate__() + + def __getstate__(self): + state = super(XML, self).__getstate__() + state['_huge_tree'] = self._huge_tree + state['_retrieve_refs_enabled'] = self._retrieve_refs_enabled + state['_id_dict'] = self._id_dict + return state + + def __setstate__(self, state): + super(XML, self).__setstate__(state) + self._huge_tree = state['_huge_tree'] + self._retrieve_refs_enabled = state['_retrieve_refs_enabled'] + self._id_dict = state['_id_dict'] + + @_keepstate + def _get_version_info(self): + """ + Provide version information about the XML file. + + Returns + ------- + out : tuple + A (version, schema URL) tuple, both elements are strings or None. + """ + for _, elem in etree.iterparse( + self._source, events=('start',), remove_comments=True, huge_tree=self._huge_tree): + if _local_name(elem) == self._root_element: + return (elem.attrib.get('version'), + elem.attrib.get(('{{{}}}'.format(elem.nsmap['xsi']) + if 'xsi' in elem.nsmap else '') + self._schema_location_param)) + + @_keepstate + def _get_schema_info(self, read_schema=True): + """Stores defaults for the schema, tries to retrieve the schema for + other versions. 
Keys are: 'floats', 'ints', 'bools', 'lists', + 'intlists', 'floatlists', 'charlists'.""" + if not read_schema: + return self._default_schema + + version, schema = self.version_info + if version == self._default_version: + return self._default_schema + + ret = {} + try: + if not schema: + schema_url = '' + raise PyteomicsError( + 'Schema information not found in {}.'.format(self.name)) + schema_url = schema.split()[-1] + ret = xsd_parser(schema_url) + except Exception as e: + if isinstance(e, (URLError, socket.error, socket.timeout)): + warnings.warn("Can't get the {0.file_format} schema for version " + "`{1}` from <{2}> at the moment.\n" + "Using defaults for {0._default_version}.\n" + "You can disable reading the schema by specifying " + "`read_schema=False`.".format(self, version, schema_url)) + else: + warnings.warn("Unknown {0.file_format} version `{1}`.\n" + "Attempt to use schema " + "information from <{2}> failed.\n" + "Exception information:\n{3}\n" + "Falling back to defaults for {0._default_version}\n" + "NOTE: This is just a warning, probably from a badly-" + "generated XML file.\nYou will still most probably get " + "decent results.\nLook here for suppressing warnings:\n" + "http://docs.python.org/library/warnings.html#" + "temporarily-suppressing-warnings\n" + "You can also disable reading the schema by specifying " + "`read_schema=False`.\n" + "If you think this shouldn't have happened, please " + "report this to\n" + "http://github.com/levitsky/pyteomics/issues\n" + "".format(self, version, schema_url, format_exc())) + ret = self._default_schema + return ret + + def _handle_param(self, element, **kwargs): + """Unpacks cvParam and userParam tags into key-value pairs""" + types = {'int': unitint, 'float': unitfloat, 'string': unitstr} + attribs = element.attrib + unit_info = None + unit_accesssion = None + if 'unitCvRef' in attribs or 'unitName' in attribs: + unit_accesssion = attribs.get('unitAccession') + unit_name = attribs.get('unitName', unit_accesssion) + unit_info = unit_name + accession = attribs.get('accession') + value = attribs.get('value', '') + try: + if attribs.get('type') in types: + value = types[attribs['type']](value, unit_info) + else: + value = unitfloat(value, unit_info) + except ValueError: + value = unitstr(value, unit_info) + + # return {cvstr(attribs['name'], accession, unit_accesssion): value} + return _XMLParam(cvstr(attribs['name'], accession, unit_accesssion), value, _local_name(element)) + + def _handle_referenceable_param_group(self, param_group_ref, **kwargs): + raise NotImplementedError() + return [] + + def _find_immediate_params(self, element, **kwargs): + return element.xpath( + './*[local-name()="cvParam" or local-name()="userParam" or local-name()="UserParam" or local-name()="referenceableParamGroupRef"]') + + def _insert_param(self, info_dict, param): + key = param.name + if key in info_dict: + if isinstance(info_dict[key], list): + info_dict[key].append(param.value) + else: + info_dict[key] = [info_dict[key], param.value] + else: + info_dict[key] = param.value + + def _promote_empty_parameter_to_name(self, info, params): + empty_values = [] + not_empty_values = [] + for param in params: + if param.is_empty(): + empty_values.append(param) + else: + not_empty_values.append(param) + + if len(empty_values) == 1 and 'name' not in info: + info['name'] = empty_values[0].name + return info, not_empty_values + return info, params + + def _get_info(self, element, **kwargs): + """Extract info from element's attributes, possibly recursive. 
+ <cvParam> and <userParam> elements are treated in a special way.""" + try: + name = kwargs.pop('ename') + except KeyError: + name = _local_name(element) + schema_info = self.schema_info + if name in {'cvParam', 'userParam', 'UserParam'}: + return self._handle_param(element, **kwargs) + elif name == "referenceableParamGroupRef": + return self._handle_referenceable_param_group(element, **kwargs) + + info = dict(element.attrib) + # process subelements + params = [] + if kwargs.get('recursive'): + for child in element.iterchildren(): + cname = _local_name(child) + if cname in {'cvParam', 'userParam', 'UserParam'}: + newinfo = self._handle_param(child, **kwargs) + params.append(newinfo) + elif cname == "referenceableParamGroupRef": + params.extend(self._handle_referenceable_param_group(child, **kwargs)) + else: + if cname not in schema_info['lists']: + info[cname] = self._get_info_smart(child, ename=cname, **kwargs) + else: + info.setdefault(cname, []).append( + self._get_info_smart(child, ename=cname, **kwargs)) + else: + # handle the case where we do not want to unpack all children, but + # *Param tags are considered part of the current entity, semantically + for child in self._find_immediate_params(element, **kwargs): + param_or_group = self._handle_param(child, **kwargs) + if isinstance(param_or_group, list): + params.extend(param_or_group) + else: + params.append(param_or_group) + + handler = self._element_handlers.get(name) + if handler is not None: + info, params = handler(self, info, params) + + for param in params: + self._insert_param(info, param) + + # process element text + if element.text: + stext = element.text.strip() + if stext: + if info: + info[name] = stext + else: + return stext + + # convert types + try: + for k, v in info.items(): + for t, a in self._converters_items: + if t in schema_info and (name, k) in schema_info[t]: + info[k] = a(v) + except ValueError as e: + message = 'Error when converting types: {}'.format(e.args) + if not self._read_schema: + message += '\nTry reading the file with read_schema=True' + raise PyteomicsError(message) + + # resolve refs + if kwargs.get('retrieve_refs', self._retrieve_refs_enabled): + self._retrieve_refs(info, **kwargs) + + # flatten the excessive nesting + for k, v in dict(info).items(): + if k in self._structures_to_flatten: + if isinstance(v, list): + for vi in v: + info.update(vi) + else: + info.update(v) + del info[k] + + # another simplification + for k, v in dict(info).items(): + if isinstance(v, dict) and 'name' in v and len(v) == 1: + info[k] = v['name'] + if len(info) == 2 and 'name' in info and ( + 'value' in info or 'values' in info): + name = info.pop('name') + info = {name: info.popitem()[1]} + return info + + @_keepstate + def build_tree(self): + """Build and store the :py:class:`ElementTree` instance + for the underlying file""" + p = etree.XMLParser(remove_comments=True, huge_tree=True) + self._tree = etree.parse(self._source, parser=p) + + def clear_tree(self): + """Remove the saved :py:class:`ElementTree`.""" + self._tree = None + + def _retrieve_refs(self, info, **kwargs): + """Retrieves and embeds the data for each attribute in `info` that + ends in _ref. Removes the id attribute from `info`. + + This implementation is a stub and must be implemented for each specific + subclass. It is only called if :attr:`retrieve_refs` """ + raise NotImplementedError( + ("_retrieve_refs is not implemented for {}. 
" + "Do not use `retrieve_refs=True`.").format( + self.__class__.__name__)) + + def iterfind(self, path, **kwargs): + """Parse the XML and yield info on elements with specified local + name or by specified "XPath". + + Parameters + ---------- + path : str + Element name or XPath-like expression. The path is very close to + full XPath syntax, but local names should be used for all elements in the path. + They will be substituted with local-name() checks, up to the (first) predicate. + The path can be absolute or "free". Please don't specify namespaces. + **kwargs : passed to :py:meth:`self._get_info_smart`. + + Returns + ------- + out : iterator + """ + return Iterfind(self, path, **kwargs) + + @_keepstate + def _iterfind_impl(self, path, **kwargs): + """Parse the XML and yield info on elements with specified local + name or by specified "XPath". + + Parameters + ---------- + path : str + Element name or XPath-like expression. The path is very close to + full XPath syntax, but local names should be used for all elements in the path. + They will be substituted with local-name() checks, up to the (first) predicate. + The path can be absolute or "free". Please don't specify namespaces. + **kwargs : passed to :py:meth:`self._get_info_smart`. + + Returns + ------- + out : iterator + """ + try: + path, tail = re.match(pattern_path, path).groups() + except AttributeError: + raise PyteomicsError('Invalid path: ' + path) + if path[:2] == '//' or path[0] != '/': + absolute = False + if path[:2] == '//': + path = path[2:] + if path[0] == '/' or '//' in path: + raise PyteomicsError("Too many /'s in a row.") + else: + absolute = True + path = path[1:] + nodes = path.rstrip('/').split('/') + if not nodes: + raise PyteomicsError('Invalid path: ' + path) + + if not self._tree: + if tail: + if tail[0] == '[': + tail = '(.)' + tail + else: + raise PyteomicsError('Cannot parse path tail: ' + tail) + xpath = etree.XPath(tail) + localname = nodes[0] + found = False + for ev, elem in etree.iterparse(self, events=('start', 'end'), remove_comments=True, huge_tree=self._huge_tree): + name_lc = _local_name(elem) + if ev == 'start': + if name_lc == localname or localname == '*': + found += 1 + else: + if name_lc == localname or localname == '*': + if (absolute and elem.getparent() is None) or not absolute: + for child in get_rel_path(elem, nodes[1:]): + if tail: + for elem in xpath(child): + info = self._get_info_smart(elem, **kwargs) + yield info + else: + info = self._get_info_smart(child, **kwargs) + yield info + if not localname == '*': + found -= 1 + if not found: + elem.clear() + else: + xpath = ('/' if absolute else '//') + '/'.join( + '*[local-name()="{}"]'.format(node) if node != '*' else '*' for node in nodes ) + tail + for elem in self._tree.xpath(xpath): + info = self._get_info_smart(elem, **kwargs) + yield info + + @_keepstate + def build_id_cache(self): + """Construct a cache for each element in the document, indexed by id + attribute""" + stack = 0 + id_dict = {} + for event, elem in etree.iterparse(self._source, events=('start', 'end'), + remove_comments=True, huge_tree=self._huge_tree): + if event == 'start': + if 'id' in elem.attrib: + stack += 1 + else: + if 'id' in elem.attrib: + stack -= 1 + id_dict[elem.attrib['id']] = elem + elif stack == 0: + elem.clear() + self._id_dict = id_dict + + def clear_id_cache(self): + """Clear the element ID cache""" + self._id_dict = {} + + def _find_by_id_no_reset(self, elem_id, id_key=None): + """ + An almost exact copy of :meth:`get_by_id` with the difference 
+    def _find_by_id_no_reset(self, elem_id, id_key=None):
+        """
+        An almost exact copy of :meth:`get_by_id` with the difference that it does
+        not reset the file reader's position before iterative parsing.
+
+        Parameters
+        ----------
+        elem_id : str
+            The element id to query for
+
+        Returns
+        -------
+        lxml.Element
+        """
+        found = False
+        if id_key is None:
+            id_key = self._default_id_attr
+        for event, elem in etree.iterparse(
+                self._source, events=('start', 'end'), remove_comments=True, huge_tree=self._huge_tree):
+            if event == 'start':
+                if elem.attrib.get(id_key) == elem_id:
+                    found = True
+            else:
+                if elem.attrib.get(id_key) == elem_id:
+                    return elem
+                if not found:
+                    elem.clear()
+        raise KeyError(elem_id)
+
+    @_keepstate
+    def get_by_id(self, elem_id, **kwargs):
+        """Parse the file and return the element with `id` attribute equal
+        to `elem_id`. Raises :py:exc:`KeyError` if no such element is found.
+
+        Parameters
+        ----------
+        elem_id : str
+            The value of the `id` attribute to match.
+
+        Returns
+        -------
+        out : :py:class:`dict`
+        """
+        if not self._id_dict:
+            elem = self._find_by_id_no_reset(elem_id)
+        else:
+            elem = self._id_dict[elem_id]
+        return self._get_info_smart(elem, **kwargs)
+
+
+# XPath emulator tools
+pattern_path = re.compile(r'([\w/*]*)(.*)')
+
+
+def get_rel_path(element, names):
+    if not names:
+        yield element
+    else:
+        for child in element.iterchildren():
+            if names[0] == '*' or _local_name(child) == names[0]:
+                if len(names) == 1:
+                    yield child
+                else:
+                    for gchild in get_rel_path(child, names[1:]):
+                        yield gchild
+
+
+def xpath(tree, path, ns=None):
+    """Return the results of an XPath query with added namespaces.
+    Assumes the ns declaration is on the root element or absent.
+
+    Parameters
+    ----------
+    tree : ElementTree
+    path : str
+    ns : str or None, optional
+    """
+    if hasattr(tree, 'getroot'):
+        root = tree.getroot()
+    else:
+        root = tree
+        while root.getparent() is not None:
+            root = root.getparent()
+    ns = root.nsmap.get(ns)
+
+    def repl(m):
+        s = m.group(1)
+        if not ns:
+            return s
+        if not s:
+            return 'd:'
+        return '/d:'
+    new_path = re.sub(r'(\/|^)(?![\*\/])', repl, path)
+    n_s = ({'d': ns} if ns else None)
+    return tree.xpath(new_path, namespaces=n_s)
+
+
+def _make_version_info(cls):
+    def version_info(source):
+        return cls(source).version_info
+    version_info.__doc__ = """
+    Provide version information about the {0.file_format} file.
+
+    .. note:: This function is provided for backward compatibility only.
+        It simply creates an :py:class:`{0.__name__}` instance
+        and returns its :py:data:`!version_info` attribute.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    Returns
+    -------
+    out : tuple
+        A (version, schema URL) tuple, both elements are strings or None.
+    """.format(cls)
+    return version_info
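A usage sketch for the xpath() helper above, assuming an mzML document whose default namespace is declared on the root element (the file name is hypothetical):

    from lxml import etree

    tree = etree.parse('example.mzML')   # hypothetical file
    spectra = xpath(tree, '//spectrum')  # rewritten internally to '//d:spectrum'
    print(len(spectra))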
+ """ + super(ByteCountingXMLScanner, self).__init__(source, 'rb') + self.indexed_tags = ensure_bytes(indexed_tags) + self.block_size = block_size + + def _chunk_iterator(self): + """ + Read a file in large blocks and chunk up each block into parts + resembling XML tags, yielding each chunk. + + Assumes the file is opened in binary mode. + """ + f = self.file + read_size = self.block_size + delim = b'<' + buff = f.read(read_size) + started_with_delim = buff.startswith(delim) + parts = buff.split(delim) + tail = parts[-1] + front = parts[:-1] + i = 0 + for part in front: + i += 1 + if part == b"": + continue + if i == 1: + if started_with_delim: + yield delim + part + else: + yield part + else: + yield delim + part + running = True + while running: + buff = f.read(read_size) + if not buff: + running = False + buff = tail + else: + buff = tail + buff + parts = buff.split(delim) + tail = parts[-1] + front = parts[:-1] + for part in front: + yield delim + part + + def _generate_offsets(self): + """ + Iterate over the lines of an XML file where each line contains exactly one tag, + tracking the byte count for each line. When a line contains a tag whose name matches + a name in :attr:`indexed_tags`, yield the byte offset, the tag type, and it's attributes. + + Yields + ------ + offset : int + The byte offset of a matched tag's opening line + tag_type : bytes + The type of tag matched + attr_dict : dict + The attributes on the matched tag + """ + i = 0 + packed = b"|".join(self.indexed_tags) + pattern = re.compile((r"^\s*<(%s)\s" % packed.decode()).encode()) + attrs = re.compile(br"(\S+)=[\"']([^\"']*)[\"']") + for line in self._chunk_iterator(): + match = pattern.match(line) + if match: + yield i, match.group(1), dict(attrs.findall(line)) + i += len(line) + + def _entity_sub_cb(self, match): + ent = match.group(1) + return self.entities[ent] + + def replace_entities(self, key): + '''Replace XML entities in a string with their character representation + + Uses the minimal mapping of XML entities pre-defined for all XML documents and + does not attempt to deal with external DTD defined entities. This mapping is found + in :attr:`entities`. + + Parameters + ---------- + key : str + The string to substitute + + Returns + ------- + str + ''' + return self.xml_entity_pattern.sub(self._entity_sub_cb, key) + + @_keepstate + def build_byte_index(self, lookup_id_key_mapping=None): + """ + Builds a byte offset index for one or more types of tags. + + Parameters + ---------- + lookup_id_key_mapping : Mapping, optional + A mapping from tag name to the attribute to look up the identity + for each entity of that type to be extracted. Defaults to 'id' for + each type of tag. 
+    @_keepstate
+    def build_byte_index(self, lookup_id_key_mapping=None):
+        """
+        Builds a byte offset index for one or more types of tags.
+
+        Parameters
+        ----------
+        lookup_id_key_mapping : Mapping, optional
+            A mapping from tag name to the attribute to look up the identity
+            for each entity of that type to be extracted. Defaults to 'id' for
+            each type of tag.
+
+        Returns
+        -------
+        :py:class:`HierarchicalOffsetIndex`
+            Mapping from tag type to dict from identifier to byte offset
+        """
+        if lookup_id_key_mapping is None:
+            lookup_id_key_mapping = {}
+        lookup_id_key_mapping = {ensure_bytes_single(key): ensure_bytes_single(value)
+                                 for key, value in lookup_id_key_mapping.items()}
+
+        for name in self.indexed_tags:
+            bname = ensure_bytes_single(name)
+            lookup_id_key_mapping.setdefault(bname, 'id')
+            lookup_id_key_mapping[bname] = ensure_bytes_single(lookup_id_key_mapping[bname])
+
+        indices = HierarchicalOffsetIndex()
+        g = self._generate_offsets()
+        for offset, offset_type, attrs in g:
+            k = attrs[lookup_id_key_mapping[offset_type]].decode('utf-8')
+            if '&' in k:
+                k = self.replace_entities(k)
+            indices[offset_type.decode('utf-8')][k] = offset
+        return indices
+
+    @classmethod
+    def scan(cls, source, indexed_tags):
+        inst = cls(source, indexed_tags)
+        return inst.build_byte_index()
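A usage sketch for the scanner (the file name and tag choice are hypothetical; tags are given as bytes):

    offsets = ByteCountingXMLScanner.scan('example.mzML', [b'spectrum'])
    # offsets behaves like {'spectrum': {id: byte_offset, ...}}
    for elem_id, offset in list(offsets['spectrum'].items())[:3]:
        print(elem_id, offset)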
+class TagSpecificXMLByteIndex(object):
+    """
+    Encapsulates the construction and querying of a byte offset index
+    for a set of XML tags.
+
+    This type mimics an immutable Mapping.
+
+    Attributes
+    ----------
+    indexed_tags : iterable of bytes
+        The tag names to index, not including a namespace
+    offsets : :py:class:`HierarchicalOffsetIndex`
+        The hierarchy of byte offsets organized ``{"tag_type": {"id": byte_offset}}``
+    indexed_tag_keys : dict(str, str)
+        A mapping from tag name to unique identifier attribute
+
+    Parameters
+    ----------
+    indexed_tags : iterable of bytes
+        The tag names to include in the index
+    """
+    _default_indexed_tags = []
+    _default_keys = {}
+    _scanner_class = ByteCountingXMLScanner
+
+    def __init__(self, source, indexed_tags=None, keys=None):
+        if keys is None:
+            keys = self._default_keys.copy()
+        if indexed_tags is None:
+            indexed_tags = self._default_indexed_tags
+        self.indexed_tags = indexed_tags
+        self.indexed_tag_keys = keys
+        self.source = source
+        self.offsets = HierarchicalOffsetIndex()
+        self.build_index()
+
+    def __getstate__(self):
+        state = {}
+        state['indexed_tags'] = self.indexed_tags
+        state['indexed_tag_keys'] = self.indexed_tag_keys
+        state['offsets'] = self.offsets
+        return state
+
+    def __setstate__(self, state):
+        self.indexed_tags = state['indexed_tags']
+        self.indexed_tag_keys = state['indexed_tag_keys']
+        self.offsets = state['offsets']
+
+    def __getitem__(self, key):
+        return self.offsets[key]
+
+    def build_index(self):
+        """
+        Perform the byte offset index building for :py:attr:`source`.
+
+        Returns
+        -------
+        offsets : :py:class:`HierarchicalOffsetIndex`
+            The built index, also stored in :attr:`offsets`
+        """
+        scanner = self._scanner_class(self.source, self.indexed_tags)
+        self.offsets = scanner.build_byte_index(self.indexed_tag_keys)
+        return self.offsets
+
+    def items(self):
+        return self.offsets.items()
+
+    def keys(self):
+        return self.offsets.keys()
+
+    def __iter__(self):
+        return iter(self.keys())
+
+    def __len__(self):
+        return sum(len(group) for key, group in self.items())
+
+    @classmethod
+    def build(cls, source, indexed_tags=None, keys=None):
+        indexer = cls(source, indexed_tags, keys)
+        return indexer.offsets
+
+
+def ensure_bytes_single(string):
+    if isinstance(string, bytes):
+        return string
+    try:
+        return string.encode('utf-8')
+    except (AttributeError, UnicodeEncodeError):
+        raise PyteomicsError('{!r} could not be encoded'.format(string))
+
+
+def ensure_bytes(strings):
+    if isinstance(strings, basestring):
+        strings = [strings]
+    return [ensure_bytes_single(string) for string in strings]
+
+
+def _flatten_map(hierarchical_map):
+    all_records = []
+    for key, records in hierarchical_map.items():
+        all_records.extend(records.items())
+
+    all_records.sort(key=lambda x: x[1])
+    return OrderedDict(all_records)
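An illustration of _flatten_map: per-tag offset maps are merged into one mapping ordered by byte offset. Here is its behavior on a small literal input:

    from collections import OrderedDict

    hier = {'spectrum': {'s1': 10, 's2': 300}, 'chromatogram': {'TIC': 120}}
    print(_flatten_map(hier))  # OrderedDict([('s1', 10), ('TIC', 120), ('s2', 300)])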
+class IndexedXML(IndexedReaderMixin, XML):
+    """Subclass of :py:class:`XML` which uses an index of byte offsets for some
+    elements for quick random access.
+    """
+    _indexed_tags = set()
+    _indexed_tag_keys = {}
+    _use_index = True
+
+    def __init__(self, source, read_schema=False, iterative=True, build_id_cache=False,
+                 use_index=None, *args, **kwargs):
+        """Create an indexed XML parser object.
+
+        Parameters
+        ----------
+        source : str or file
+            File name or file-like object corresponding to an XML file.
+        read_schema : bool, optional
+            Defines whether the schema file referenced in the file header
+            should be used to extract information about value conversion.
+            Default is :py:const:`False`.
+        iterative : bool, optional
+            Defines whether an :py:class:`ElementTree` object should be
+            constructed and stored on the instance or if iterative parsing
+            should be used instead. Iterative parsing keeps the memory usage
+            low for large XML files. Default is :py:const:`True`.
+        use_index : bool, optional
+            Defines whether an index of byte offsets needs to be created for
+            elements listed in `indexed_tags`.
+            This is useful for random access to spectra in mzML or elements of mzIdentML files,
+            or for iterative parsing of mzIdentML with ``retrieve_refs=True``.
+            If :py:const:`True`, `build_id_cache` is ignored.
+            If :py:const:`False`, the object acts exactly like :py:class:`XML`.
+            Default is :py:const:`True`.
+        indexed_tags : container of bytes, optional
+            If `use_index` is :py:const:`True`, elements listed in this parameter
+            will be indexed. Empty set by default.
+        """
+        tags = kwargs.get('indexed_tags')
+        tag_index_keys = kwargs.get('indexed_tag_keys')
+
+        if tags is not None:
+            self._indexed_tags = tags
+        if tag_index_keys is not None:
+            self._indexed_tag_keys = tag_index_keys
+
+        if use_index is not None:
+            self._use_index = use_index
+
+        if use_index:
+            build_id_cache = False
+            if self._default_iter_path and self._default_iter_path != self._default_iter_tag:
+                warnings.warn('_default_iter_path differs from _default_iter_tag and index is enabled. '
+                    '_default_iter_tag will be used in the index, mind the consequences.')
+        super(IndexedXML, self).__init__(source, read_schema, iterative, build_id_cache, *args, **kwargs)
+
+        self._offset_index = None
+        self._build_index()
+
+    @property
+    def default_index(self):
+        return self._offset_index[self._default_iter_tag]
+
+    def __reduce_ex__(self, protocol):
+        reconstructor, args, state = XML.__reduce_ex__(self, protocol)
+        args = args + (False, )
+        return reconstructor, args, state
+
+    def __getstate__(self):
+        state = super(IndexedXML, self).__getstate__()
+        state['_indexed_tags'] = self._indexed_tags
+        state['_indexed_tag_keys'] = self._indexed_tag_keys
+        state['_use_index'] = self._use_index
+        state['_offset_index'] = self._offset_index
+        return state
+
+    def __setstate__(self, state):
+        super(IndexedXML, self).__setstate__(state)
+        self._indexed_tags = state['_indexed_tags']
+        self._indexed_tag_keys = state['_indexed_tag_keys']
+        self._use_index = state['_use_index']
+        self._offset_index = state['_offset_index']
+
+    @_keepstate
+    def _build_index(self):
+        """
+        Build up a `dict` of `dict` of offsets for elements. Calls
+        :meth:`TagSpecificXMLByteIndex.build` on :attr:`_source` and
+        assigns the return value to :attr:`_offset_index`.
+        """
+        if not self._indexed_tags or not self._use_index:
+            return
+        self._offset_index = TagSpecificXMLByteIndex.build(
+            self._source, self._indexed_tags, self._indexed_tag_keys)
+
+    @_keepstate
+    def _find_by_id_reset(self, elem_id, id_key=None):
+        return self._find_by_id_no_reset(elem_id, id_key=id_key)
+
+    @_keepstate
+    def get_by_id(self, elem_id, id_key=None, element_type=None, **kwargs):
+        """
+        Retrieve the requested entity by its id. If the entity
+        is described in the offset index, it will be retrieved
+        by immediately seeking to the starting position of the entry, otherwise
+        falling back to parsing from the start of the file.
+
+        Parameters
+        ----------
+        elem_id : str
+            The id value of the entity to retrieve.
+        id_key : str, optional
+            The name of the XML attribute to use for lookup.
+            Defaults to :py:attr:`self._default_id_attr`.
+
+        Returns
+        -------
+        dict
+        """
+        try:
+            index = self._offset_index
+            if element_type is None:
+                offset, element_type = index.find_no_type(elem_id)
+            else:
+                offset = index.find(elem_id, element_type)
+            self._source.seek(offset)
+            if id_key is None:
+                id_key = self._indexed_tag_keys.get(element_type)
+            elem = self._find_by_id_no_reset(elem_id, id_key=id_key)
+        except (KeyError, AttributeError, etree.LxmlError):
+            elem = self._find_by_id_reset(elem_id, id_key=id_key)
+        data = self._get_info_smart(elem, **kwargs)
+        return data
+
+    def __contains__(self, key):
+        return key in self._offset_index[self._default_iter_tag]
+
+    def __len__(self):
+        return len(self._offset_index[self._default_iter_tag])
+
+    def iterfind(self, path, **kwargs):
+        """Parse the XML and yield info on elements with specified local
+        name or by specified "XPath".
+
+        Parameters
+        ----------
+        path : str
+            Element name or XPath-like expression. The path is very close to
+            full XPath syntax, but local names should be used for all elements in the path.
+            They will be substituted with local-name() checks, up to the (first) predicate.
+            The path can be absolute or "free". Please don't specify namespaces.
+        **kwargs : passed to :py:meth:`self._get_info_smart`.
+
+        Returns
+        -------
+        out : iterator
+        """
+        if path in self._indexed_tags and self._use_index:
+            return IndexedIterfind(self, path, **kwargs)
+        return Iterfind(self, path, **kwargs)
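A usage sketch for indexed random access through a subclass (pyteomics.mzml.MzML inherits this machinery; the file name and spectrum id below are hypothetical):

    from pyteomics import mzml

    with mzml.MzML('example.mzML') as reader:
        # seeks directly to the indexed byte offset instead of re-parsing
        spectrum = reader.get_by_id('controllerType=0 controllerNumber=1 scan=1')
        print(spectrum['ms level'])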
+class MultiProcessingXML(IndexedXML, TaskMappingMixin):
+    """XML reader that feeds indexes to external processes
+    for parallel parsing and analysis of XML entries."""
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iterable` to use when dealing out work items
+        onto the input IPC queue used by :meth:`map`
+
+        Returns
+        -------
+        :class:`Iterable`
+        """
+        return iter(self._offset_index[self._default_iter_tag])
+
+
+class IndexSavingXML(IndexSavingMixin, IndexedXML):
+    """An extension to the IndexedXML type which
+    adds facilities to read and write the byte offset
+    index externally.
+    """
+    _index_class = HierarchicalOffsetIndex
+
+    def _read_byte_offsets(self):
+        """Read the byte offset index JSON file at :attr:`_byte_offset_filename`
+        and populate :attr:`_offset_index`
+        """
+        with open(self._byte_offset_filename, 'r') as f:
+            index = self._index_class.load(f)
+            if index.schema_version is None:
+                raise TypeError("Legacy Offset Index!")
+            self._offset_index = index
+
+
+class Iterfind(object):
+    def __init__(self, parser, tag_name, **kwargs):
+        self.parser = parser
+        self.tag_name = tag_name
+        self.config = kwargs
+        self._iterator = None
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.tag_name!r}{config})"
+        if self.config:
+            config = ", " + repr(self.config)
+        else:
+            config = ''
+        return template.format(self=self, config=config)
+
+    def __iter__(self):
+        return self
+
+    def _make_iterator(self):
+        return self.parser._iterfind_impl(self.tag_name, **self.config)
+
+    def __next__(self):
+        if self._iterator is None:
+            self._iterator = self._make_iterator()
+        return next(self._iterator)
+
+    def next(self):
+        return self.__next__()
+
+    @property
+    def is_indexed(self):
+        return False
+
+    def reset(self):
+        self._iterator = None
+        self.parser.reset()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.reset()
+
+    def map(self, *args, **kwargs):
+        raise NotImplementedError("This query isn't indexed; it cannot be mapped with multiprocessing")
+
+    def _get_by_index(self, idx):
+        self.reset()
+        value = next(islice(self, idx, idx + 1))
+        return value
+
+    def _get_by_slice(self, slc):
+        self.reset()
+        value = list(islice(self, slc.start, slc.stop, slc.step))
+        return value
+
+    def __getitem__(self, i):
+        if isinstance(i, slice):
+            return self._get_by_slice(i)
+        return self._get_by_index(i)
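Because Iterfind implements __getitem__, a query can be indexed or sliced even without a byte-offset index, at the cost of restarting the iterative parse (a sketch with a hypothetical file):

    from pyteomics import mzml

    reader = mzml.MzML('example.mzML', use_index=False)  # hypothetical file
    query = reader.iterfind('spectrum')
    first = query[0]          # resets, then islices to position 0
    first_ten = query[0:10]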
+class IndexedIterfind(TaskMappingMixin, Iterfind):
+
+    def __init__(self, parser, tag_name, **kwargs):
+        TaskMappingMixin.__init__(self, **kwargs)
+        Iterfind.__init__(self, parser, tag_name, **kwargs)
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iterable` to use when dealing out work items
+        onto the input IPC queue used by :meth:`map`
+
+        Returns
+        -------
+        :class:`Iterable`
+        """
+        return iter(self._index)
+
+    @property
+    def _offset_index(self):
+        return self._index
+
+    @property
+    def _index(self):
+        return self.parser.index[self.tag_name]
+
+    def _get_reader_for_worker_spec(self):
+        return self.parser
+
+    def _yield_from_index(self):
+        for key in self._task_map_iterator():
+            yield self.parser.get_by_id(key, **self.config)
+
+    def _make_iterator(self):
+        if self.is_indexed:
+            return self._yield_from_index()
+        warnings.warn("Non-indexed iterator created from %r" % (self, ))
+        return super(IndexedIterfind, self)._make_iterator()
+
+    @property
+    def is_indexed(self):
+        if hasattr(self.parser, 'index'):
+            if self.parser.index is not None:
+                index = self.parser.index
+                if isinstance(index, HierarchicalOffsetIndex):
+                    return bool(self.tag_name in index and index[self.tag_name])
+        return False
+
+    def _get_by_index(self, idx):
+        index = self._index
+        key = index.from_index(idx)
+        return self.parser.get_by_id(key)
+
+    def _get_by_slice(self, slc):
+        index = self._index
+        keys = index.from_slice(slc)
+        return self.parser.get_by_ids(keys)
+
+    def __len__(self):
+        index = self._index
+        return len(index)
-- 
GitLab