diff --git a/pyteomics/__init__.py b/pyteomics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd278bfdfaf3fe2212d01b66df913df511692aa7
--- /dev/null
+++ b/pyteomics/__init__.py
@@ -0,0 +1,16 @@
+"""
+Copyright 2012 Anton Goloborodko, Lev Levitsky
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+__import__('pkg_resources').declare_namespace(__name__)
diff --git a/pyteomics/_schema_defaults.py b/pyteomics/_schema_defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31701c7d6fc0eb48838478da0831037b3d15c42
--- /dev/null
+++ b/pyteomics/_schema_defaults.py
@@ -0,0 +1,635 @@
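+# Default typing information for the XML readers. Each dictionary maps the
+# converter categories 'ints', 'floats', 'bools', 'intlists', 'floatlists',
+# 'charlists' (and, for some formats, 'duration') to sets of
+# (element, attribute) pairs that should be converted to the corresponding
+# type, while 'lists' holds the names of elements that may occur more than
+# once within their parent. These serve as a fallback when the document's
+# XSD schema cannot be retrieved and parsed.
+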
+_protxml_schema_defaults = {'bools': set(),
+ 'charlists': set(),
+ 'floatlists': set(),
+ 'floats': {('ASAPRatio', 'heavy2light_ratio_mean'),
+  ('ASAPRatio', 'heavy2light_ratio_standard_dev'),
+  ('ASAPRatio', 'ratio_mean'),
+  ('ASAPRatio', 'ratio_standard_dev'),
+  ('ASAPRatio_pvalue', 'adj_ratio_mean'),
+  ('ASAPRatio_pvalue', 'adj_ratio_standard_dev'),
+  ('ASAPRatio_pvalue', 'decimal_pvalue'),
+  ('ASAPRatio_pvalue', 'heavy2light_adj_ratio_mean'),
+  ('ASAPRatio_pvalue', 'heavy2light_adj_ratio_standard_dev'),
+  ('ASAPRatio_pvalue', 'pvalue'),
+  ('ASAP_Peak', 'heavy2light_ratio_mean'),
+  ('ASAP_Peak', 'heavy2light_ratio_standard_dev'),
+  ('ASAP_Peak', 'ratio_mean'),
+  ('ASAP_Peak', 'ratio_standard_dev'),
+  ('ASAP_Peak', 'weight'),
+  ('ASAP_Seq', 'heavy2light_ratio_mean'),
+  ('ASAP_Seq', 'heavy2light_ratio_standard_dev'),
+  ('ASAP_Seq', 'ratio_mean'),
+  ('ASAP_Seq', 'ratio_standard_dev'),
+  ('ASAP_Seq', 'weight'),
+  ('ASAP_prot_analysis_summary', 'min_peptide_probability'),
+  ('ASAP_prot_analysis_summary', 'min_peptide_weight'),
+  ('ASAP_prot_analysis_summary', 'min_protein_probability'),
+  ('ASAP_pvalue_analysis_summary', 'background_fitting_error'),
+  ('ASAP_pvalue_analysis_summary', 'background_ratio_mean'),
+  ('ASAP_pvalue_analysis_summary', 'background_ratio_stdev'),
+  ('StPeterQuant', 'SIn'),
+  ('StPeterQuant', 'ng'),
+  ('StPeterQuant_peptide', 'spectralIndex'),
+  ('StPeter_analysis_summary', 'FDR'),
+  ('StPeter_analysis_summary', 'probability'),
+  ('StPeter_analysis_summary', 'sampleLoad'),
+  ('StPeter_analysis_summary', 'tolerance'),
+  ('XPress_analysis_summary', 'min_peptide_probability'),
+  ('XPress_analysis_summary', 'min_peptide_weight'),
+  ('XPress_analysis_summary', 'min_protein_probability'),
+  ('affected_channel', 'correction'),
+  ('decoy_analysis_summary', 'decoy_ratio'),
+  ('error_point', 'error'),
+  ('error_point', 'min_prob'),
+  ('fpkm_distribution', 'alt_pos_to_neg_ratio'),
+  ('fpkm_distribution', 'fpkm_lower_bound_excl'),
+  ('fpkm_distribution', 'fpkm_lower_bound_incl'),
+  ('fpkm_distribution', 'neg_freq'),
+  ('fpkm_distribution', 'pos_freq'),
+  ('fpkm_distribution', 'pos_to_neg_ratio'),
+  ('fragment_masses', 'mz'),
+  ('indistinguishable_peptide', 'calc_neutral_pep_mass'),
+  ('intensity', 'error'),
+  ('intensity', 'mz'),
+  ('intensity', 'ratio'),
+  ('libra_summary', 'mass_tolerance'),
+  ('libra_summary', 'min_pep_prob'),
+  ('libra_summary', 'min_pep_wt'),
+  ('libra_summary', 'min_prot_prob'),
+  ('ni_distribution', 'alt_pos_to_neg_ratio'),
+  ('ni_distribution', 'neg_freq'),
+  ('ni_distribution', 'ni_lower_bound_excl'),
+  ('ni_distribution', 'ni_lower_bound_incl'),
+  ('ni_distribution', 'pos_freq'),
+  ('ni_distribution', 'pos_to_neg_ratio'),
+  ('nsp_distribution', 'alt_pos_to_neg_ratio'),
+  ('nsp_distribution', 'neg_freq'),
+  ('nsp_distribution', 'nsp_lower_bound_excl'),
+  ('nsp_distribution', 'nsp_lower_bound_incl'),
+  ('nsp_distribution', 'pos_freq'),
+  ('nsp_distribution', 'pos_to_neg_ratio'),
+  ('peptide', 'calc_neutral_pep_mass'),
+  ('peptide', 'exp_sibling_ion_bin'),
+  ('peptide', 'exp_sibling_ion_instances'),
+  ('peptide', 'exp_tot_instances'),
+  ('peptide', 'fpkm_adjusted_probability'),
+  ('peptide', 'initial_probability'),
+  ('peptide', 'max_fpkm'),
+  ('peptide', 'n_sibling_peptides'),
+  ('peptide', 'ni_adjusted_probability'),
+  ('peptide', 'nsp_adjusted_probability'),
+  ('peptide', 'weight'),
+  ('point', 'fdr_pp'),
+  ('point', 'fdr_pp_decoy'),
+  ('point', 'logratio'),
+  ('point', 'model_distr'),
+  ('point', 'num_corr_pp'),
+  ('point', 'num_corr_pp_decoy'),
+  ('point', 'obs_distr'),
+  ('point', 'pp_decoy_uncert'),
+  ('point', 'pp_uncert'),
+  ('point', 'prob_cutoff'),
+  ('protein', 'confidence'),
+  ('protein', 'percent_coverage'),
+  ('protein', 'probability'),
+  ('protein_group', 'probability'),
+  ('protein_summary_data_filter', 'false_positive_error_rate'),
+  ('protein_summary_data_filter', 'min_probability'),
+  ('protein_summary_data_filter', 'predicted_num_correct'),
+  ('protein_summary_data_filter', 'predicted_num_incorrect'),
+  ('protein_summary_data_filter', 'sensitivity'),
+  ('protein_summary_header', 'initial_min_peptide_prob'),
+  ('protein_summary_header', 'min_peptide_probability'),
+  ('protein_summary_header', 'min_peptide_weight'),
+  ('protein_summary_header', 'num_predicted_correct_prots'),
+  ('protein_summary_header', 'total_no_spectrum_ids')},
+ 'intlists': set(),
+ 'ints': {('ASAPRatio', 'ratio_number_peptides'),
+  ('ASAP_Peak', 'datanum'),
+  ('ASAP_Seq', 'datanum'),
+  ('ASAP_pvalue_analysis_summary', 'asap_prot_id'),
+  ('ASAP_pvalue_analysis_summary', 'asapratio_id'),
+  ('StPeterQuant_peptide', 'charge'),
+  ('affected_channel', 'channel'),
+  ('analysis_result', 'id'),
+  ('analysis_summary', 'id'),
+  ('contributing_channel', 'channel'),
+  ('error_point', 'num_corr'),
+  ('error_point', 'num_incorr'),
+  ('fpkm_distribution', 'bin_no'),
+  ('fragment_masses', 'channel'),
+  ('intensity', 'channel'),
+  ('libra_result', 'number'),
+  ('libra_summary', 'centroiding_preference'),
+  ('libra_summary', 'normalization'),
+  ('libra_summary', 'output_type'),
+  ('ni_distribution', 'bin_no'),
+  ('nsp_distribution', 'bin_no'),
+  ('peptide', 'charge'),
+  ('peptide', 'fpkm_bin'),
+  ('peptide', 'n_enzymatic_termini'),
+  ('peptide', 'n_instances'),
+  ('peptide', 'n_sibling_peptides_bin'),
+  ('protein', 'n_indistinguishable_proteins'),
+  ('protein', 'total_number_distinct_peptides'),
+  ('protein', 'total_number_peptides'),
+  ('protein_summary_header', 'num_input_1_spectra'),
+  ('protein_summary_header', 'num_input_2_spectra'),
+  ('protein_summary_header', 'num_input_3_spectra'),
+  ('protein_summary_header', 'num_input_4_spectra'),
+  ('protein_summary_header', 'num_input_5_spectra')},
+ 'lists': {'ASAP_Dta',
+  'ASAP_Peak',
+  'ASAP_Seq',
+  'StPeterQuant_peptide',
+  'affected_channel',
+  'analysis_result',
+  'analysis_summary',
+  'contributing_channel',
+  'error_point',
+  'fpkm_distribution',
+  'fpkm_information',
+  'fragment_masses',
+  'indistinguishable_peptide',
+  'indistinguishable_protein',
+  'intensity',
+  'mod_aminoacid_mass',
+  'modification_info',
+  'ni_distribution',
+  'ni_information',
+  'nsp_distribution',
+  'parameter',
+  'peptide',
+  'peptide_parent_protein',
+  'point',
+  'protein',
+  'protein_group',
+  'protein_summary_data_filter'}}
+
+_mzid_schema_defaults = {'bools': {('Enzyme', 'semiSpecific'),
+      ('Enzymes', 'independent'),
+      ('PeptideEvidence', 'isDecoy'),
+      ('ProteinDetectionHypothesis', 'passThreshold'),
+      ('SearchModification', 'fixedMod'),
+      ('SpectrumIdentificationItem', 'passThreshold')},
+ 'charlists': {('Modification', 'residues'),
+      ('SearchModification', 'residues')},
+ 'floatlists': {('FragmentArray', 'values')},
+ 'floats': {('Modification', 'avgMassDelta'),
+      ('Modification', 'monoisotopicMassDelta'),
+      ('Residue', 'mass'),
+      ('SearchModification', 'massDelta'),
+      ('SpectrumIdentificationItem', 'calculatedMassToCharge'),
+      ('SpectrumIdentificationItem', 'calculatedPI'),
+      ('SpectrumIdentificationItem', 'experimentalMassToCharge'),
+      ('SubstitutionModification', 'avgMassDelta'),
+      ('SubstitutionModification', 'monoisotopicMassDelta')},
+ 'intlists': {('IonType', 'index'), ('MassTable', 'msLevel')},
+ 'ints': {('BibliographicReference', 'year'),
+      ('DBSequence', 'length'),
+      ('Enzyme', 'missedCleavages'),
+      ('IonType', 'charge'),
+      ('Modification', 'location'),
+      ('PeptideEvidence', 'end'),
+      ('PeptideEvidence', 'start'),
+      ('SearchDatabase', 'numDatabaseSequences'),
+      ('SearchDatabase', 'numResidues'),
+      ('SpectrumIdentificationItem', 'chargeState'),
+      ('SpectrumIdentificationItem', 'rank'),
+      ('SpectrumIdentificationList', 'numSequencesSearched'),
+      ('SubstitutionModification', 'location')},
+ 'lists': {'Affiliation',
+    'AmbiguousResidue',
+    'AnalysisSoftware',
+    'BibliographicReference',
+    'ContactRole',
+    'DBSequence',
+    'Enzyme',
+    'Filter',
+    'FragmentArray',
+    'InputSpectra',
+    'InputSpectrumIdentifications',
+    'IonType',
+    'MassTable',
+    'Measure',
+    'Modification',
+    'Peptide',
+    'PeptideEvidence',
+    'PeptideEvidenceRef',
+    'PeptideHypothesis',
+    'ProteinAmbiguityGroup',
+    'ProteinDetectionHypothesis',
+    'Residue',
+    'Sample',
+    'SearchDatabase',
+    'SearchDatabaseRef',
+    'SearchModification',
+    'SourceFile',
+    'SpecificityRules',
+    'SpectraData',
+    'SpectrumIdentification',
+    'SpectrumIdentificationItem',
+    'SpectrumIdentificationItemRef',
+    'SpectrumIdentificationList',
+    'SpectrumIdentificationProtocol',
+    'SpectrumIdentificationResult',
+    'SubSample',
+    'SubstitutionModification',
+    'TranslationTable',
+    'cv',
+    'cvParam'}}
+
+_trafoxml_schema_defaults = {'bools': set(),
+     'charlists': set(),
+     'floatlists': set(),
+     'floats': {('Pair', 'from'), ('Pair', 'to'), ('TrafoXML', 'version')},
+     'intlists': set(),
+     'ints': {('Pairs', 'count')},
+     'lists': {'Pair', 'Param'}}
+
+_featurexml_schema_defaults = {
+ 'ints': {('PeptideHit', 'charge'),
+    # ('PeptideIdentification', 'spectrum_reference'),
+    ('SearchParameters', 'missed_cleavages'),
+    # ('UnassignedPeptideIdentification', 'spectrum_reference'),
+    ('featureList', 'count'),
+    ('quality', 'dim'),
+    ('position', 'dim'),
+    ('feature', 'charge'),
+    ('convexhull', 'nr'),
+  },
+ 'floats': {('PeptideHit', 'score'),
+    ('PeptideIdentification', 'MZ'),
+    ('PeptideIdentification', 'RT'),
+    ('PeptideIdentification', 'significance_threshold'),
+    ('ProteinHit', 'coverage'),
+    ('ProteinHit', 'score'),
+    ('ProteinIdentification', 'significance_threshold'),
+    ('SearchParameters', 'peak_mass_tolerance'),
+    ('SearchParameters', 'precursor_peak_tolerance'),
+    ('UnassignedPeptideIdentification', 'MZ'),
+    ('UnassignedPeptideIdentification', 'RT'),
+    ('UnassignedPeptideIdentification', 'significance_threshold'),
+    ('featureMap', 'version'),
+    ('pt', 'x'),
+    ('pt', 'y'),
+    ('quality', 'quality'),
+    ('position', 'position'),
+    ('feature', 'overallquality'),
+    ('feature', 'intensity'),
+    },
+ 'bools': {('PeptideIdentification', 'higher_score_better'),
+    ('ProteinIdentification', 'higher_score_better'),
+    ('SearchParameters', 'peak_mass_tolerance_ppm'),
+    ('SearchParameters', 'precursor_peak_tolerance_ppm'),
+    ('UnassignedPeptideIdentification', 'higher_score_better')},
+ 'intlists': set(),
+ 'floatlists': set(),
+ 'charlists': set(),
+ 'lists': {'FixedModification',
+    'IdentificationRun',
+    'PeptideHit',
+    'PeptideIdentification',
+    'ProteinHit',
+    'ProteinIdentification',
+    'SearchParameters',
+    'UnassignedPeptideIdentification',
+    'UserParam',
+    'VariableModification',
+    'convexhull',
+    'dataProcessing',
+    'feature',
+    'hposition',
+    'hullpoint',
+    'param',
+    'position',
+    'processingAction',
+    'pt',
+    'quality'}}
+
+_tandem_schema_defaults = {'ints': {
+        ('group', 'z'), ('aa', 'at')} | {('domain', k) for k in [
+            'missed_cleavages', 'start', 'end', 'y_ions', 'b_ions',
+            'a_ions', 'x_ions', 'c_ions', 'z_ions']},
+
+            'floats': {('group', k) for k in [
+                'fI', 'sumI', 'maxI', 'mh', 'expect']} | {
+                   ('domain', k) for k in [
+                       'expect', 'hyperscore', 'b_score', 'y_score',
+                       'a_score', 'x_score', 'c_score', 'z_score',
+                       'nextscore', 'delta', 'mh']} | {
+                   ('protein', 'expect'), ('protein', 'sumI'),
+                   ('aa', 'modified')},
+
+            'bools': set(),
+            'lists': {'group', 'trace', 'attribute', 'protein', 'aa', 'note'},
+            'floatlists': {('values', 'values')},
+            'intlists': set(), 'charlists': set(), 'duration': {('group', 'rt')}}
+
+_mzxml_schema_defaults = {'bools': {('dataProcessing', 'centroided'),
+                                 ('dataProcessing', 'chargeDeconvoluted'),
+                                 ('dataProcessing', 'deisotoped'),
+                                 ('dataProcessing', 'spotIntegration'),
+                                 ('maldi', 'collisionGas'),
+                                 ('scan', 'centroided'),
+                                 ('scan', 'chargeDeconvoluted'),
+                                 ('scan', 'deisotoped')},
+                       'charlists': set(),
+                       'floatlists': set(),
+                       'floats': {('dataProcessing', 'intensityCutoff'),
+                                  ('precursorMz', 'precursorIntensity'),
+                                  ('precursorMz', 'windowWideness'),
+                                  ('precursorMz', 'precursorMz'),
+                                  ('scan', 'basePeakIntensity'),
+                                  ('scan', 'basePeakMz'),
+                                  ('scan', 'cidGasPressure'),
+                                  ('scan', 'collisionEnergy'),
+                                  ('scan', 'compensationVoltage'),
+                                  ('scan', 'endMz'),
+                                  ('scan', 'highMz'),
+                                  ('scan', 'ionisationEnergy'),
+                                  ('scan', 'lowMz'),
+                                  ('scan', 'startMz'),
+                                  ('scan', 'totIonCurrent')},
+                       'duration': {('scan', 'retentionTime')},
+                       'intlists': set(),
+                       'ints': {('msInstrument', 'msInstrumentID'),
+                                ('peaks', 'compressedLen'),
+                                ('precursorMz', 'precursorCharge'),
+                                ('robot', 'deadVolume'),
+                                ('scan', 'msInstrumentID'),
+                                ('scan', 'peaksCount'),
+                                ('scanOrigin', 'num'),
+                                ('scan', 'msLevel')},
+                       'lists': {'dataProcessing',
+                                 'msInstrument',
+                                 'parentFile',
+                                 'peaks',
+                                 'plate',
+                                 'precursorMz',
+                                 'scanOrigin',
+                                 'spot'}}
+
+_mzml_schema_defaults = {'ints': {
+    ('spectrum', 'index'),
+     ('instrumentConfigurationList', 'count'),
+     ('binaryDataArray', 'encodedLength'),
+     ('cvList', 'count'),
+     ('binaryDataArray', 'arrayLength'),
+     ('scanWindowList', 'count'),
+     ('componentList', 'count'),
+     ('sourceFileList', 'count'),
+     ('productList', 'count'),
+     ('referenceableParamGroupList', 'count'),
+     ('scanList', 'count'),
+     ('spectrum', 'defaultArrayLength'),
+     ('dataProcessingList', 'count'),
+     ('sourceFileRefList', 'count'),
+     ('scanSettingsList', 'count'),
+     ('selectedIonList', 'count'),
+     ('chromatogram', 'defaultArrayLength'),
+     ('precursorList', 'count'),
+     ('chromatogram', 'index'),
+     ('processingMethod', 'order'),
+     ('targetList', 'count'),
+     ('sampleList', 'count'),
+     ('softwareList', 'count'),
+     ('binaryDataArrayList', 'count'),
+     ('spectrumList', 'count'),
+     ('chromatogramList', 'count'),
+     ('selectedIon', 'charge state')},
+        'floats': set(),
+        'bools': set(),
+        'lists': {'scan', 'spectrum', 'sample', 'cv', 'dataProcessing',
+            'cvParam', 'source', 'userParam', 'detector', 'product',
+            'referenceableParamGroupRef', 'selectedIon', 'sourceFileRef',
+            'binaryDataArray', 'analyzer', 'scanSettings',
+            'instrumentConfiguration', 'chromatogram', 'target',
+            'processingMethod', 'precursor', 'sourceFile',
+            'referenceableParamGroup', 'contact', 'scanWindow', 'software'},
+        'intlists': set(),
+        'floatlists': set(),
+        'charlists': set()}
+
+_pepxml_schema_defaults = {'ints':
+    {('xpressratio_summary', 'xpress_light'),
+     ('distribution_point', 'obs_5_distr'),
+     ('distribution_point', 'obs_2_distr'),
+     ('enzymatic_search_constraint', 'max_num_internal_cleavages'),
+     ('asapratio_lc_heavypeak', 'right_valley'),
+     ('libra_summary', 'output_type'),
+     ('distribution_point', 'obs_7_distr'),
+     ('spectrum_query', 'index'),
+     ('data_filter', 'number'),
+     ('roc_data_point', 'num_incorr'),
+     ('search_hit', 'num_tol_term'),
+     ('search_hit', 'num_missed_cleavages'),
+     ('asapratio_lc_lightpeak', 'right_valley'),
+     ('libra_summary', 'normalization'),
+     ('specificity', 'min_spacing'),
+     ('database_refresh_timestamp', 'min_num_enz_term'),
+     ('enzymatic_search_constraint', 'min_number_termini'),
+     ('xpressratio_result', 'light_lastscan'),
+     ('distribution_point', 'obs_3_distr'),
+     ('spectrum_query', 'end_scan'),
+     ('analysis_result', 'id'),
+     ('search_database', 'size_in_db_entries'),
+     ('search_hit', 'hit_rank'),
+     ('alternative_protein', 'num_tol_term'),
+     ('search_hit', 'num_tot_proteins'),
+     ('asapratio_summary', 'elution'),
+     ('search_hit', 'tot_num_ions'),
+     ('error_point', 'num_incorr'),
+     ('mixture_model', 'precursor_ion_charge'),
+     ('roc_data_point', 'num_corr'),
+     ('search_hit', 'num_matched_ions'),
+     ('dataset_derivation', 'generation_no'),
+     ('xpressratio_result', 'heavy_firstscan'),
+     ('xpressratio_result', 'heavy_lastscan'),
+     ('error_point', 'num_corr'),
+     ('spectrum_query', 'assumed_charge'),
+     ('analysis_timestamp', 'id'),
+     ('xpressratio_result', 'light_firstscan'),
+     ('distribution_point', 'obs_4_distr'),
+     ('asapratio_lc_heavypeak', 'left_valley'),
+     ('fragment_masses', 'channel'),
+     ('distribution_point', 'obs_6_distr'),
+     ('affected_channel', 'channel'),
+     ('search_result', 'search_id'),
+     ('contributing_channel', 'channel'),
+     ('asapratio_lc_lightpeak', 'left_valley'),
+     ('asapratio_peptide_data', 'area_flag'),
+     ('search_database', 'size_of_residues'),
+     ('asapratio_peptide_data', 'cidIndex'),
+     ('mixture_model', 'num_iterations'),
+     ('mod_aminoacid_mass', 'position'),
+     ('spectrum_query', 'start_scan'),
+     ('asapratio_summary', 'area_flag'),
+     ('mixture_model', 'tot_num_spectra'),
+     ('search_summary', 'search_id'),
+     ('xpressratio_timestamp', 'xpress_light'),
+     ('distribution_point', 'obs_1_distr'),
+     ('intensity', 'channel'),
+     ('asapratio_contribution', 'charge'),
+     ('libra_summary', 'centroiding_preference')},
+    'floats':
+    {('asapratio_contribution', 'error'),
+     ('asapratio_lc_heavypeak', 'area_error'),
+     ('modification_info', 'mod_nterm_mass'),
+     ('distribution_point', 'model_4_neg_distr'),
+     ('distribution_point', 'model_5_pos_distr'),
+     ('spectrum_query', 'precursor_neutral_mass'),
+     ('asapratio_lc_heavypeak', 'time_width'),
+     ('xpressratio_summary', 'masstol'),
+     ('affected_channel', 'correction'),
+     ('distribution_point', 'model_7_neg_distr'),
+     ('error_point', 'error'),
+     ('intensity', 'target_mass'),
+     ('roc_data_point', 'sensitivity'),
+     ('distribution_point', 'model_4_pos_distr'),
+     ('distribution_point', 'model_2_neg_distr'),
+     ('distribution_point', 'model_3_pos_distr'),
+     ('mixture_model', 'prior_probability'),
+     ('roc_data_point', 'error'),
+     ('intensity', 'normalized'),
+     ('modification_info', 'mod_cterm_mass'),
+     ('asapratio_lc_lightpeak', 'area_error'),
+     ('distribution_point', 'fvalue'),
+     ('distribution_point', 'model_1_neg_distr'),
+     ('peptideprophet_summary', 'min_prob'),
+     ('asapratio_result', 'mean'),
+     ('point', 'pos_dens'),
+     ('fragment_masses', 'mz'),
+     ('mod_aminoacid_mass', 'mass'),
+     ('distribution_point', 'model_6_neg_distr'),
+     ('asapratio_lc_lightpeak', 'time_width'),
+     ('asapratio_result', 'heavy2light_error'),
+     ('peptideprophet_result', 'probability'),
+     ('error_point', 'min_prob'),
+     ('peptideprophet_summary', 'est_tot_num_correct'),
+     ('roc_data_point', 'min_prob'),
+     ('asapratio_result', 'heavy2light_mean'),
+     ('distribution_point', 'model_5_neg_distr'),
+     ('mixturemodel', 'neg_bandwidth'),
+     ('asapratio_result', 'error'),
+     ('xpressratio_result', 'light_mass'),
+     ('point', 'neg_dens'),
+     ('asapratio_lc_lightpeak', 'area'),
+     ('distribution_point', 'model_1_pos_distr'),
+     ('xpressratio_result', 'mass_tol'),
+     ('mixturemodel', 'pos_bandwidth'),
+     ('xpressratio_result', 'light_area'),
+     ('asapratio_peptide_data', 'heavy_mass'),
+     ('distribution_point', 'model_2_pos_distr'),
+     ('search_hit', 'calc_neutral_pep_mass'),
+     ('intensity', 'absolute'),
+     ('asapratio_peptide_data', 'light_mass'),
+     ('distribution_point', 'model_3_neg_distr'),
+     ('aminoacid_modification', 'mass'),
+     ('asapratio_lc_heavypeak', 'time'),
+     ('asapratio_lc_lightpeak', 'time'),
+     ('asapratio_lc_lightpeak', 'background'),
+     ('mixture_model', 'est_tot_correct'),
+     ('point', 'value'),
+     ('asapratio_lc_heavypeak', 'background'),
+     ('terminal_modification', 'mass'),
+     ('fragment_masses', 'offset'),
+     ('xpressratio_result', 'heavy_mass'),
+     ('search_hit', 'protein_mw'),
+     ('libra_summary', 'mass_tolerance'),
+     ('spectrum_query', 'retention_time_sec'),
+     ('distribution_point', 'model_7_pos_distr'),
+     ('asapratio_lc_heavypeak', 'area'),
+     ('alternative_protein', 'protein_mw'),
+     ('asapratio_contribution', 'ratio'),
+     ('xpressratio_result', 'heavy_area'),
+     ('distribution_point', 'model_6_pos_distr')},
+    'bools':
+    {('sample_enzyme', 'independent'),
+     ('intensity', 'reject'),
+     ('libra_result', 'is_rejected')},
+    'intlists': set(),
+    'floatlists': set(),
+    'charlists': set(),
+    'lists': {'point', 'aminoacid_modification', 'msms_run_summary',
+            'mixturemodel', 'search_hit', 'mixturemodel_distribution',
+            'sequence_search_constraint', 'specificity', 'alternative_protein',
+            'analysis_result', 'data_filter', 'fragment_masses', 'error_point',
+            'parameter', 'spectrum_query', 'search_result', 'affected_channel',
+            'analysis_summary', 'roc_data_point', 'distribution_point',
+            'search_summary', 'mod_aminoacid_mass', 'search_score', 'intensity',
+            'analysis_timestamp', 'mixture_model', 'terminal_modification',
+            'contributing_channel', 'inputfile'}}
+
+
+_traml_schema_defaults = {'bools': set(),
+ 'charlists': set(),
+ 'floatlists': set(),
+ 'floats': {('Modification', 'averageMassDelta'),
+            ('Modification', 'monoisotopicMassDelta')},
+ 'intlists': set(),
+ 'ints': {('Modification', 'location')},
+ 'lists': {'Compound',
+           'Configuration',
+           'Contact',
+           'Instrument',
+           'IntermediateProduct',
+           'Interpretation',
+           'Modification',
+           'Peptide',
+           'Protein',
+           'ProteinRef',
+           'Publication',
+           'RetentionTime',
+           'RetentionTimeList',
+           'Software',
+           'SourceFile',
+           'Target',
+           'Transition',
+           'ValidationStatus',
+           'cv',
+           'cvParam',
+           'userParam'}}
+
+_idxml_schema_defaults = {
+ 'ints': {('PeptideHit', 'charge'), ('SearchParameters', 'missed_cleavages'),
+ ('PeptideHit', 'NumMatchedMainIons'), ('PeptideHit', 'IsotopeError')},
+ 'floats': {('IdXML', 'version'),
+  ('PeptideHit', 'score'),
+  ('PeptideIdentification', 'MZ'),
+  ('PeptideIdentification', 'RT'),
+  ('PeptideIdentification', 'significance_threshold'),
+  ('PeptideHit', 'MS2IonCurrent'),
+  ('PeptideHit', 'MeanErrorAll'),
+  ('PeptideHit', 'MeanErrorTop7'),
+  ('PeptideHit', 'MeanRelErrorAll'),
+  ('PeptideHit', 'MeanRelErrorTop7'),
+  ('PeptideHit', 'NTermIonCurrentRatio'),
+  ('PeptideHit', 'CTermIonCurrentRatio'),
+  ('PeptideHit', 'StdevErrorAll'),
+  ('PeptideHit', 'StdevErrorTop7'),
+  ('PeptideHit', 'StdevRelErrorAll'),
+  ('PeptideHit', 'StdevRelErrorTop7'),
+  ('PeptideHit', 'ExplainedIonCurrentRatio'),
+  ('ProteinHit', 'coverage'),
+  ('ProteinHit', 'score'),
+  ('ProteinIdentification', 'significance_threshold'),
+  ('SearchParameters', 'peak_mass_tolerance'),
+  ('SearchParameters', 'precursor_peak_tolerance')},
+ 'bools': {('PeptideIdentification', 'higher_score_better'),
+  ('ProteinIdentification', 'higher_score_better'),
+  ('SearchParameters', 'peak_mass_tolerance_ppm'),
+  ('SearchParameters', 'precursor_peak_tolerance_ppm')},
+ 'intlists': set(),
+ 'floatlists': set(),
+ 'charlists': set(),
+ 'lists': {'FixedModification',
+  'IdentificationRun',
+  'PeptideHit',
+  'PeptideIdentification',
+  'ProteinHit',
+  'ProteinIdentification',
+  'SearchParameters',
+  'UserParam',
+  'VariableModification'}}
diff --git a/pyteomics/achrom.py b/pyteomics/achrom.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a3224618136616491571bf51156099e5efeed4
--- /dev/null
+++ b/pyteomics/achrom.py
@@ -0,0 +1,1326 @@
+"""
+achrom - additive model of polypeptide chromatography
+=====================================================
+
+Summary
+-------
+
+The additive model of polypeptide chromatography, or achrom, is the most basic
+model for peptide retention time prediction. The main equation behind
+achrom has the following form:
+
+.. math::
+
+    RT = (1 + m\\,ln N) \\sum_{i=1}^{i=N}{RC_i n_i} + RT_0
+
+
+Here, :math:`RC_i` is the retention coefficient of the amino acid
+residues of the i-th type, :math:`n_i` corresponds to the number of amino acid
+residues of type :math:`i` in the peptide sequence, :math:`N` is the total
+number of amino acid residues in the peptide,
+and :math:`RT_0` is a constant retention time shift.
+
+In order to use achrom, one needs to find the retention
+coefficients using experimentally determined retention times for a training
+set of peptides, i.e. to *calibrate* the model.
+
+Calibration
+-----------
+
+  :py:func:`get_RCs` - find a set of retention coefficients using a
+  given set of peptides with known retention times and a fixed value of
+  the length correction parameter.
+
+  :py:func:`get_RCs_vary_lcp` - find the best length correction parameter
+  and a set of retention coefficients for a given peptide sample.
+
+Retention time calculation
+--------------------------
+
+  :py:func:`calculate_RT` - calculate the retention time of a peptide
+  using a given set of retention coefficients.
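+
+  A minimal end-to-end sketch (with hypothetical peptides and retention times;
+  a real calibration needs a representative training set):
+
+  .. code-block:: python
+
+      from pyteomics import achrom
+
+      sequences = ['PEPTIDE', 'ACDEFGHIK', 'LMNPQRSTV']  # hypothetical data
+      RTs = [15.2, 20.1, 25.3]                           # hypothetical data
+      RC_dict = achrom.get_RCs_vary_lcp(sequences, RTs)  # calibrate
+      predicted = [achrom.calculate_RT(s, RC_dict) for s in sequences]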
+
+Data
+----
+
+  :py:data:`RCs_guo_ph2_0` - a set of retention coefficients (RCs)
+  from [#Guo1]_. Conditions: Synchropak RP-P C18 column (250 x 4.1 mm
+  I.D.), gradient (A = 0.1% aq. TFA, pH 2.0; B = 0.1% TFA in acetonitrile) at
+  1% B/min, flow rate 1 ml/min, 26 °C.
+
+  :py:data:`RCs_guo_ph7_0` - a set of retention coefficients (RCs)
+  from [#Guo1]_. Conditions: Synchropak RP-P C18 column (250 x 4.1 mm
+  I.D.), gradient (A = aq. 10 mM (NH4)2HPO4 - 0.1 M NaClO4, pH 7.0; B
+  = 0.1 M NaClO4 in 60% aq. acetonitrile) at 1.67% B/min, flow rate 1
+  ml/min, 26 °C.
+
+  :py:data:`RCs_meek_ph2_1` - a set of RCs from [#Meek]_. Conditions: Bio-Rad
+  "ODS" column, gradient (A = 0.1 M NaClO4, 0.1% phosphoric acid in
+  water; B = 0.1 M NaClO4, 0.1% phosphoric acid in 60%
+  aq. acetonitrile) at 1.25% B/min, room temperature.
+
+  :py:data:`RCs_meek_ph7_4` - a set of RCs from [#Meek]_. Conditions: Bio-Rad
+  "ODS" column, gradient (A = 0.1 M NaClO4, 5 mM phosphate buffer in
+  water; B = 0.1 M NaClO4, 5 mM phosphate buffer in 60%
+  aq. acetonitrile) at 1.25% B/min, room temperature.
+
+  :py:data:`RCs_browne_tfa` - a set of RCs found in
+  [#Browne]_. Conditions: Waters µBondapak C18 column, gradient (A =
+  0.1% aq. TFA, B = 0.1% TFA in acetonitrile) at 0.33% B/min, flow
+  rate 1.5 ml/min.
+
+  :py:data:`RCs_browne_hfba` - a set of RCs found in
+  [#Browne]_. Conditions: Waters µBondapak C18 column, gradient (A =
+  0.13% aq. HFBA, B = 0.13% HFBA in acetonitrile) at 0.33% B/min, flow
+  rate 1.5 ml/min.
+
+  :py:data:`RCs_palmblad` - a set of RCs from
+  [#Palmblad]_. Conditions: a fused silica column (80-100 x 0.200 mm
+  I.D.) packed in-house with C18 ODS-AQ; solvent A = 0.5% aq. HAc,
+  B = 0.5% HAc in acetonitrile.
+
+  :py:data:`RCs_yoshida` - a set of RCs for normal phase chromatography
+  from [#Yoshida]_. Conditions:
+  TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = 0.1% TFA
+  in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at 0.6%
+  water/min, flow rate 1.0 ml/min, 40 °C.
+
+  :py:data:`RCs_yoshida_lc` - a set of length-corrected RCs for normal phase
+  chromatography. The set was calculated in [#Moskovets]_ for the data from
+  [#Yoshida]_.
+  Conditions:
+  TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = 0.1% TFA
+  in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at 0.6%
+  water/min, flow rate 1.0 ml/min, 40 °C.
+
+  :py:data:`RCs_zubarev` - a set of length-corrected RCs calculated
+  on a dataset used in [#Goloborodko]_.
+  Conditions: Reprosil-Pur C18-AQ column (150 x 0.075 mm I.D.), gradient (A =
+  0.5% AA in water; B = 0.5% AA in ACN-water (90:10)) at
+  0.5% water/min, flow rate 200.0 nl/min, room temperature.
+
+  :py:data:`RCs_gilar_atlantis_ph3_0` - a set of retention coefficients obtained
+  in [#Gilar]_.
+  Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A,
+  gradient (A = water, B = ACN, C = 200 mM ammonium formate):
+  0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C
+  at 0.2 ml/min, temperature 40 °C, pH 3.0.
+
+  :py:data:`RCs_gilar_atlantis_ph4_5` - a set of retention coefficients obtained
+  in [#Gilar]_.
+  Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A,
+  gradient (A = water, B = ACN, C = 200 mM ammonium formate):
+  0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C
+  at 0.2 ml/min, temperature 40 °C, pH 4.5.
+
+  :py:data:`RCs_gilar_atlantis_ph10_0` - a set of retention coefficients
+  obtained in [#Gilar]_.
+  Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A,
+  gradient (A = water, B = ACN, C = 200 mM ammonium formate):
+  0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C
+  at 0.2 ml/min, temperature 40 °C, pH 10.0.
+
+  :py:data:`RCs_gilar_beh` - a set of retention coefficients obtained in
+  [#Gilar]_.
+  Conditions: ACQUITY UPLC BEH HILIC column (150 x 2.1 mm I.D.), 1.7 um, 130 A,
+  Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by
+  titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B:
+  90% ACN, 10% mobile phase A (v:v).
+  Gradient: 90-60% B in 50 min.
+
+  :py:data:`RCs_gilar_beh_amide` - a set of retention coefficients obtained in
+  [#Gilar]_.
+  Conditions: ACQUITY UPLC BEH glycan column (150 x 2.1 mm I.D.), 1.7 um, 130 A,
+  Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by
+  titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B:
+  90% ACN, 10% mobile phase A (v:v).
+  Gradient: 90-60% B in 50 min.
+
+  :py:data:`RCs_gilar_rp` - a set of retention coefficients obtained in
+  [#Gilar]_.
+  Conditions: ACQUITY UPLC BEH C18 column (100 mm x 2.1 mm I.D.), 1.7 um, 130 A.
+  Mobile phase A: 0.02% TFA in water, mobile phase B: 0.018% TFA in ACN.
+  Gradient: 0 to 50% B in 50 min, flow rate 0.2 ml/min, temperature 40 °C,
+  pH 2.6.
+
+  :py:data:`RCs_krokhin_100A_fa` - a set of retention coefficients obtained in
+  [#Krokhin]_.
+  Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% FA), packed with
+  5-um Luna C18(2) (Phenomenex, Torrance, CA), pH=2.0.
+  Both eluents A (2% ACN in water) and B (98% ACN) contained
+  0.1% FA as ion-pairing modifier. 0.33% ACN/min
+  linear gradient (0-30% B).
+
+  :py:data:`RCs_krokhin_100A_tfa` - a set of retention coefficients obtained in
+  [#Krokhin]_.
+  Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% TFA), packed with
+  5-um Luna C18(2) (Phenomenex, Torrance, CA), pH=2.0.
+  Both eluents A (2% ACN in water) and B (98% ACN) contained
+  0.1% TFA as ion-pairing modifier. 0.33% ACN/min
+  linear gradient (0-30% B).
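+
+  Any of these sets can be passed directly to :py:func:`calculate_RT`; for
+  example, with the coefficients of [#Guo1]_ (A: 2.0, E: 1.1, K: -2.1):
+
+  >>> abs(calculate_RT('AEK', RCs_guo_ph2_0) - 1.0) < 1e-6
+  True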
+
+Theory
+------
+
+The additive model of polypeptide chromatography, or the model of
+retention coefficients was the earliest attempt to describe the dependence of
+retention time of a polypeptide in liquid chromatography on its sequence
+[#Meek]_, [#Guo1]_. In this model, each amino acid is assigned a number, or
+a *retention coefficient* (RC) describing its retention properties. The
+retention time (RT) during a gradient elution is then calculated as:
+
+.. math::
+
+    RT = \\sum_{i=1}^{i=N}{RC_i \\cdot n_i} + RT_0,
+
+which is the sum of retention coefficients of all amino acid residues in a
+polypeptide. This equation can also be expressed in terms of linear
+algebra:
+
+.. math::
+
+    RT = \\bar{aa} \\cdot \\bar{RC} + RT_0,
+
+where :math:`\\bar{aa}` is a vector of amino acid composition,
+i.e. :math:`\\bar{aa}_i` is the number of amino acid residues of the i-th
+type in a polypeptide; :math:`\\bar{RC}` is a vector of the respective
+retention coefficients.
+
+In this formulation, it is clear that the additive model gives the same results
+for any two peptides with different sequences but the same amino acid
+composition. In other words, **the additive model is not sequence-specific**.
+
+The additive model has two advantages over all other models of chromatography:
+it is easy to understand and easy to use. The rule behind the additive model
+is as simple as it could be: **each amino acid residue shifts retention time
+by a fixed value, depending only on its type**. This rule allows a geometrical
+interpretation. Each peptide may be represented by a point in 21-dimensional
+space, with the first 20 coordinates equal to the amounts of the corresponding
+amino acid residues in the peptide and the 21st coordinate equal to RT.
+The additive
+model assumes that a line may be drawn through these points. Of course, this
+assumption is valid only partially, and most points would not lie on the
+line. But the line would describe the main trend and could be used to estimate
+retention time for peptides with known amino acid composition.
+
+This best fit line is described by retention coefficients and :math:`RT_0`.
+The procedure of finding these coefficients is called *calibration*. There is
+`an analytical solution to calibration of linear models
+<http://en.wikipedia.org/wiki/Linear_regression>`_, which makes them
+especially useful in real applications.
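+
+As a sketch of that solution (toy numbers; :py:func:`get_RCs` below implements
+the same idea with optional length correction and extra bookkeeping):
+
+.. code-block:: python
+
+    import numpy as np
+
+    # Rows are peptides; columns are counts of 'A' and 'G', plus a column
+    # of ones that absorbs the constant term RT_0.
+    X = np.array([[1., 0., 1.],    # 'A'
+                  [2., 0., 1.],    # 'AA'
+                  [1., 1., 1.]])   # 'AG'
+    RT = np.array([1.0, 2.0, 1.5])
+    RC, residuals, rank, sv = np.linalg.lstsq(X, RT, rcond=None)
+    # RC[:2] are the retention coefficients; RC[2] is RT_0.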
+
+Several attempts have been made to improve the accuracy of prediction by
+the additive model (for a review of the field, we suggest reading [#Baczek]_
+and [#Babushok]_). The two implemented in this module are the logarithmic
+length correction term described in [#MantLogLen]_ and additional sets of
+retention coefficients for terminal amino acid residues [#Tripet]_.
+
+Logarithmic length correction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This enhancement was first described in [#MantLogLen]_. Briefly, it was
+found that the following equation better describes the dependence of RT on the
+peptide sequence:
+
+.. math::
+
+    RT = \\sum_{i=1}^{i=N}{RC_i} + m\\,ln N \\sum_{i=1}^{i=N}{RC_i} + RT_0
+
+We call the second term, :math:`m\\,ln N \\sum_{i=1}^{i=N}{RC_i}`, *the
+length correction term*, and :math:`m` *the length correction parameter*. The
+simplified and vectorized form of this equation is:
+
+.. math::
+
+    RT = (1 + m\\,ln N) \\, \\bar{RC} \\cdot \\bar{aa} + RT_0
+
+For a fixed :math:`m`, this equation reduces to a linear form (each composition
+count is simply scaled by :math:`(1 + m\\,ln N)`) and can be solved by the
+standard linear regression methods, as in the sketch above.
+
+Terminal retention coefficients
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Another significant improvement can be obtained by introducing
+separate sets of retention coefficients for terminal amino acid residues
+[#Tripet]_.
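+
+In this module, this corresponds to the ``term_aa`` argument of
+:py:func:`get_RCs` and :py:func:`get_RCs_vary_lcp`: the returned
+``RC_dict['aa']`` then also contains ``'ntermX'`` / ``'ctermX'`` entries
+(e.g. ``RC_dict['aa']['ntermK']`` for an N-terminal lysine).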
+
+References
+----------
+
+.. [#Meek] Meek, J. L. `Prediction of peptide retention times in high-pressure
+   liquid chromatography on the basis of amino acid composition.
+   <http://www.ncbi.nlm.nih.gov/pubmed/6929513>`_
+   PNAS, 1980, 77 (3), 1632-1636.
+
+.. [#Guo1] Guo, D.; Mant, C. T.; Taneja, A. K.; Parker, J. M. R.; Hodges,
+   R. S.  `Prediction of peptide retention times in reversed-phase
+   high-performance liquid chromatography I. Determination of retention
+   coefficients of amino acid residues of model synthetic peptides.
+   <http://dx.doi.org/10.1016/0021-9673(86)80102-9>`_
+   Journal of Chromatography A, 1986, 359, 499-518.
+
+.. [#Baczek] Baczek, T.; Kaliszan, R. `Predictions of peptides' retention times
+   in reversed-phase liquid chromatography as a new supportive tool to improve
+   protein identification in proteomics.
+   <http://dx.doi.org/10.1002/pmic.200800544>`_
+   Proteomics, 2009, 9 (4), 835-47.
+
+.. [#Babushok] Babushok, V. I.; Zenkevich, I. G. `Retention Characteristics of
+   Peptides in RP-LC: Peptide Retention Prediction.
+   <http://dx.doi.org/10.1365/s10337-010-1721-8>`_
+   Chromatographia, 2010, 72 (9-10), 781-797.
+
+.. [#MantLogLen] Mant, C. T.; Zhou, N. E.; Hodges, R. S. `Correlation of
+   protein retention times in reversed-phase chromatography with polypeptide
+   chain length and hydrophobicity.
+   <http://dx.doi.org/10.1016/S0021-9673(01)93882-8>`_
+   Journal of Chromatography A, 1989, 476, 363-375.
+
+.. [#Tripet] Tripet, B.; Cepeniene, D.; Kovacs, J. M.; Mant, C. T.; Krokhin,
+   O. V.; Hodges, R. S. `Requirements for prediction of peptide retention time
+   in reversed-phase high-performance liquid chromatography:
+   hydrophilicity/hydrophobicity of side-chains at the N- and C-termini of
+   peptides are dramatically affected by the end-groups and location.
+   <http://dx.doi.org/10.1016/j.chroma.2006.12.024>`_
+   Journal of chromatography A, 2007, 1141 (2), 212-25.
+
+.. [#Browne] Browne, C. A.; Bennett, H. P. J.; Solomon, S. `The
+   isolation of peptides by high-performance liquid chromatography
+   using predicted elution positions
+   <http://www.sciencedirect.com/science/article/pii/000326978290238X>`_.
+   Analytical Biochemistry, 1982, 124 (1), 201-208.
+
+.. [#Palmblad] Palmblad, M.; Ramstrom, M.; Markides, K. E.; Hakansson,
+   P.; Bergquist, J. `Prediction of Chromatographic Retention and
+   Protein Identification in Liquid Chromatography/Mass
+   Spectrometry
+   <http://pubs.acs.org/doi/abs/10.1021/ac0256890>`_.
+   Analytical Chemistry, 2002, 74 (22), 5826-5830.
+
+.. [#Yoshida] Yoshida, T. Calculation of peptide retention
+   coefficients in normal-phase liquid chromatography. Journal of
+   Chromatography A, 1998, 808 (1-2), 105-112.
+
+.. [#Moskovets] Moskovets, E.; Goloborodko A. A.; Gorshkov A. V.; Gorshkov M.V.
+   `Limitation of predictive 2-D liquid chromatography in reducing the database
+   search space in shotgun proteomics: In silico studies.
+   <http://dx.doi.org/10.1002/jssc.201100798>`_
+   Journal of Separation Science, 2012, 35 (14), 1771-1778.
+
+.. [#Goloborodko] Goloborodko A. A.; Mayerhofer C.; Zubarev A. R.;
+   Tarasova I. A.; Gorshkov A. V.; Zubarev, R. A.; Gorshkov, M. V.
+   `Empirical approach to false discovery rate
+   estimation in shotgun proteomics. <http://dx.doi.org/10.1002/rcm.4417>`_
+   Rapid communications in mass spectrometry, 2010, 24(4), 454-62.
+
+.. [#Gilar] Gilar, M., & Jaworski, A. (2011). `Retention behavior of peptides in
+    hydrophilic-interaction chromatography.
+    <http://dx.doi.org/10.1016/j.chroma.2011.04.005>`_
+    Journal of chromatography A, 1218(49), 8890-6.
+
+.. [#Krokhin] Dwivedi, R. C.; Spicer, V.; Harder, M.; Antonovici, M.; Ens, W.;
+    Standing, K. G.; Wilkins, J. A.; Krokhin, O. V. (2008). `Practical
+    implementation of 2D HPLC scheme with accurate peptide retention prediction
+    in both dimensions for high-throughput bottom-up proteomics
+    <http://pubs.acs.org/doi/abs/10.1021/ac800984n>`_.
+    Analytical Chemistry, 80(18), 7036-42.
+
+Dependencies
+------------
+
+This module requires :py:mod:`numpy` and, optionally, :py:mod:`scikit-learn`
+(for MAE regression).
+
+--------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import numpy as np
+from .auxiliary import linear_regression, PyteomicsError
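+# scikit-learn is an optional dependency: it is only needed for quantile
+# regression, i.e. the metric='mae' mode of get_RCs.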
+try:
+    from sklearn.linear_model import QuantileRegressor
+except ImportError:
+    QuantileRegressor = None
+
+from . import parser
+
+def get_RCs(sequences, RTs, lcp=-0.21, term_aa=False, metric='mse', **kwargs):
+    """Calculate the retention coefficients of amino acids using
+    retention times of a peptide sample and a fixed value of the length
+    correction parameter.
+
+    Parameters
+    ----------
+    sequences : list of str
+        List of peptide sequences.
+    RTs : list of float
+        List of corresponding retention times.
+    lcp : float, optional
+        A multiplier before the ln(N) term in the equation for the retention
+        time of a peptide. Set to -0.21 by default.
+    term_aa : bool, optional
+        If :py:const:`True`, terminal amino acids are treated as being
+        modified with 'ntermX'/'ctermX' modifications. :py:const:`False`
+        by default.
+    metric : str, optional
+        Metric for the regression problem. Set to "mse" (mean squared
+        error) by default. Alternative: "mae" (mean absolute error),
+        which uses quantile regression.
+
+        .. note ::
+            `"mae"` requires :py:mod:`scikit-learn` for
+            `quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_.
+
+    labels : list of str, optional
+        List of all possible amino acids and terminal groups.
+        If not given, any modX labels are allowed.
+
+    Returns
+    -------
+    RC_dict : dict
+        Dictionary with the calculated retention coefficients.
+
+        - RC_dict['aa'] -- amino acid retention coefficients.
+
+        - RC_dict['const'] -- constant retention time shift.
+
+        - RC_dict['lcp'] -- length correction parameter.
+
+    Examples
+    --------
+    >>> RCs = get_RCs(['A','AA'], [1.0, 2.0], 0.0, labels=['A'])
+    >>> abs(RCs['aa']['A'] - 1) < 1e-6 and abs(RCs['const']) < 1e-6
+    True
+    >>> RCs = get_RCs(['A','AA','B'], [1.0, 2.0, 2.0], 0.0, labels=['A','B'])
+    >>> abs(RCs['aa']['A'] - 1) + abs(RCs['aa']['B'] - 2) + \
+            abs(RCs['const']) < 1e-6
+    True
+    """
+
+    labels = kwargs.get('labels')
+
+    # Make a list of all amino acids present in the sample.
+    peptide_dicts = [
+            parser.amino_acid_composition(peptide, False, term_aa,
+                               allow_unknown_modifications=True,
+                               labels=labels)
+            if not isinstance(peptide, dict) else peptide
+        for peptide in sequences]
+
+    detected_amino_acids = {aa for peptide_dict in peptide_dicts
+                                for aa in peptide_dict}
+
+    # Determine retention coefficients using multidimensional linear
+    # regression.
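+    # Each row of the design matrix holds the residue counts scaled by
+    # (1 + lcp * ln(length)), plus a trailing 1 that absorbs the constant
+    # term RT_0.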
+    composition_array = []
+    for pdict in peptide_dicts:
+        loglen = np.log(parser.length(pdict))
+        composition_array.append([pdict.get(aa, 0.)
+             * (1. + lcp * loglen)
+               for aa in detected_amino_acids] + [1.])
+
+    # Add normalizing conditions for terminal retention coefficients. The
+    # condition we are using here is quite arbitrary. It implies that the sum
+    # of N- or C-terminal RCs minus the sum of corresponding internal RCs must
+    # be equal to zero.
+    if term_aa:
+        for term_label in ['nterm', 'cterm']:
+            normalizing_peptide = []
+            for aa in detected_amino_acids:
+                if aa.startswith(term_label):
+                    normalizing_peptide.append(1.0)
+                elif (term_label+aa) in detected_amino_acids:
+                    normalizing_peptide.append(-1.0)
+                else:
+                    normalizing_peptide.append(0.0)
+            normalizing_peptide.append(0.0)
+            composition_array.append(normalizing_peptide)
+            RTs.append(0.0)
+
+    if metric == 'mse':
+        # Use least-squares linear regression.
+        RCs, _, _, _ = np.linalg.lstsq(np.array(composition_array), np.array(RTs), rcond=None)
+
+    elif metric == 'mae':
+        if QuantileRegressor is None:
+            raise PyteomicsError("`metric='mae'` requires scikit-learn.")
+        # Use Quantile regression.
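+        # With the default quantile of 0.5, this is median regression,
+        # which minimizes the mean absolute error (L1 loss).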
+        QR = QuantileRegressor(fit_intercept=False, alpha=0, solver='highs')
+        QR.fit(np.array(composition_array), np.array(RTs))
+        RCs = QR.coef_
+    else:
+        raise PyteomicsError('Invalid metric "{}". Must be "mse" or "mae".'.format(metric))
+
+    # Remove normalizing elements from the RTs vector.
+    if term_aa:
+        for term_label in ['nterm', 'cterm']:
+            RTs.pop()
+
+    # Form output.
+    RC_dict = {}
+    RC_dict['aa'] = dict(
+        zip(list(detected_amino_acids),
+            RCs[:len(detected_amino_acids)]))
+    RC_dict['aa'][parser.std_nterm] = 0.0
+    RC_dict['aa'][parser.std_cterm] = 0.0
+    RC_dict['const'] = RCs[len(detected_amino_acids)]
+    RC_dict['lcp'] = lcp
+
+    # Find remaining terminal RCs.
+    if term_aa:
+        for term_label in ['nterm', 'cterm']:
+            # Check if there are terminal RCs remaining undefined.
+            undefined_term_RCs = [aa for aa in RC_dict['aa']
+                                if aa[1:5] != 'term'
+                                and term_label + aa not in RC_dict['aa']]
+            if not undefined_term_RCs:
+                continue
+
+            # Find a linear relationship between internal and terminal RCs.
+            defined_term_RCs = [aa for aa in RC_dict['aa']
+                              if aa[1:5] != 'term'
+                              and term_label + aa in RC_dict['aa']]
+
+            a, b, r, stderr = linear_regression(
+                [RC_dict['aa'][aa] for aa in defined_term_RCs],
+                [RC_dict['aa'][term_label+aa] for aa in defined_term_RCs])
+
+            # Define missing terminal RCs using this linear equation.
+            for aa in undefined_term_RCs:
+                RC_dict['aa'][term_label + aa] = a * RC_dict['aa'][aa] + b
+
+    return RC_dict
+
+
+def get_RCs_vary_lcp(sequences, RTs, term_aa=False, lcp_range=(-1.0, 1.0), metric='mse', **kwargs):
+    """Find the best combination of a length correction parameter and
+    retention coefficients for a given peptide sample.
+
+    Parameters
+    ----------
+    sequences : list of str
+        List of peptide sequences.
+    RTs : list of float
+        List of corresponding retention times.
+    term_aa : bool, optional
+        If :py:const:`True`, terminal amino acids are treated as being
+        modified with 'ntermX'/'ctermX' modifications. :py:const:`False`
+        by default.
+    metric : str, optional
+        Metric for the regression problem. Set to "mse" (mean squared
+        error) by default. Alternative: "mae" (mean absolute error).
+
+        .. note ::
+            `"mae"` requires :py:mod:`scikit-learn` for
+            `quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_.
+
+    lcp_range : 2-tuple of float, optional
+        Range of possible values of the length correction parameter.
+    labels : list of str, optional
+        List of labels for all possible amino acids and terminal groups.
+        If not given, any modX labels are allowed.
+    lcp_accuracy : float, optional
+        The accuracy of the length correction parameter calculation.
+        Default is 0.1.
+
+    Returns
+    -------
+    RC_dict : dict
+        Dictionary with the calculated retention coefficients.
+
+        - RC_dict['aa'] -- amino acid retention coefficients.
+
+        - RC_dict['const'] -- constant retention time shift.
+
+        - RC_dict['lcp'] -- length correction parameter.
+
+    Examples
+    --------
+    >>> RCs = get_RCs_vary_lcp(['A', 'AA', 'AAA'], \
+        [1.0, 2.0, 3.0], \
+        labels=['A'])
+    >>> abs(RCs['aa']['A'] - 1) + abs(RCs['lcp']) + abs(RCs['const']) < 1e-6
+    True
+    """
+    labels = kwargs.get('labels')
+
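+    # Start below the lowest possible correlation coefficient (r >= -1).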
+    best_r = -1.1
+    best_RC_dict = {}
+    lcp_accuracy = kwargs.get('lcp_accuracy', 0.1)
+
+    min_lcp = lcp_range[0]
+    max_lcp = lcp_range[1]
+    step = (max_lcp - min_lcp) / 10.0
+    peptide_dicts = [
+            parser.amino_acid_composition(peptide, False, term_aa,
+                                      allow_unknown_modifications=True,
+                                      labels=labels)
+            if not isinstance(peptide, dict) else peptide
+        for peptide in sequences]
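+    # Coarse-to-fine grid search over lcp: evaluate ten values across the
+    # current range, then re-center the range on the best value and shrink
+    # it until the step size drops below lcp_accuracy.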
+    while step > lcp_accuracy:
+        lcp_grid = np.arange(min_lcp, max_lcp,
+                                (max_lcp - min_lcp) / 10.0)
+        for lcp in lcp_grid:
+            RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels, metric=metric)
+            regression_coeffs = linear_regression(
+                RTs,
+                [calculate_RT(peptide, RC_dict) for peptide in peptide_dicts])
+            if regression_coeffs[2] > best_r:
+                best_r = regression_coeffs[2]
+                best_RC_dict = dict(RC_dict)
+        min_lcp = best_RC_dict['lcp'] - step
+        max_lcp = best_RC_dict['lcp'] + step
+        step = (max_lcp - min_lcp) / 10.0
+
+    return best_RC_dict
+
+
+def calculate_RT(peptide, RC_dict, raise_no_mod=True):
+    """Calculate the retention time of a peptide using a given set
+    of retention coefficients.
+
+    Parameters
+    ----------
+    peptide : str or dict
+        A peptide sequence or amino acid composition.
+    RC_dict : dict
+        A set of retention coefficients, length correction parameter and
+        a fixed retention time shift. Keys are: 'aa', 'lcp' and 'const'.
+    raise_no_mod : bool, optional
+        If :py:const:`True` then an exception is raised when a modified amino
+        acid from `peptide` is not found in `RC_dict`. If :py:const:`False`,
+        then the retention coefficient for the non-modified amino acid residue
+        is used instead. :py:const:`True` by default.
+
+    Returns
+    -------
+    RT : float
+        Calculated retention time.
+
+    Examples
+    --------
+    >>> RT = calculate_RT('AA', {'aa': {'A': 1.1}, 'lcp':0.0, 'const': 0.1})
+    >>> abs(RT - 2.3) < 1e-6      # Float comparison
+    True
+    >>> RT = calculate_RT('AAA', {'aa': {'ntermA': 1.0, 'A': 1.1, 'ctermA': 1.2},\
+        'lcp': 0.0, 'const':0.1})
+    >>> abs(RT - 3.4) < 1e-6      # Float comparison
+    True
+    >>> RT = calculate_RT({'A': 3}, {'aa': {'ntermA': 1.0, 'A': 1.1, 'ctermA': 1.2},\
+        'lcp': 0.0, 'const':0.1})
+    >>> abs(RT - 3.4) < 1e-6      # Float comparison
+    True
+    """
+
+    amino_acids = [aa for aa in RC_dict['aa']
+                   if not (aa[:5] == 'nterm' or aa[:5] == 'cterm')]
+
+    # Check if there are retention coefficients for terminal amino acids.
+    term_aa = False
+    for aa in RC_dict['aa']:
+        if aa[:5] == 'nterm' or aa[:5] == 'cterm':
+            term_aa = True
+            break
+
+    # Calculate retention time.
+    if isinstance(peptide, dict):
+        peptide_dict = peptide
+    else:
+        peptide_dict = parser.amino_acid_composition(peptide, False, term_aa,
+            allow_unknown_modifications=True, labels=amino_acids)
+    RT = 0.0
+    for aa in peptide_dict:
+        if aa not in RC_dict['aa']:
+            if len(aa) == 1:
+                raise PyteomicsError('No RC for residue "{}".'.format(aa))
+            if (not raise_no_mod) and aa[-1] in RC_dict['aa']:
+                RT += peptide_dict[aa] * RC_dict['aa'][aa[-1]]
+            else:
+                raise PyteomicsError(
+                    'Residue "{0}" not found in RC_dict. '.format(aa) +
+                    'Set raise_no_mod=False to ignore this error ' +
+                    'and use the RC for "{0}" instead.'.format(aa[-1]))
+        else:
+            RT += peptide_dict[aa] * RC_dict['aa'][aa]
+
+    length_correction_term = (
+        1.0 + RC_dict.get('lcp', 0) * np.log(parser.length(peptide_dict)))
+    RT *= length_correction_term
+
+    RT += RC_dict.get('const', 0)
+
+    return RT
+
+RCs_guo_ph2_0 = {'aa':{'K': -2.1,
+                       'G': -0.2,
+                       'L':  8.1,
+                       'A':  2.0,
+                       'C':  2.6,
+                       'E':  1.1,
+                       'D':  0.2,
+                       'F':  8.1,
+                       'I':  7.4,
+                       'H': -2.1,
+                       'M':  5.5,
+                       'N': -0.6,
+                       'Q':  0.0,
+                       'P':  2.0,
+                       'S': -0.2,
+                       'R': -0.6,
+                       'T':  0.6,
+                       'W':  8.8,
+                       'V':  5.0,
+                       'Y':  4.5,
+                       'H-': 0.0,
+                       '-OH':0.0},
+                 'lcp': 0.0,
+                 'const': 0.0}
+"""A set of retention coefficients from Guo, D.; Mant, C. T.; Taneja,
+A. K.; Parker, J. M. R.; Hodges, R. S.  Prediction of peptide
+retention times in reversed-phase high-performance liquid
+chromatography I. Determination of retention coefficients of amino
+acid residues of model synthetic peptides. Journal of Chromatography
+A, 1986, 359, 499-518.
+
+Conditions: Synchropak RP-P C18 column (250 x 4.1 mm I.D.), gradient
+(A = 0.1% aq. TFA, pH 2.0; B = 0.1% TFA in acetonitrile) at 1% B/min,
+flow rate 1 ml/min, 26 °C.
+"""
+
+RCs_guo_ph7_0 = {'aa':{'K': -0.2,
+                       'G': -0.2,
+                       'L':  9.0,
+                       'A':  2.2,
+                       'C':  2.6,
+                       'E': -1.3,
+                       'D': -2.6,
+                       'F':  9.0,
+                       'I':  8.3,
+                       'H':  2.2,
+                       'M':  6.0,
+                       'N': -0.8,
+                       'Q':  0.0,
+                       'P':  2.2,
+                       'S': -0.5,
+                       'R':  0.9,
+                       'T':  0.3,
+                       'W':  9.5,
+                       'V':  5.7,
+                       'Y':  4.6,
+                       'H-': 0.0,
+                       '-OH':0.0},
+                 'lcp': 0.0,
+                 'const': 0.0}
+"""A set of retention coefficients from Guo, D.; Mant, C. T.; Taneja,
+A. K.; Parker, J. M. R.; Hodges, R. S.  Prediction of peptide
+retention times in reversed-phase high-performance liquid
+chromatography I. Determination of retention coefficients of amino
+acid residues of model synthetic peptides. Journal of Chromatography
+A, 1986, 359, 499-518.
+
+Conditions: Synchropak RP-P C18 column (250 x 4.1 mm I.D.), gradient
+(A = aq. 10 mM (NH4)2HPO4 - 0.1 M NaClO4, pH 7.0; B = 0.1 M NaClO4 in
+60% aq. acetonitrile) at 1.67% B/min, flow rate 1 ml/min, 26 °C.
+"""
+
+RCs_meek_ph2_1 = {'aa':{'K': -3.2,
+                        'G': -0.5,
+                        'L': 10.0,
+                        'A': -0.1,
+                        'C': -2.2,
+                        'E': -7.5,
+                        'D': -2.8,
+                        'F': 13.9,
+                        'I': 11.8,
+                        'H':  0.8,
+                        'M':  7.1,
+                        'N': -1.6,
+                        'Q': -2.5,
+                        'P':  8.0,
+                        'S': -3.7,
+                        'R': -4.5,
+                        'T':  1.5,
+                        'W': 18.1,
+                        'V':  3.3,
+                        'Y':  8.2,
+                        'H-': 0.0,
+                        '-OH':0.0},
+                  'lcp': 0.0,
+                  'const': 0.0}
+"""A set of retention coefficients determined in Meek,
+J. L. Prediction of peptide retention times in high-pressure liquid
+chromatography on the basis of amino acid composition. PNAS, 1980, 77
+(3), 1632-1636.
+
+.. note:: C stands for Cystine.
+
+Conditions: Bio-Rad "ODS" column, gradient (A = 0.1 M NaClO4,
+0.1% phosphoric acid in water; B = 0.1 M NaClO4, 0.1% phosphoric acid
+in 60% aq. acetonitrile) at 1.25% B/min, room temperature.
+"""
+
+RCs_meek_ph7_4 = {'aa':{'K':  0.1,
+                        'G':  0.0,
+                        'L':  8.8,
+                        'A':  0.5,
+                        'C': -6.8,
+                        'E':-16.9,
+                        'D': -8.2,
+                        'F': 13.2,
+                        'I': 13.9,
+                        'H': -3.5,
+                        'M':  4.8,
+                        'N':  0.8,
+                        'Q': -4.8,
+                        'P':  6.1,
+                        'S':  1.2,
+                        'R':  0.8,
+                        'T':  2.7,
+                        'W': 14.9,
+                        'V':  2.7,
+                        'Y':  6.1,
+                        'H-': 0.0,
+                        '-OH':0.0},
+                  'lcp': 0.0,
+                  'const': 0.0}
+"""A set of retention coefficients determined in Meek,
+J. L. Prediction of peptide retention times in high-pressure liquid
+chromatography on the basis of amino acid composition. PNAS, 1980, 77
+(3), 1632-1636.
+
+.. note:: C stands for Cystine.
+
+Conditions: Bio-Rad "ODS" column, gradient (A = 0.1 M NaClO4,
+5 mM phosphate buffer in water; B = 0.1 M NaClO4, 5 mM phosphate buffer
+in 60% aq. acetonitrile) at 1.25% B/min, room temperature.
+"""
+
+RCs_browne_tfa = {'aa':{'K': -3.7,
+                        'G': -1.2,
+                        'L': 20.0,
+                        'A':  7.3,
+                        'C': -9.2,
+                        'E': -7.1,
+                        'D': -2.9,
+                        'F': 19.2,
+                        'I':  6.6,
+                        'H': -2.1,
+                        'M':  5.6,
+                        'N': -5.7,
+                        'Q': -0.3,
+                        'P':  5.1,
+                        'S': -4.1,
+                        'pS':-6.5,
+                        'R': -3.6,
+                        'T':  0.8,
+                        'pT':-1.6,
+                        'W': 16.3,
+                        'V':  3.5,
+                        'Y':  5.9,
+                        'pY': 3.5,
+                        'H-': 0.0,
+                        '-OH':0.0},
+                  'lcp': 0.0,
+                  'const': 0.0}
+"""A set of retention coefficients determined in Browne, C. A.;
+Bennett, H. P. J.; Solomon, S. The isolation of peptides by
+high-performance liquid chromatography using predicted elution
+positions. Analytical Biochemistry, 1982, 124 (1), 201-208.
+
+Conditions: Waters μBondapak C18 column, gradient (A = 0.1% aq. TFA,
+B = 0.1% TFA in acetonitrile) at 0.33% B/min, flow rate 1.5 ml/min.
+"""
+
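+# Note: this set and RCs_browne_hfba below include modX coefficients for
+# phosphorylated residues ('pS', 'pT', 'pY'), so phosphopeptides written in
+# modX notation (e.g. the illustrative 'GGpSFLK') can be scored directly.
+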
+RCs_browne_hfba = {'aa':{'K': -2.5,
+                         'G': -2.3,
+                         'L': 15.0,
+                         'A':  3.9,
+                         'C':-14.3,
+                         'E': -7.5,
+                         'D': -2.8,
+                         'F': 14.7,
+                         'I': 11.0,
+                         'H':  2.0,
+                         'M':  4.1,
+                         'N': -2.8,
+                         'Q':  1.8,
+                         'P':  5.6,
+                         'S': -3.5,
+                         'pS':-7.6,
+                         'R':  3.2,
+                         'T':  1.1,
+                         'pT':-3.0,
+                         'W': 17.8,
+                         'V':  2.1,
+                         'Y':  3.8,
+                         'pY':-0.3,
+                         'H-': 0.0,
+                         '-OH':0.0},
+                   'lcp': 0.0,
+                   'const': 0.0}
+"""A set of retention coefficients determined in Browne, C. A.;
+Bennett, H. P. J.; Solomon, S. The isolation of peptides by
+high-performance liquid chromatography using predicted elution
+positions. Analytical Biochemistry, 1982, 124 (1), 201-208.
+
+Conditions: Waters μBondapak C18 column, gradient (A = 0.13% aq. HFBA,
+B = 0.13% HFBA in acetonitrile) at 0.33% B/min, flow rate 1.5 ml/min.
+"""
+
+RCs_palmblad = {'aa':{'K': -0.66,
+                      'G': -0.29,
+                      'L':  2.28,
+                      'A':  0.41,
+                      'C': -1.32,
+                      'E': -0.26,
+                      'D':  0.04,
+                      'F':  2.68,
+                      'I':  2.70,
+                      'H':  0.57,
+                      'M':  0.98,
+                      'N': -0.54,
+                      'Q':  1.02,
+                      'P':  0.97,
+                      'S': -0.71,
+                      'R': -0.76,
+                      'T':  0.37,
+                      'W':  4.68,
+                      'V':  2.44,
+                      'Y':  2.78,
+                      'H-': 0.0,
+                      '-OH':0.0},
+                'lcp': 0.0,
+                'const': 0.0}
+"""A set of retention coefficients determined in Palmblad, M.;
+Ramström, M.; Markides, K. E.; Håkansson, P.; Bergquist, J. Prediction
+of Chromatographic Retention and Protein Identification in Liquid
+Chromatography/Mass Spectrometry. Analytical Chemistry, 2002, 74 (22),
+5826-5830.
+
+Conditions: a fused silica column (80-100 x 0.200 mm I.D.) packed
+in-house with C18 ODS-AQ; solvent A = 0.5% aq. HAc, B = 0.5% HAc in
+acetonitrile.
+"""
+
+RCs_yoshida = {'aa':{'K':  2.77,
+                     'G': -0.16,
+                     'L': -2.31,
+                     'A':  0.28,
+                     'C':  0.80,
+                  'camC':  0.80,
+                     'E':  1.58,
+                     'D':  2.45,
+                     'F': -2.94,
+                     'I': -1.34,
+                     'H':  3.44,
+                     'M': -0.14,
+                     'N':  3.25,
+                     'Q':  2.35,
+                     'P':  0.77,
+                     'S':  2.53,
+                     'R':  3.90,
+                     'T':  1.73,
+                     'W': -1.80,
+                     'V': -2.19,
+                     'Y': -0.11,
+                     'H-': 0.0,
+                     '-OH':0.0},
+               'lcp': 0.0,
+               'const': 0.0}
+"""A set of retention coefficients determined in Yoshida,
+T. Calculation of peptide retention coefficients in normal-phase
+liquid chromatography. Journal of Chromatography A, 1998, 808 (1-2),
+105-112.
+
+.. note::  Cysteine is Carboxymethylated.
+
+Conditions: TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A =
+0.1% TFA in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at
+0.6% water/min, flow rate 1.0 ml/min, 40 °C.
+"""
+
+RCs_yoshida_lc = {'aa': {'A': 1.29,
+                         'C': 0.94,
+                      'camC': 0.94,
+                         'D': 3.89,
+                         'E': 4.40,
+                         'F': -4.18,
+                         'G': 1.29,
+                         'H': 7.57,
+                         'I': -2.65,
+                         'K': 7.33,
+                         'L': -3.93,
+                         'M': -1.48,
+                         'N': 6.65,
+                         'P': 1.03,
+                         'Q': 6.68,
+                         'R': 7.08,
+                         'S': 5.09,
+                         'T': 3.46,
+                         'V': -2.52,
+                         'W': -1.87,
+                         'Y': -0.46,
+                         'H-': 0.0,
+                         '-OH': 0.0},
+                  'const': 0.0,
+                  'lcp': -0.2}
+"""A set of retention coefficients from the length-corrected model
+of normal-phase peptide chromatography. The dataset comes from Yoshida, T.
+Calculation of peptide retention coefficients in normal-phase
+liquid chromatography. Journal of Chromatography A, 1998, 808 (1-2),
+105-112. The RCs were calculated in Moskovets, E.; Goloborodko, A. A.;
+Gorshkov, A. V.; Gorshkov, M. V. Limitation of predictive 2-D liquid
+chromatography in reducing the database search space in shotgun proteomics:
+In silico studies. Journal of Separation Science, 2012, 35 (14), 1771-1778.
+
+.. note::  Cysteine is Carboxymethylated.
+
+Conditions: TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A =
+0.1% TFA in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at
+0.6% water/min, flow rate 1.0 ml/min, 40 °C.
+"""
+
+RCs_zubarev = {'aa': {'A': 6.73,
+                      'E': 5.66,
+                      'C': 3.25,
+                      'D': 5.64,
+                      'G': 2.35,
+                      'F': 27.43,
+                      'I': 20.50,
+                      'H': -0.66,
+                      'K': -4.47,
+                      'M': 17.39,
+                      'L': 23.38,
+                      'N': 2.57,
+                      'Q': 2.93,
+                      'P': 5.66,
+                      'S': 3.58,
+                      'R': -2.55,
+                      'T': 4.88,
+                      'Y': 13.22,
+                      'W': 31.27,
+                      'V': 13.05,
+                   'camC': 3.25,
+                    'oxM': -7.61,
+                    '-OH': 0.0,
+                     'H-': 0.0},
+            'const': 0.53,
+              'lcp': -0.21}
+"""A set of retention coefficients from the length-corrected model
+of reversed-phase peptide chromatography. The dataset was taken from
+Goloborodko, A. A.; Mayerhofer, C.; Zubarev, A. R.; Tarasova, I. A.;
+Gorshkov, A. V.; Zubarev, R. A.; Gorshkov, M. V. Empirical approach to false
+discovery rate estimation in shotgun proteomics. Rapid Communications in Mass
+Spectrometry, 2010, 24 (4), 454-462.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: Reprosil-Pur C18-AQ column (150 x 0.075 mm I.D.), gradient (A =
+0.5% AA in water; B = 0.5% AA in ACN-water (90:10)) at
+0.5% water/min, flow rate 200.0 nl/min, room temperature.
+"""
+
+RCs_gilar_atlantis_ph3_0 = {'aa': {'K': 15.90,
+    'R': 13.64,
+    'H': 12.94,
+    'E': 2.97,
+    'P': 4.77,
+    'Q': 5.43,
+    'D': 3.20,
+   'C*': 4.87,
+    'C': 4.87,
+    'N': 3.91,
+    'A': 3.34,
+    'G': 3.33,
+    'S': 3.04,
+    'T': 2.71,
+    'V': 1.75,
+    'I': 0.65,
+    'M': 1.13,
+    'L': 0.13,
+    'F': -1.17,
+    'Y': -0.22,
+    'W': -2.47},
+        'lcp': 0.0,
+        'const': 21.33}
+"""A set of retention coefficients for normal phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of Chromatography A, 1218 (49),
+8890-8896.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A,
+gradient (A = water, B = ACN, C = 200 mM ammonium formate):
+0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C
+at 0.2 ml/min, temperature 40 °C, pH 3.0"""
+
+RCs_gilar_atlantis_ph4_5 = {'aa': {'K': 15.49,
+    'R': 13.33,
+    'H': 12.19,
+    'E': 6.93,
+    'P': 5.89,
+    'Q': 5.68,
+    'D': 5.31,
+   'C*': 5.23,
+    'C': 5.23,
+    'N': 4.07,
+    'A': 3.6,
+    'G': 3.46,
+    'S': 2.62,
+    'T': 2.33,
+    'V': 1.42,
+    'I': 0.84,
+    'M': 0.34,
+    'L': 0.29,
+    'F': -1.21,
+    'Y': -1.62,
+    'W': -2.08},
+        'lcp': 0.0,
+        'const': 23.95}
+"""A set of retention coefficients for normal phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of Chromatography A, 1218 (49),
+8890-8896.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A,
+gradient (A = water, B = ACN, C = 200 mM ammonium formate):
+0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C
+at 0.2 ml/min, temperature 40 °C, pH 4.5"""
+
+RCs_gilar_atlantis_ph10_0 = {'aa': {'K': 25.23,
+    'R': 23.38,
+    'H': 5.94,
+    'E': 0.59,
+    'P': 4.00,
+    'Q': 3.53,
+    'D': -0.84,
+   'C*': 3.52,
+    'C': 3.52,
+    'N': 3.26,
+    'A': 3.64,
+    'G': 3.02,
+    'S': 2.28,
+    'T': 1.74,
+    'V': 1.05,
+    'I': 1.51,
+    'M': -0.61,
+    'L': 0.25,
+    'F': -0.17,
+    'Y': -0.79,
+    'W': 0.23},
+        'lcp': 0.0,
+        'const': 13.78}
+"""A set of retention coefficients for normal phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of Chromatography A, 1218 (49),
+8890-8896.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A,
+gradient (A = water, B = ACN, C = 200 mM ammonium formate):
+0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C
+at 0.2 ml/min, temperature 40 °C, pH 10.0"""
+
+RCs_gilar_beh = {'aa': {'K': 9.49,
+    'R': 8.56,
+    'H': 8.40,
+    'E': 5.95,
+    'P': 4.73,
+    'Q': 4.65,
+    'D': 4.97,
+    'C': 3.47,
+   'C*': 3.47,
+    'N': 3.50,
+    'A': 2.90,
+    'G': 2.63,
+    'S': 2.14,
+    'T': 2.19,
+    'V': 1.71,
+    'I': 1.30,
+    'M': 1.40,
+    'L': 0.73,
+    'F': -0.09,
+    'Y': -0.40,
+    'W': 0.11},
+        'lcp': 0.0,
+        'const': 18.41}
+"""A set of retention coefficients for normal phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of Chromatography A, 1218 (49),
+8890-8896.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: ACQUITY UPLC BEH HILIC column (150 x 2.1 mm I.D.), 1.7 um, 130 A,
+Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by
+titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B:
+90% ACN, 10% mobile phase A (v:v).
+Gradient: 90-60% B in 50 min."""
+
+RCs_gilar_beh_amide = {'aa': {'K': 7.19,
+    'R': 6.68,
+    'H': 6.16,
+    'E': 6.11,
+    'P': 3.18,
+    'Q': 5.19,
+    'D': 6.02,
+   'C*': 3.71,
+    'C': 3.71,
+    'N': 4.16,
+    'A': 2.64,
+    'G': 3.12,
+    'S': 3.17,
+    'T': 3.41,
+    'V': 0.83,
+    'I': -0.69,
+    'M': -0.12,
+    'L': -1.24,
+    'F': -1.93,
+    'Y': 0.46,
+    'W': -2.11},
+        'lcp': 0.0,
+        'const': 24.26}
+"""A set of retention coefficients for normal phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of Chromatography A, 1218 (49),
+8890-8896.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: ACQUITY UPLC BEH glycan column (150 x 2.1 mm I.D.), 1.7 um, 130 A,
+Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by
+titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B:
+90% ACN, 10% mobile phase A (v:v).
+Gradient: 90-60% B in 50 min."""
+
+RCs_gilar_rp = {'aa': {'K': -1.015,
+    'R': -0.681,
+    'H': -1.937,
+    'E': 1.475,
+    'P': 3.496,
+    'Q': 1.228,
+    'D': 1.326,
+    'C': 1.832,
+   'C*': 1.832,
+    'N': 0.299,
+    'A': 2.322,
+    'G': 1.172,
+    'S': 1.165,
+    'T': 1.894,
+    'V': 5.695,
+    'I': 8.343,
+    'M': 5.128,
+    'L': 9.069,
+    'F': 10.877,
+    'Y': 5.603,
+    'W': 12.183},
+        'lcp': 0.0,
+        'const': -3.696}
+"""A set of retention coefficients for normal phase chromatography obtained in
+Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in
+hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49),
+8890-6.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: ACQUITY UPLC BEH C18 column (100 mm x 2.1 mm I.D.), 1.7 um, 130 A.
+Mobile phase A: 0.02% TFA in water, mobile phase B: 0.018% TFA in ACN.
+Gradient: 0 to 50% B in 50 min, flow rate 0.2 ml/min, temperature 40 °C, pH 2.6.
+"""
+
+RCs_krokhin_100A_fa = {'aa':{'K': -5.08,
+                       'G': -0.07,
+                       'L':  9.89,
+                       'A':  1.63,
+                       'C':  0.7,
+                    'camC':  0.7,
+                       'E':  1.75,
+                       'D':  0.95,
+                       'F':  11.92,
+                       'I':  9.06,
+                       'H': -5.05,
+                       'M':  6.96,
+                       'N': -0.59,
+                       'Q':  0.2,
+                       'P':  1.98,
+                       'S':  0.27,
+                       'R': -3.55,
+                       'T':  1.37,
+                       'W':  13.67,
+                       'V':  5.72,
+                       'Y':  5.97},
+                 'lcp': 0.0,
+                 'const': 0.0}
+"""A set of retention coefficients from R.C. Dwivedi, V. Spicer,
+M. Harder, M. Antonovici, W. Ens, K.G. Standing, J.A. Wilkins, and O.V. Krokhin;
+Analytical Chemistry 2008 80 (18), 7036-7042.
+Practical Implementation of 2D HPLC Scheme with Accurate Peptide
+Retention Prediction in Both Dimensions for High-Throughput Bottom-Up Proteomics.
+
+.. note::  Cysteine is Carbamidomethylated.
+
+Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% FA), packed with
+5-um Luna C18(2) (Phenomenex, Torrance, CA), pore size 100 A, pH=2.0.
+Both eluents A (2% ACN in water) and B (98% ACN) contained
+0.1% FA as ion-pairing modifier. 0.33% ACN/min
+linear gradient (0-30% B).
+"""
+
+RCs_krokhin_100A_tfa = {'aa':{'K': -3.53,
+                       'G': -0.35,
+                       'L':  9.44,
+                       'A':  1.11,
+                       'C':  0.04,
+                    'camC':  0.04,
+                       'E':  1.08,
+                       'D': -0.22,
+                       'F':  11.34,
+                       'I':  7.86,
+                       'H': -3.04,
+                       'M':  6.57,
+                       'N': -1.44,
+                       'Q': -0.53,
+                       'P':  1.62,
+                       'S': -0.33,
+                       'R': -2.58,
+                       'T':  0.48,
+                       'W':  13.12,
+                       'V':  4.86,
+                       'Y':  5.4},
+                 'lcp': 0.0,
+                 'const': 0.0}
+"""A set of retention coefficients from R.C. Dwivedi, V. Spicer,
+M. Harder, M. Antonovici, W. Ens, K.G. Standing, J.A. Wilkins, and O.V. Krokhin;
+Analytical Chemistry 2008 80 (18), 7036-7042.
+Practical Implementation of 2D HPLC Scheme with Accurate Peptide
+Retention Prediction in Both Dimensions for High-Throughput Bottom-Up Proteomics.
+
+.. note:: Cysteine is Carbamidomethylated.
+
+Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% TFA), packed with
+5-um Luna C18(2) (Phenomenex, Torrance, CA), pore size 100 A, pH=2.0.
+Both eluents A (2% ACN in water) and B (98% ACN) contained
+0.1% TFA as ion-pairing modifier. 0.33% ACN/min
+linear gradient (0-30% B).
+"""
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/pyteomics/auxiliary/__init__.py b/pyteomics/auxiliary/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e3e5cde833c70404a051db917cad52c83abc89
--- /dev/null
+++ b/pyteomics/auxiliary/__init__.py
@@ -0,0 +1,35 @@
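+# Py2/Py3 compatibility shim: Python 3 has no basestring builtin, so fall
+# back to a (str, bytes) tuple that serves the same role in isinstance checks.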
+try:
+    basestring = basestring
+except NameError:
+    basestring = (str, bytes)
+
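+# The imports below re-export submodule contents so that pyteomics.auxiliary
+# presents a flat namespace: these names are importable directly from the
+# package.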
+from .structures import (
+    PyteomicsError, Charge, ChargeList,
+    _parse_charge, _parse_ion, BasicComposition,
+    unitfloat, unitint, unitstr, cvstr,
+    cvquery)
+
+from .constants import _nist_mass
+
+from .file_helpers import (
+    _file_obj, _keepstate, _keepstate_method, IteratorContextManager,
+    FileReader, IndexedTextReader, IndexedReaderMixin, TimeOrderedIndexedReaderMixin,
+    IndexSavingMixin, OffsetIndex, HierarchicalOffsetIndex, IndexSavingTextReader,
+    _file_reader, _file_writer,
+    _make_chain, _check_use_index, FileReadingProcess, TaskMappingMixin,
+    serializer, ChainBase, TableJoiner)
+
+from .math import (
+    linear_regression, linear_regression_perpendicular,
+    linear_regression_vertical)
+
+from .target_decoy import (
+    _calculate_qvalues, _qvalues_df, _decoy_or_pep_label,
+    _construct_dtype, _make_qvalues, _make_filter,
+    _itercontext, _iter, qvalues, filter, log_factorial,
+    _expectation, _confidence_value, _log_pi_r,
+    _log_pi, _make_fdr, fdr, sigma_T, sigma_fdr)
+
+from .utils import (
+    print_tree, memoize, BinaryDataArrayTransformer, ArrayConversionMixin, BinaryArrayConversionMixin,
+    MaskedArrayConversionMixin, _decode_base64_data_array)
diff --git a/pyteomics/auxiliary/constants.py b/pyteomics/auxiliary/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc76648f88ffadeaf6e61e8b655f99800e193df
--- /dev/null
+++ b/pyteomics/auxiliary/constants.py
@@ -0,0 +1,3297 @@
+### Bulky constants for other modules are defined below.
+
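+# _nist_mass maps an element symbol to a dict keyed by isotope number, each
+# value being an (exact mass, natural abundance) tuple; key 0 holds the
+# default isotope (the most abundant one, with abundance given as 1.0).
+# For example, _nist_mass['C'][13] is (13.0033548378, 0.0107), the mass and
+# abundance of carbon-13.
+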
+_nist_mass = {'Ac': {0: (227, 1.0),
+  206: (206.0145, 0.0),
+  207: (207.01195, 0.0),
+  208: (208.01155, 0.0),
+  209: (209.00949, 0.0),
+  210: (210.00944, 0.0),
+  211: (211.00773, 0.0),
+  212: (212.00781, 0.0),
+  213: (213.00661, 0.0),
+  214: (214.006902, 0.0),
+  215: (215.006454, 0.0),
+  216: (216.00872, 0.0),
+  217: (217.009347, 0.0),
+  218: (218.01164, 0.0),
+  219: (219.01242, 0.0),
+  220: (220.014763, 0.0),
+  221: (221.01559, 0.0),
+  222: (222.017844, 0.0),
+  223: (223.019137, 0.0),
+  224: (224.021723, 0.0),
+  225: (225.02323, 0.0),
+  226: (226.026098, 0.0),
+  227: (227.0277521, 0.0),
+  228: (228.0310211, 0.0),
+  229: (229.03302, 0.0),
+  230: (230.03629, 0.0),
+  231: (231.03856, 0.0),
+  232: (232.04203, 0.0),
+  233: (233.04455, 0.0),
+  234: (234.04842, 0.0),
+  235: (235.05123, 0.0),
+  236: (236.0553, 0.0)},
+ 'Ag': {0: (106.905097, 1.0),
+  93: (92.94978, 0.0),
+  94: (93.94278, 0.0),
+  95: (94.93548, 0.0),
+  96: (95.93068, 0.0),
+  97: (96.92397, 0.0),
+  98: (97.92157, 0.0),
+  99: (98.9176, 0.0),
+  100: (99.9161, 0.0),
+  101: (100.9128, 0.0),
+  102: (101.91169, 0.0),
+  103: (102.908973, 0.0),
+  104: (103.908629, 0.0),
+  105: (104.906529, 0.0),
+  106: (105.906669, 0.0),
+  107: (106.905097, 0.51839),
+  108: (107.905956, 0.0),
+  109: (108.904752, 0.48161),
+  110: (109.906107, 0.0),
+  111: (110.905291, 0.0),
+  112: (111.907005, 0.0),
+  113: (112.906567, 0.0),
+  114: (113.908804, 0.0),
+  115: (114.90876, 0.0),
+  116: (115.91136, 0.0),
+  117: (116.91168, 0.0),
+  118: (117.91458, 0.0),
+  119: (118.91567, 0.0),
+  120: (119.91879, 0.0),
+  121: (120.91985, 0.0),
+  122: (121.92353, 0.0),
+  123: (122.9249, 0.0),
+  124: (123.92864, 0.0),
+  125: (124.93043, 0.0),
+  126: (125.9345, 0.0),
+  127: (126.93677, 0.0),
+  128: (127.94117, 0.0),
+  129: (128.94369, 0.0),
+  130: (129.95045, 0.0)},
+ 'Al': {0: (26.98153863, 1.0),
+  21: (21.02804, 0.0),
+  22: (22.01952, 0.0),
+  23: (23.007267, 0.0),
+  24: (23.9999389, 0.0),
+  25: (24.9904281, 0.0),
+  26: (25.98689169, 0.0),
+  27: (26.98153863, 1.0),
+  28: (27.98191031, 0.0),
+  29: (28.980445, 0.0),
+  30: (29.98296, 0.0),
+  31: (30.983947, 0.0),
+  32: (31.98812, 0.0),
+  33: (32.99084, 0.0),
+  34: (33.99685, 0.0),
+  35: (34.99986, 0.0),
+  36: (36.00621, 0.0),
+  37: (37.01068, 0.0),
+  38: (38.01723, 0.0),
+  39: (39.02297, 0.0),
+  40: (40.03145, 0.0),
+  41: (41.03833, 0.0),
+  42: (42.04689, 0.0)},
+ 'Am': {0: (243, 1.0),
+  231: (231.04556, 0.0),
+  232: (232.04659, 0.0),
+  233: (233.04635, 0.0),
+  234: (234.04781, 0.0),
+  235: (235.04795, 0.0),
+  236: (236.04958, 0.0),
+  237: (237.05, 0.0),
+  238: (238.05198, 0.0),
+  239: (239.0530245, 0.0),
+  240: (240.0553, 0.0),
+  241: (241.0568291, 0.0),
+  242: (242.0595492, 0.0),
+  243: (243.0613811, 0.0),
+  244: (244.0642848, 0.0),
+  245: (245.066452, 0.0),
+  246: (246.069775, 0.0),
+  247: (247.07209, 0.0),
+  248: (248.07575, 0.0),
+  249: (249.07848, 0.0)},
+ 'Ar': {0: (39.9623831225, 1.0),
+  30: (30.02156, 0.0),
+  31: (31.01212, 0.0),
+  32: (31.997638, 0.0),
+  33: (32.9899257, 0.0),
+  34: (33.9802712, 0.0),
+  35: (34.9752576, 0.0),
+  36: (35.967545106, 0.003365),
+  37: (36.96677632, 0.0),
+  38: (37.9627324, 0.000632),
+  39: (38.964313, 0.0),
+  40: (39.9623831225, 0.996003),
+  41: (40.9645006, 0.0),
+  42: (41.963046, 0.0),
+  43: (42.965636, 0.0),
+  44: (43.964924, 0.0),
+  45: (44.96804, 0.0),
+  46: (45.96809, 0.0),
+  47: (46.97219, 0.0),
+  48: (47.97454, 0.0),
+  49: (48.98052, 0.0),
+  50: (49.98443, 0.0),
+  51: (50.99163, 0.0),
+  52: (51.99678, 0.0),
+  53: (53.00494, 0.0)},
+ 'As': {0: (74.9215965, 1.0),
+  60: (59.99313, 0.0),
+  61: (60.98062, 0.0),
+  62: (61.9732, 0.0),
+  63: (62.96369, 0.0),
+  64: (63.95757, 0.0),
+  65: (64.94956, 0.0),
+  66: (65.94471, 0.0),
+  67: (66.93919, 0.0),
+  68: (67.93677, 0.0),
+  69: (68.93227, 0.0),
+  70: (69.93092, 0.0),
+  71: (70.927112, 0.0),
+  72: (71.926752, 0.0),
+  73: (72.923825, 0.0),
+  74: (73.9239287, 0.0),
+  75: (74.9215965, 1.0),
+  76: (75.922394, 0.0),
+  77: (76.9206473, 0.0),
+  78: (77.921827, 0.0),
+  79: (78.920948, 0.0),
+  80: (79.922534, 0.0),
+  81: (80.922132, 0.0),
+  82: (81.9245, 0.0),
+  83: (82.92498, 0.0),
+  84: (83.92906, 0.0),
+  85: (84.93202, 0.0),
+  86: (85.9365, 0.0),
+  87: (86.9399, 0.0),
+  88: (87.94494, 0.0),
+  89: (88.94939, 0.0),
+  90: (89.9555, 0.0),
+  91: (90.96043, 0.0),
+  92: (91.9668, 0.0)},
+ 'At': {0: (210, 1.0),
+  193: (192.99984, 0.0),
+  194: (193.99873, 0.0),
+  195: (194.996268, 0.0),
+  196: (195.99579, 0.0),
+  197: (196.99319, 0.0),
+  198: (197.99284, 0.0),
+  199: (198.99053, 0.0),
+  200: (199.990351, 0.0),
+  201: (200.988417, 0.0),
+  202: (201.98863, 0.0),
+  203: (202.986942, 0.0),
+  204: (203.987251, 0.0),
+  205: (204.986074, 0.0),
+  206: (205.986667, 0.0),
+  207: (206.985784, 0.0),
+  208: (207.98659, 0.0),
+  209: (208.986173, 0.0),
+  210: (209.987148, 0.0),
+  211: (210.9874963, 0.0),
+  212: (211.990745, 0.0),
+  213: (212.992937, 0.0),
+  214: (213.996372, 0.0),
+  215: (214.998653, 0.0),
+  216: (216.002423, 0.0),
+  217: (217.004719, 0.0),
+  218: (218.008694, 0.0),
+  219: (219.011162, 0.0),
+  220: (220.01541, 0.0),
+  221: (221.01805, 0.0),
+  222: (222.02233, 0.0),
+  223: (223.02519, 0.0)},
+ 'Au': {0: (196.9665687, 1.0),
+  169: (168.99808, 0.0),
+  170: (169.99612, 0.0),
+  171: (170.991879, 0.0),
+  172: (171.99004, 0.0),
+  173: (172.986237, 0.0),
+  174: (173.98476, 0.0),
+  175: (174.98127, 0.0),
+  176: (175.9801, 0.0),
+  177: (176.976865, 0.0),
+  178: (177.97603, 0.0),
+  179: (178.973213, 0.0),
+  180: (179.972521, 0.0),
+  181: (180.970079, 0.0),
+  182: (181.969618, 0.0),
+  183: (182.967593, 0.0),
+  184: (183.967452, 0.0),
+  185: (184.965789, 0.0),
+  186: (185.965953, 0.0),
+  187: (186.964568, 0.0),
+  188: (187.965324, 0.0),
+  189: (188.963948, 0.0),
+  190: (189.9647, 0.0),
+  191: (190.9637, 0.0),
+  192: (191.964813, 0.0),
+  193: (192.96415, 0.0),
+  194: (193.965365, 0.0),
+  195: (194.9650346, 0.0),
+  196: (195.96657, 0.0),
+  197: (196.9665687, 1.0),
+  198: (197.9682423, 0.0),
+  199: (198.9687652, 0.0),
+  200: (199.97073, 0.0),
+  201: (200.971657, 0.0),
+  202: (201.97381, 0.0),
+  203: (202.975155, 0.0),
+  204: (203.97772, 0.0),
+  205: (204.97987, 0.0)},
+ 'B': {0: (11.0093054, 1.0),
+  6: (6.04681, 0.0),
+  7: (7.02992, 0.0),
+  8: (8.0246072, 0.0),
+  9: (9.0133288, 0.0),
+  10: (10.012937, 0.199),
+  11: (11.0093054, 0.801),
+  12: (12.0143521, 0.0),
+  13: (13.0177802, 0.0),
+  14: (14.025404, 0.0),
+  15: (15.031103, 0.0),
+  16: (16.03981, 0.0),
+  17: (17.04699, 0.0),
+  18: (18.05617, 0.0),
+  19: (19.06373, 0.0)},
+ 'Ba': {0: (137.9052472, 1.0),
+  114: (113.95068, 0.0),
+  115: (114.94737, 0.0),
+  116: (115.94138, 0.0),
+  117: (116.9385, 0.0),
+  118: (117.93304, 0.0),
+  119: (118.93066, 0.0),
+  120: (119.92604, 0.0),
+  121: (120.92405, 0.0),
+  122: (121.9199, 0.0),
+  123: (122.918781, 0.0),
+  124: (123.915094, 0.0),
+  125: (124.914473, 0.0),
+  126: (125.91125, 0.0),
+  127: (126.911094, 0.0),
+  128: (127.908318, 0.0),
+  129: (128.908679, 0.0),
+  130: (129.9063208, 0.00106),
+  131: (130.906941, 0.0),
+  132: (131.9050613, 0.00101),
+  133: (132.9060075, 0.0),
+  134: (133.9045084, 0.02417),
+  135: (134.9056886, 0.06592),
+  136: (135.9045759, 0.07854),
+  137: (136.9058274, 0.11232),
+  138: (137.9052472, 0.71698),
+  139: (138.9088413, 0.0),
+  140: (139.910605, 0.0),
+  141: (140.914411, 0.0),
+  142: (141.916453, 0.0),
+  143: (142.920627, 0.0),
+  144: (143.922953, 0.0),
+  145: (144.92763, 0.0),
+  146: (145.93022, 0.0),
+  147: (146.93495, 0.0),
+  148: (147.93772, 0.0),
+  149: (148.94258, 0.0),
+  150: (149.94568, 0.0),
+  151: (150.95081, 0.0),
+  152: (151.95427, 0.0),
+  153: (152.95961, 0.0)},
+ 'Be': {0: (9.0121822, 1.0),
+  5: (5.04079, 0.0),
+  6: (6.019726, 0.0),
+  7: (7.01692983, 0.0),
+  8: (8.0053051, 0.0),
+  9: (9.0121822, 1.0),
+  10: (10.0135338, 0.0),
+  11: (11.021658, 0.0),
+  12: (12.026921, 0.0),
+  13: (13.03569, 0.0),
+  14: (14.04289, 0.0),
+  15: (15.05346, 0.0),
+  16: (16.06192, 0.0)},
+ 'Bh': {0: (272, 1.0),
+  260: (260.12197, 0.0),
+  261: (261.12166, 0.0),
+  262: (262.12289, 0.0),
+  263: (263.12304, 0.0),
+  264: (264.1246, 0.0),
+  265: (265.12515, 0.0),
+  266: (266.12694, 0.0),
+  267: (267.12765, 0.0),
+  268: (268.12976, 0.0),
+  269: (269.13069, 0.0),
+  270: (270.13362, 0.0),
+  271: (271.13518, 0.0),
+  272: (272.13803, 0.0),
+  273: (273.13962, 0.0),
+  274: (274.14244, 0.0),
+  275: (275.14425, 0.0)},
+ 'Bi': {0: (208.9803987, 1.0),
+  184: (184.00112, 0.0),
+  185: (184.99763, 0.0),
+  186: (185.9966, 0.0),
+  187: (186.993158, 0.0),
+  188: (187.99227, 0.0),
+  189: (188.9892, 0.0),
+  190: (189.9883, 0.0),
+  191: (190.985786, 0.0),
+  192: (191.98546, 0.0),
+  193: (192.98296, 0.0),
+  194: (193.98283, 0.0),
+  195: (194.980651, 0.0),
+  196: (195.980667, 0.0),
+  197: (196.978864, 0.0),
+  198: (197.97921, 0.0),
+  199: (198.977672, 0.0),
+  200: (199.978132, 0.0),
+  201: (200.977009, 0.0),
+  202: (201.977742, 0.0),
+  203: (202.976876, 0.0),
+  204: (203.977813, 0.0),
+  205: (204.977389, 0.0),
+  206: (205.978499, 0.0),
+  207: (206.9784707, 0.0),
+  208: (207.9797422, 0.0),
+  209: (208.9803987, 1.0),
+  210: (209.9841204, 0.0),
+  211: (210.987269, 0.0),
+  212: (211.9912857, 0.0),
+  213: (212.994385, 0.0),
+  214: (213.998712, 0.0),
+  215: (215.00177, 0.0),
+  216: (216.006306, 0.0),
+  217: (217.00947, 0.0),
+  218: (218.01432, 0.0)},
+ 'Bk': {0: (247, 1.0),
+  235: (235.05658, 0.0),
+  236: (236.05733, 0.0),
+  237: (237.057, 0.0),
+  238: (238.05828, 0.0),
+  239: (239.05828, 0.0),
+  240: (240.05976, 0.0),
+  241: (241.06023, 0.0),
+  242: (242.06198, 0.0),
+  243: (243.063008, 0.0),
+  244: (244.065181, 0.0),
+  245: (245.0663616, 0.0),
+  246: (246.06867, 0.0),
+  247: (247.070307, 0.0),
+  248: (248.07309, 0.0),
+  249: (249.0749867, 0.0),
+  250: (250.078317, 0.0),
+  251: (251.08076, 0.0),
+  252: (252.08431, 0.0),
+  253: (253.08688, 0.0),
+  254: (254.0906, 0.0)},
+ 'Br': {0: (78.9183371, 1.0),
+  67: (66.96479, 0.0),
+  68: (67.95852, 0.0),
+  69: (68.95011, 0.0),
+  70: (69.94479, 0.0),
+  71: (70.93874, 0.0),
+  72: (71.93664, 0.0),
+  73: (72.93169, 0.0),
+  74: (73.929891, 0.0),
+  75: (74.925776, 0.0),
+  76: (75.924541, 0.0),
+  77: (76.921379, 0.0),
+  78: (77.921146, 0.0),
+  79: (78.9183371, 0.5069),
+  80: (79.9185293, 0.0),
+  81: (80.9162906, 0.4931),
+  82: (81.9168041, 0.0),
+  83: (82.91518, 0.0),
+  84: (83.916479, 0.0),
+  85: (84.915608, 0.0),
+  86: (85.918798, 0.0),
+  87: (86.920711, 0.0),
+  88: (87.92407, 0.0),
+  89: (88.92639, 0.0),
+  90: (89.93063, 0.0),
+  91: (90.93397, 0.0),
+  92: (91.93926, 0.0),
+  93: (92.94305, 0.0),
+  94: (93.94868, 0.0),
+  95: (94.95287, 0.0),
+  96: (95.95853, 0.0),
+  97: (96.9628, 0.0)},
+ 'C': {0: (12.0, 1.0),
+  8: (8.037675, 0.0),
+  9: (9.0310367, 0.0),
+  10: (10.0168532, 0.0),
+  11: (11.0114336, 0.0),
+  12: (12.0, 0.9893),
+  13: (13.0033548378, 0.0107),
+  14: (14.003241989, 0.0),
+  15: (15.0105993, 0.0),
+  16: (16.014701, 0.0),
+  17: (17.022586, 0.0),
+  18: (18.02676, 0.0),
+  19: (19.03481, 0.0),
+  20: (20.04032, 0.0),
+  21: (21.04934, 0.0),
+  22: (22.0572, 0.0)},
+ 'Ca': {0: (39.96259098, 1.0),
+  34: (34.01412, 0.0),
+  35: (35.00494, 0.0),
+  36: (35.99309, 0.0),
+  37: (36.98587, 0.0),
+  38: (37.976318, 0.0),
+  39: (38.9707197, 0.0),
+  40: (39.96259098, 0.96941),
+  41: (40.96227806, 0.0),
+  42: (41.95861801, 0.00647),
+  43: (42.9587666, 0.00135),
+  44: (43.9554818, 0.02086),
+  45: (44.9561866, 0.0),
+  46: (45.9536926, 4e-05),
+  47: (46.954546, 0.0),
+  48: (47.952534, 0.00187),
+  49: (48.955674, 0.0),
+  50: (49.957519, 0.0),
+  51: (50.9615, 0.0),
+  52: (51.9651, 0.0),
+  53: (52.97005, 0.0),
+  54: (53.97435, 0.0),
+  55: (54.98055, 0.0),
+  56: (55.98557, 0.0),
+  57: (56.99236, 0.0)},
+ 'Cd': {0: (113.9033585, 1.0),
+  95: (94.94987, 0.0),
+  96: (95.93977, 0.0),
+  97: (96.93494, 0.0),
+  98: (97.9274, 0.0),
+  99: (98.92501, 0.0),
+  100: (99.92029, 0.0),
+  101: (100.91868, 0.0),
+  102: (101.91446, 0.0),
+  103: (102.913419, 0.0),
+  104: (103.909849, 0.0),
+  105: (104.909468, 0.0),
+  106: (105.906459, 0.0125),
+  107: (106.906618, 0.0),
+  108: (107.904184, 0.0089),
+  109: (108.904982, 0.0),
+  110: (109.9030021, 0.1249),
+  111: (110.9041781, 0.128),
+  112: (111.9027578, 0.2413),
+  113: (112.9044017, 0.1222),
+  114: (113.9033585, 0.2873),
+  115: (114.905431, 0.0),
+  116: (115.904756, 0.0749),
+  117: (116.907219, 0.0),
+  118: (117.906915, 0.0),
+  119: (118.90992, 0.0),
+  120: (119.90985, 0.0),
+  121: (120.91298, 0.0),
+  122: (121.91333, 0.0),
+  123: (122.917, 0.0),
+  124: (123.91765, 0.0),
+  125: (124.92125, 0.0),
+  126: (125.92235, 0.0),
+  127: (126.92644, 0.0),
+  128: (127.92776, 0.0),
+  129: (128.93215, 0.0),
+  130: (129.9339, 0.0),
+  131: (130.94067, 0.0),
+  132: (131.94555, 0.0)},
+ 'Ce': {0: (139.9054387, 1.0),
+  119: (118.95276, 0.0),
+  120: (119.94664, 0.0),
+  121: (120.94342, 0.0),
+  122: (121.93791, 0.0),
+  123: (122.9354, 0.0),
+  124: (123.93041, 0.0),
+  125: (124.92844, 0.0),
+  126: (125.92397, 0.0),
+  127: (126.92273, 0.0),
+  128: (127.91891, 0.0),
+  129: (128.9181, 0.0),
+  130: (129.91474, 0.0),
+  131: (130.91442, 0.0),
+  132: (131.91146, 0.0),
+  133: (132.911515, 0.0),
+  134: (133.908925, 0.0),
+  135: (134.909151, 0.0),
+  136: (135.907172, 0.00185),
+  137: (136.907806, 0.0),
+  138: (137.905991, 0.00251),
+  139: (138.906653, 0.0),
+  140: (139.9054387, 0.8845),
+  141: (140.9082763, 0.0),
+  142: (141.909244, 0.11114),
+  143: (142.912386, 0.0),
+  144: (143.913647, 0.0),
+  145: (144.91723, 0.0),
+  146: (145.91876, 0.0),
+  147: (146.92267, 0.0),
+  148: (147.92443, 0.0),
+  149: (148.9284, 0.0),
+  150: (149.93041, 0.0),
+  151: (150.93398, 0.0),
+  152: (151.93654, 0.0),
+  153: (152.94058, 0.0),
+  154: (153.94342, 0.0),
+  155: (154.94804, 0.0),
+  156: (155.95126, 0.0),
+  157: (156.95634, 0.0)},
+ 'Cf': {0: (251, 1.0),
+  237: (237.06207, 0.0),
+  238: (238.06141, 0.0),
+  239: (239.06242, 0.0),
+  240: (240.0623, 0.0),
+  241: (241.06373, 0.0),
+  242: (242.0637, 0.0),
+  243: (243.06543, 0.0),
+  244: (244.066001, 0.0),
+  245: (245.068049, 0.0),
+  246: (246.0688053, 0.0),
+  247: (247.071001, 0.0),
+  248: (248.072185, 0.0),
+  249: (249.0748535, 0.0),
+  250: (250.0764061, 0.0),
+  251: (251.079587, 0.0),
+  252: (252.081626, 0.0),
+  253: (253.085133, 0.0),
+  254: (254.087323, 0.0),
+  255: (255.09105, 0.0),
+  256: (256.09344, 0.0)},
+ 'Cl': {0: (34.96885268, 1.0),
+  28: (28.02851, 0.0),
+  29: (29.01411, 0.0),
+  30: (30.00477, 0.0),
+  31: (30.99241, 0.0),
+  32: (31.98569, 0.0),
+  33: (32.9774519, 0.0),
+  34: (33.97376282, 0.0),
+  35: (34.96885268, 0.7576),
+  36: (35.96830698, 0.0),
+  37: (36.96590259, 0.2424),
+  38: (37.96801043, 0.0),
+  39: (38.9680082, 0.0),
+  40: (39.97042, 0.0),
+  41: (40.97068, 0.0),
+  42: (41.97325, 0.0),
+  43: (42.97405, 0.0),
+  44: (43.97828, 0.0),
+  45: (44.98029, 0.0),
+  46: (45.98421, 0.0),
+  47: (46.98871, 0.0),
+  48: (47.99495, 0.0),
+  49: (49.00032, 0.0),
+  50: (50.00784, 0.0),
+  51: (51.01449, 0.0)},
+ 'Cm': {0: (247, 1.0),
+  233: (233.05077, 0.0),
+  234: (234.05016, 0.0),
+  235: (235.05143, 0.0),
+  236: (236.05141, 0.0),
+  237: (237.0529, 0.0),
+  238: (238.05303, 0.0),
+  239: (239.05496, 0.0),
+  240: (240.0555295, 0.0),
+  241: (241.057653, 0.0),
+  242: (242.0588358, 0.0),
+  243: (243.0613891, 0.0),
+  244: (244.0627526, 0.0),
+  245: (245.0654912, 0.0),
+  246: (246.0672237, 0.0),
+  247: (247.070354, 0.0),
+  248: (248.072349, 0.0),
+  249: (249.075953, 0.0),
+  250: (250.078357, 0.0),
+  251: (251.082285, 0.0),
+  252: (252.08487, 0.0)},
+ 'Cn': {0: (285, 1.0),
+  277: (277.16394, 0.0),
+  278: (278.16431, 0.0),
+  279: (279.16655, 0.0),
+  280: (280.16704, 0.0),
+  281: (281.16929, 0.0),
+  282: (282.16977, 0.0),
+  283: (283.17179, 0.0),
+  284: (284.17238, 0.0),
+  285: (285.17411, 0.0)},
+ 'Co': {0: (58.933195, 1.0),
+  47: (47.01149, 0.0),
+  48: (48.00176, 0.0),
+  49: (48.98972, 0.0),
+  50: (49.98154, 0.0),
+  51: (50.97072, 0.0),
+  52: (51.96359, 0.0),
+  53: (52.954219, 0.0),
+  54: (53.9484596, 0.0),
+  55: (54.941999, 0.0),
+  56: (55.9398393, 0.0),
+  57: (56.9362914, 0.0),
+  58: (57.9357528, 0.0),
+  59: (58.933195, 1.0),
+  60: (59.9338171, 0.0),
+  61: (60.9324758, 0.0),
+  62: (61.934051, 0.0),
+  63: (62.933612, 0.0),
+  64: (63.93581, 0.0),
+  65: (64.936478, 0.0),
+  66: (65.93976, 0.0),
+  67: (66.94089, 0.0),
+  68: (67.94487, 0.0),
+  69: (68.94632, 0.0),
+  70: (69.951, 0.0),
+  71: (70.9529, 0.0),
+  72: (71.95781, 0.0),
+  73: (72.96024, 0.0),
+  74: (73.96538, 0.0),
+  75: (74.96833, 0.0)},
+ 'Cr': {0: (51.9405075, 1.0),
+  42: (42.00643, 0.0),
+  43: (42.99771, 0.0),
+  44: (43.98555, 0.0),
+  45: (44.97964, 0.0),
+  46: (45.968359, 0.0),
+  47: (46.9629, 0.0),
+  48: (47.954032, 0.0),
+  49: (48.9513357, 0.0),
+  50: (49.9460442, 0.04345),
+  51: (50.9447674, 0.0),
+  52: (51.9405075, 0.83789),
+  53: (52.9406494, 0.09501),
+  54: (53.9388804, 0.02365),
+  55: (54.9408397, 0.0),
+  56: (55.9406531, 0.0),
+  57: (56.943613, 0.0),
+  58: (57.94435, 0.0),
+  59: (58.94859, 0.0),
+  60: (59.95008, 0.0),
+  61: (60.95472, 0.0),
+  62: (61.95661, 0.0),
+  63: (62.96186, 0.0),
+  64: (63.96441, 0.0),
+  65: (64.97016, 0.0),
+  66: (65.97338, 0.0),
+  67: (66.97955, 0.0)},
+ 'Cs': {0: (132.905451933, 1.0),
+  112: (111.9503, 0.0),
+  113: (112.94449, 0.0),
+  114: (113.94145, 0.0),
+  115: (114.93591, 0.0),
+  116: (115.93337, 0.0),
+  117: (116.92867, 0.0),
+  118: (117.926559, 0.0),
+  119: (118.922377, 0.0),
+  120: (119.920677, 0.0),
+  121: (120.917229, 0.0),
+  122: (121.91611, 0.0),
+  123: (122.912996, 0.0),
+  124: (123.912258, 0.0),
+  125: (124.909728, 0.0),
+  126: (125.909452, 0.0),
+  127: (126.907418, 0.0),
+  128: (127.907749, 0.0),
+  129: (128.906064, 0.0),
+  130: (129.906709, 0.0),
+  131: (130.905464, 0.0),
+  132: (131.9064343, 0.0),
+  133: (132.905451933, 1.0),
+  134: (133.906718475, 0.0),
+  135: (134.905977, 0.0),
+  136: (135.9073116, 0.0),
+  137: (136.9070895, 0.0),
+  138: (137.911017, 0.0),
+  139: (138.913364, 0.0),
+  140: (139.917282, 0.0),
+  141: (140.920046, 0.0),
+  142: (141.924299, 0.0),
+  143: (142.927352, 0.0),
+  144: (143.932077, 0.0),
+  145: (144.935526, 0.0),
+  146: (145.94029, 0.0),
+  147: (146.94416, 0.0),
+  148: (147.94922, 0.0),
+  149: (148.95293, 0.0),
+  150: (149.95817, 0.0),
+  151: (150.96219, 0.0)},
+ 'Cu': {0: (62.9295975, 1.0),
+  52: (51.99718, 0.0),
+  53: (52.98555, 0.0),
+  54: (53.97671, 0.0),
+  55: (54.96605, 0.0),
+  56: (55.95856, 0.0),
+  57: (56.949211, 0.0),
+  58: (57.9445385, 0.0),
+  59: (58.939498, 0.0),
+  60: (59.937365, 0.0),
+  61: (60.9334578, 0.0),
+  62: (61.932584, 0.0),
+  63: (62.9295975, 0.6915),
+  64: (63.9297642, 0.0),
+  65: (64.9277895, 0.3085),
+  66: (65.9288688, 0.0),
+  67: (66.9277303, 0.0),
+  68: (67.9296109, 0.0),
+  69: (68.9294293, 0.0),
+  70: (69.9323923, 0.0),
+  71: (70.9326768, 0.0),
+  72: (71.9358203, 0.0),
+  73: (72.936675, 0.0),
+  74: (73.939875, 0.0),
+  75: (74.9419, 0.0),
+  76: (75.945275, 0.0),
+  77: (76.94785, 0.0),
+  78: (77.95196, 0.0),
+  79: (78.95456, 0.0),
+  80: (79.96087, 0.0)},
+ 'Db': {0: (268, 1.0),
+  255: (255.1074, 0.0),
+  256: (256.10813, 0.0),
+  257: (257.10772, 0.0),
+  258: (258.10923, 0.0),
+  259: (259.10961, 0.0),
+  260: (260.1113, 0.0),
+  261: (261.11206, 0.0),
+  262: (262.11408, 0.0),
+  263: (263.11499, 0.0),
+  264: (264.1174, 0.0),
+  265: (265.1186, 0.0),
+  266: (266.12103, 0.0),
+  267: (267.12238, 0.0),
+  268: (268.12545, 0.0),
+  269: (269.12746, 0.0),
+  270: (270.13071, 0.0)},
+ 'Ds': {0: (281, 1.0),
+  267: (267.14434, 0.0),
+  268: (268.1438, 0.0),
+  269: (269.14512, 0.0),
+  270: (270.14472, 0.0),
+  271: (271.14606, 0.0),
+  272: (272.14632, 0.0),
+  273: (273.14886, 0.0),
+  274: (274.14949, 0.0),
+  275: (275.15218, 0.0),
+  276: (276.15303, 0.0),
+  277: (277.15565, 0.0),
+  278: (278.15647, 0.0),
+  279: (279.15886, 0.0),
+  280: (280.1598, 0.0),
+  281: (281.16206, 0.0)},
+ 'Dy': {0: (163.9291748, 1.0),
+  138: (137.96249, 0.0),
+  139: (138.95954, 0.0),
+  140: (139.95401, 0.0),
+  141: (140.95135, 0.0),
+  142: (141.94637, 0.0),
+  143: (142.94383, 0.0),
+  144: (143.93925, 0.0),
+  145: (144.93743, 0.0),
+  146: (145.932845, 0.0),
+  147: (146.931092, 0.0),
+  148: (147.92715, 0.0),
+  149: (148.927305, 0.0),
+  150: (149.925585, 0.0),
+  151: (150.926185, 0.0),
+  152: (151.924718, 0.0),
+  153: (152.925765, 0.0),
+  154: (153.924424, 0.0),
+  155: (154.925754, 0.0),
+  156: (155.924283, 0.00056),
+  157: (156.925466, 0.0),
+  158: (157.924409, 0.00095),
+  159: (158.9257392, 0.0),
+  160: (159.9251975, 0.02329),
+  161: (160.9269334, 0.18889),
+  162: (161.9267984, 0.25475),
+  163: (162.9287312, 0.24896),
+  164: (163.9291748, 0.2826),
+  165: (164.9317033, 0.0),
+  166: (165.9328067, 0.0),
+  167: (166.93566, 0.0),
+  168: (167.93713, 0.0),
+  169: (168.94031, 0.0),
+  170: (169.94239, 0.0),
+  171: (170.9462, 0.0),
+  172: (171.94876, 0.0),
+  173: (172.953, 0.0)},
+ 'Er': {0: (165.9302931, 1.0),
+  143: (142.96634, 0.0),
+  144: (143.96038, 0.0),
+  145: (144.95739, 0.0),
+  146: (145.952, 0.0),
+  147: (146.94949, 0.0),
+  148: (147.94455, 0.0),
+  149: (148.94231, 0.0),
+  150: (149.937914, 0.0),
+  151: (150.937449, 0.0),
+  152: (151.93505, 0.0),
+  153: (152.935063, 0.0),
+  154: (153.932783, 0.0),
+  155: (154.933209, 0.0),
+  156: (155.931065, 0.0),
+  157: (156.93192, 0.0),
+  158: (157.929893, 0.0),
+  159: (158.930684, 0.0),
+  160: (159.929083, 0.0),
+  161: (160.929995, 0.0),
+  162: (161.928778, 0.00139),
+  163: (162.930033, 0.0),
+  164: (163.9292, 0.01601),
+  165: (164.930726, 0.0),
+  166: (165.9302931, 0.33503),
+  167: (166.9320482, 0.22869),
+  168: (167.9323702, 0.26978),
+  169: (168.9345904, 0.0),
+  170: (169.9354643, 0.1491),
+  171: (170.9380298, 0.0),
+  172: (171.939356, 0.0),
+  173: (172.9424, 0.0),
+  174: (173.94423, 0.0),
+  175: (174.94777, 0.0),
+  176: (175.95008, 0.0),
+  177: (176.95405, 0.0)},
+ 'Es': {0: (252, 1.0),
+  240: (240.06892, 0.0),
+  241: (241.06854, 0.0),
+  242: (242.06975, 0.0),
+  243: (243.06955, 0.0),
+  244: (244.07088, 0.0),
+  245: (245.07132, 0.0),
+  246: (246.0729, 0.0),
+  247: (247.07366, 0.0),
+  248: (248.07547, 0.0),
+  249: (249.07641, 0.0),
+  250: (250.07861, 0.0),
+  251: (251.079992, 0.0),
+  252: (252.08298, 0.0),
+  253: (253.0848247, 0.0),
+  254: (254.088022, 0.0),
+  255: (255.090273, 0.0),
+  256: (256.0936, 0.0),
+  257: (257.09598, 0.0),
+  258: (258.09952, 0.0)},
+ 'Eu': {0: (152.9212303, 1.0),
+  130: (129.96357, 0.0),
+  131: (130.95775, 0.0),
+  132: (131.95437, 0.0),
+  133: (132.94924, 0.0),
+  134: (133.94651, 0.0),
+  135: (134.94182, 0.0),
+  136: (135.9396, 0.0),
+  137: (136.93557, 0.0),
+  138: (137.93371, 0.0),
+  139: (138.929792, 0.0),
+  140: (139.92809, 0.0),
+  141: (140.924931, 0.0),
+  142: (141.92343, 0.0),
+  143: (142.920298, 0.0),
+  144: (143.918817, 0.0),
+  145: (144.916265, 0.0),
+  146: (145.917206, 0.0),
+  147: (146.916746, 0.0),
+  148: (147.918086, 0.0),
+  149: (148.917931, 0.0),
+  150: (149.919702, 0.0),
+  151: (150.9198502, 0.4781),
+  152: (151.9217445, 0.0),
+  153: (152.9212303, 0.5219),
+  154: (153.9229792, 0.0),
+  155: (154.9228933, 0.0),
+  156: (155.924752, 0.0),
+  157: (156.925424, 0.0),
+  158: (157.92785, 0.0),
+  159: (158.929089, 0.0),
+  160: (159.93197, 0.0),
+  161: (160.93368, 0.0),
+  162: (161.93704, 0.0),
+  163: (162.93921, 0.0),
+  164: (163.94299, 0.0),
+  165: (164.94572, 0.0),
+  166: (165.94997, 0.0),
+  167: (166.95321, 0.0)},
+ 'F': {0: (18.99840322, 1.0),
+  14: (14.03506, 0.0),
+  15: (15.01801, 0.0),
+  16: (16.011466, 0.0),
+  17: (17.00209524, 0.0),
+  18: (18.000938, 0.0),
+  19: (18.99840322, 1.0),
+  20: (19.99998132, 0.0),
+  21: (20.999949, 0.0),
+  22: (22.002999, 0.0),
+  23: (23.00357, 0.0),
+  24: (24.00812, 0.0),
+  25: (25.0121, 0.0),
+  26: (26.01962, 0.0),
+  27: (27.02676, 0.0),
+  28: (28.03567, 0.0),
+  29: (29.04326, 0.0),
+  30: (30.0525, 0.0),
+  31: (31.06043, 0.0)},
+ 'Fe': {0: (55.9349375, 1.0),
+  45: (45.01458, 0.0),
+  46: (46.00081, 0.0),
+  47: (46.99289, 0.0),
+  48: (47.9805, 0.0),
+  49: (48.97361, 0.0),
+  50: (49.96299, 0.0),
+  51: (50.95682, 0.0),
+  52: (51.948114, 0.0),
+  53: (52.9453079, 0.0),
+  54: (53.9396105, 0.05845),
+  55: (54.9382934, 0.0),
+  56: (55.9349375, 0.91754),
+  57: (56.935394, 0.02119),
+  58: (57.9332756, 0.00282),
+  59: (58.9348755, 0.0),
+  60: (59.934072, 0.0),
+  61: (60.936745, 0.0),
+  62: (61.936767, 0.0),
+  63: (62.94037, 0.0),
+  64: (63.9412, 0.0),
+  65: (64.94538, 0.0),
+  66: (65.94678, 0.0),
+  67: (66.95095, 0.0),
+  68: (67.9537, 0.0),
+  69: (68.95878, 0.0),
+  70: (69.96146, 0.0),
+  71: (70.96672, 0.0),
+  72: (71.96962, 0.0)},
+ 'Fm': {0: (257, 1.0),
+  242: (242.07343, 0.0),
+  243: (243.07435, 0.0),
+  244: (244.07408, 0.0),
+  245: (245.07539, 0.0),
+  246: (246.0753, 0.0),
+  247: (247.07685, 0.0),
+  248: (248.077195, 0.0),
+  249: (249.07903, 0.0),
+  250: (250.079521, 0.0),
+  251: (251.081575, 0.0),
+  252: (252.082467, 0.0),
+  253: (253.085185, 0.0),
+  254: (254.0868542, 0.0),
+  255: (255.089962, 0.0),
+  256: (256.091773, 0.0),
+  257: (257.095105, 0.0),
+  258: (258.09708, 0.0),
+  259: (259.1006, 0.0),
+  260: (260.10268, 0.0)},
+ 'Fr': {0: (223, 1.0),
+  199: (199.00726, 0.0),
+  200: (200.00657, 0.0),
+  201: (201.00386, 0.0),
+  202: (202.00337, 0.0),
+  203: (203.000925, 0.0),
+  204: (204.000653, 0.0),
+  205: (204.998594, 0.0),
+  206: (205.99867, 0.0),
+  207: (206.99695, 0.0),
+  208: (207.99714, 0.0),
+  209: (208.995954, 0.0),
+  210: (209.996408, 0.0),
+  211: (210.995537, 0.0),
+  212: (211.996202, 0.0),
+  213: (212.996189, 0.0),
+  214: (213.998971, 0.0),
+  215: (215.000341, 0.0),
+  216: (216.003198, 0.0),
+  217: (217.004632, 0.0),
+  218: (218.007578, 0.0),
+  219: (219.009252, 0.0),
+  220: (220.012327, 0.0),
+  221: (221.014255, 0.0),
+  222: (222.017552, 0.0),
+  223: (223.0197359, 0.0),
+  224: (224.02325, 0.0),
+  225: (225.02557, 0.0),
+  226: (226.02939, 0.0),
+  227: (227.03184, 0.0),
+  228: (228.03573, 0.0),
+  229: (229.03845, 0.0),
+  230: (230.04251, 0.0),
+  231: (231.04544, 0.0),
+  232: (232.04977, 0.0)},
+ 'Ga': {0: (68.9255736, 1.0),
+  56: (55.99491, 0.0),
+  57: (56.98293, 0.0),
+  58: (57.97425, 0.0),
+  59: (58.96337, 0.0),
+  60: (59.95706, 0.0),
+  61: (60.94945, 0.0),
+  62: (61.944175, 0.0),
+  63: (62.9392942, 0.0),
+  64: (63.9368387, 0.0),
+  65: (64.9327348, 0.0),
+  66: (65.931589, 0.0),
+  67: (66.9282017, 0.0),
+  68: (67.9279801, 0.0),
+  69: (68.9255736, 0.60108),
+  70: (69.926022, 0.0),
+  71: (70.9247013, 0.39892),
+  72: (71.9263663, 0.0),
+  73: (72.9251747, 0.0),
+  74: (73.926946, 0.0),
+  75: (74.9265002, 0.0),
+  76: (75.9288276, 0.0),
+  77: (76.9291543, 0.0),
+  78: (77.9316082, 0.0),
+  79: (78.93289, 0.0),
+  80: (79.93652, 0.0),
+  81: (80.93775, 0.0),
+  82: (81.94299, 0.0),
+  83: (82.94698, 0.0),
+  84: (83.95265, 0.0),
+  85: (84.957, 0.0),
+  86: (85.96312, 0.0)},
+ 'Gd': {0: (157.9241039, 1.0),
+  134: (133.95537, 0.0),
+  135: (134.95257, 0.0),
+  136: (135.94734, 0.0),
+  137: (136.94502, 0.0),
+  138: (137.94012, 0.0),
+  139: (138.93824, 0.0),
+  140: (139.93367, 0.0),
+  141: (140.932126, 0.0),
+  142: (141.92812, 0.0),
+  143: (142.92675, 0.0),
+  144: (143.92296, 0.0),
+  145: (144.921709, 0.0),
+  146: (145.918311, 0.0),
+  147: (146.919094, 0.0),
+  148: (147.918115, 0.0),
+  149: (148.919341, 0.0),
+  150: (149.918659, 0.0),
+  151: (150.920348, 0.0),
+  152: (151.919791, 0.002),
+  153: (152.9217495, 0.0),
+  154: (153.9208656, 0.0218),
+  155: (154.922622, 0.148),
+  156: (155.9221227, 0.2047),
+  157: (156.9239601, 0.1565),
+  158: (157.9241039, 0.2484),
+  159: (158.9263887, 0.0),
+  160: (159.9270541, 0.2186),
+  161: (160.9296692, 0.0),
+  162: (161.930985, 0.0),
+  163: (162.93399, 0.0),
+  164: (163.93586, 0.0),
+  165: (164.93938, 0.0),
+  166: (165.9416, 0.0),
+  167: (166.94557, 0.0),
+  168: (167.94836, 0.0),
+  169: (168.95287, 0.0)},
+ 'Ge': {0: (73.9211778, 1.0),
+  58: (57.99101, 0.0),
+  59: (58.98175, 0.0),
+  60: (59.97019, 0.0),
+  61: (60.96379, 0.0),
+  62: (61.95465, 0.0),
+  63: (62.94964, 0.0),
+  64: (63.94165, 0.0),
+  65: (64.93944, 0.0),
+  66: (65.93384, 0.0),
+  67: (66.932734, 0.0),
+  68: (67.928094, 0.0),
+  69: (68.9279645, 0.0),
+  70: (69.9242474, 0.2038),
+  71: (70.924951, 0.0),
+  72: (71.9220758, 0.2731),
+  73: (72.9234589, 0.0776),
+  74: (73.9211778, 0.3672),
+  75: (74.9228589, 0.0),
+  76: (75.9214026, 0.0783),
+  77: (76.9235486, 0.0),
+  78: (77.922853, 0.0),
+  79: (78.9254, 0.0),
+  80: (79.92537, 0.0),
+  81: (80.92882, 0.0),
+  82: (81.92955, 0.0),
+  83: (82.93462, 0.0),
+  84: (83.93747, 0.0),
+  85: (84.94303, 0.0),
+  86: (85.94649, 0.0),
+  87: (86.95251, 0.0),
+  88: (87.95691, 0.0),
+  89: (88.96383, 0.0)},
+ 'H': {0: (1.00782503207, 1.0),
+  1: (1.00782503207, 0.999885),
+  2: (2.0141017778, 0.000115),
+  3: (3.0160492777, 0.0),
+  4: (4.02781, 0.0),
+  5: (5.03531, 0.0),
+  6: (6.04494, 0.0),
+  7: (7.05275, 0.0)},
+ 'H+': {0: (1.00727646677, 1.0), 1: (1.00727646677, 1.0)},
+ 'He': {0: (4.00260325415, 1.0),
+  3: (3.0160293191, 1.34e-06),
+  4: (4.00260325415, 0.99999866),
+  5: (5.01222, 0.0),
+  6: (6.0188891, 0.0),
+  7: (7.028021, 0.0),
+  8: (8.033922, 0.0),
+  9: (9.04395, 0.0),
+  10: (10.0524, 0.0)},
+ 'Hf': {0: (179.94655, 1.0),
+  153: (152.97069, 0.0),
+  154: (153.96486, 0.0),
+  155: (154.96339, 0.0),
+  156: (155.95936, 0.0),
+  157: (156.9584, 0.0),
+  158: (157.954799, 0.0),
+  159: (158.953995, 0.0),
+  160: (159.950684, 0.0),
+  161: (160.950275, 0.0),
+  162: (161.94721, 0.0),
+  163: (162.94709, 0.0),
+  164: (163.944367, 0.0),
+  165: (164.94457, 0.0),
+  166: (165.94218, 0.0),
+  167: (166.9426, 0.0),
+  168: (167.94057, 0.0),
+  169: (168.94126, 0.0),
+  170: (169.93961, 0.0),
+  171: (170.94049, 0.0),
+  172: (171.939448, 0.0),
+  173: (172.94051, 0.0),
+  174: (173.940046, 0.0016),
+  175: (174.941509, 0.0),
+  176: (175.9414086, 0.0526),
+  177: (176.9432207, 0.186),
+  178: (177.9436988, 0.2728),
+  179: (178.9458161, 0.1362),
+  180: (179.94655, 0.3508),
+  181: (180.9491012, 0.0),
+  182: (181.950554, 0.0),
+  183: (182.95353, 0.0),
+  184: (183.95545, 0.0),
+  185: (184.95882, 0.0),
+  186: (185.96089, 0.0),
+  187: (186.96459, 0.0),
+  188: (187.96685, 0.0)},
+ 'Hg': {0: (201.970643, 1.0),
+  171: (171.00376, 0.0),
+  172: (171.99883, 0.0),
+  173: (172.99724, 0.0),
+  174: (173.992864, 0.0),
+  175: (174.99142, 0.0),
+  176: (175.987355, 0.0),
+  177: (176.98628, 0.0),
+  178: (177.982483, 0.0),
+  179: (178.981834, 0.0),
+  180: (179.978266, 0.0),
+  181: (180.977819, 0.0),
+  182: (181.97469, 0.0),
+  183: (182.97445, 0.0),
+  184: (183.971713, 0.0),
+  185: (184.971899, 0.0),
+  186: (185.969362, 0.0),
+  187: (186.969814, 0.0),
+  188: (187.967577, 0.0),
+  189: (188.96819, 0.0),
+  190: (189.966322, 0.0),
+  191: (190.967157, 0.0),
+  192: (191.965634, 0.0),
+  193: (192.966665, 0.0),
+  194: (193.965439, 0.0),
+  195: (194.96672, 0.0),
+  196: (195.965833, 0.0015),
+  197: (196.967213, 0.0),
+  198: (197.966769, 0.0997),
+  199: (198.9682799, 0.1687),
+  200: (199.968326, 0.231),
+  201: (200.9703023, 0.1318),
+  202: (201.970643, 0.2986),
+  203: (202.9728725, 0.0),
+  204: (203.9734939, 0.0687),
+  205: (204.976073, 0.0),
+  206: (205.977514, 0.0),
+  207: (206.98259, 0.0),
+  208: (207.98594, 0.0),
+  209: (208.99104, 0.0),
+  210: (209.99451, 0.0)},
+ 'Ho': {0: (164.9303221, 1.0),
+  140: (139.96854, 0.0),
+  141: (140.9631, 0.0),
+  142: (141.95977, 0.0),
+  143: (142.95461, 0.0),
+  144: (143.95148, 0.0),
+  145: (144.9472, 0.0),
+  146: (145.94464, 0.0),
+  147: (146.94006, 0.0),
+  148: (147.93772, 0.0),
+  149: (148.933775, 0.0),
+  150: (149.933496, 0.0),
+  151: (150.931688, 0.0),
+  152: (151.931714, 0.0),
+  153: (152.930199, 0.0),
+  154: (153.930602, 0.0),
+  155: (154.929103, 0.0),
+  156: (155.92984, 0.0),
+  157: (156.928256, 0.0),
+  158: (157.928941, 0.0),
+  159: (158.927712, 0.0),
+  160: (159.928729, 0.0),
+  161: (160.927855, 0.0),
+  162: (161.929096, 0.0),
+  163: (162.9287339, 0.0),
+  164: (163.9302335, 0.0),
+  165: (164.9303221, 1.0),
+  166: (165.9322842, 0.0),
+  167: (166.933133, 0.0),
+  168: (167.93552, 0.0),
+  169: (168.936872, 0.0),
+  170: (169.93962, 0.0),
+  171: (170.94147, 0.0),
+  172: (171.94482, 0.0),
+  173: (172.94729, 0.0),
+  174: (173.95115, 0.0),
+  175: (174.95405, 0.0)},
+ 'Hs': {0: (270, 1.0),
+  263: (263.12856, 0.0),
+  264: (264.12839, 0.0),
+  265: (265.13009, 0.0),
+  266: (266.1301, 0.0),
+  267: (267.13179, 0.0),
+  268: (268.13216, 0.0),
+  269: (269.13406, 0.0),
+  270: (270.13465, 0.0),
+  271: (271.13766, 0.0),
+  272: (272.13905, 0.0),
+  273: (273.14199, 0.0),
+  274: (274.14313, 0.0),
+  275: (275.14595, 0.0),
+  276: (276.14721, 0.0),
+  277: (277.14984, 0.0)},
+ 'I': {0: (126.904473, 1.0),
+  108: (107.94348, 0.0),
+  109: (108.93815, 0.0),
+  110: (109.93524, 0.0),
+  111: (110.93028, 0.0),
+  112: (111.92797, 0.0),
+  113: (112.92364, 0.0),
+  114: (113.92185, 0.0),
+  115: (114.91805, 0.0),
+  116: (115.91681, 0.0),
+  117: (116.91365, 0.0),
+  118: (117.913074, 0.0),
+  119: (118.91007, 0.0),
+  120: (119.910048, 0.0),
+  121: (120.907367, 0.0),
+  122: (121.907589, 0.0),
+  123: (122.905589, 0.0),
+  124: (123.9062099, 0.0),
+  125: (124.9046302, 0.0),
+  126: (125.905624, 0.0),
+  127: (126.904473, 1.0),
+  128: (127.905809, 0.0),
+  129: (128.904988, 0.0),
+  130: (129.906674, 0.0),
+  131: (130.9061246, 0.0),
+  132: (131.907997, 0.0),
+  133: (132.907797, 0.0),
+  134: (133.909744, 0.0),
+  135: (134.910048, 0.0),
+  136: (135.91465, 0.0),
+  137: (136.917871, 0.0),
+  138: (137.92235, 0.0),
+  139: (138.9261, 0.0),
+  140: (139.931, 0.0),
+  141: (140.93503, 0.0),
+  142: (141.94018, 0.0),
+  143: (142.94456, 0.0),
+  144: (143.94999, 0.0)},
+ 'In': {0: (114.903878, 1.0),
+  97: (96.94954, 0.0),
+  98: (97.94214, 0.0),
+  99: (98.93422, 0.0),
+  100: (99.93111, 0.0),
+  101: (100.92634, 0.0),
+  102: (101.92409, 0.0),
+  103: (102.919914, 0.0),
+  104: (103.9183, 0.0),
+  105: (104.914674, 0.0),
+  106: (105.913465, 0.0),
+  107: (106.910295, 0.0),
+  108: (107.909698, 0.0),
+  109: (108.907151, 0.0),
+  110: (109.907165, 0.0),
+  111: (110.905103, 0.0),
+  112: (111.905532, 0.0),
+  113: (112.904058, 0.0429),
+  114: (113.904914, 0.0),
+  115: (114.903878, 0.9571),
+  116: (115.90526, 0.0),
+  117: (116.904514, 0.0),
+  118: (117.906354, 0.0),
+  119: (118.905845, 0.0),
+  120: (119.90796, 0.0),
+  121: (120.907846, 0.0),
+  122: (121.91028, 0.0),
+  123: (122.910438, 0.0),
+  124: (123.91318, 0.0),
+  125: (124.9136, 0.0),
+  126: (125.91646, 0.0),
+  127: (126.91735, 0.0),
+  128: (127.92017, 0.0),
+  129: (128.9217, 0.0),
+  130: (129.92497, 0.0),
+  131: (130.92685, 0.0),
+  132: (131.93299, 0.0),
+  133: (132.93781, 0.0),
+  134: (133.94415, 0.0),
+  135: (134.94933, 0.0)},
+ 'Ir': {0: (192.9629264, 1.0),
+  164: (163.9922, 0.0),
+  165: (164.98752, 0.0),
+  166: (165.98582, 0.0),
+  167: (166.981665, 0.0),
+  168: (167.97988, 0.0),
+  169: (168.976295, 0.0),
+  170: (169.97497, 0.0),
+  171: (170.97163, 0.0),
+  172: (171.97046, 0.0),
+  173: (172.967502, 0.0),
+  174: (173.966861, 0.0),
+  175: (174.964113, 0.0),
+  176: (175.963649, 0.0),
+  177: (176.961302, 0.0),
+  178: (177.961082, 0.0),
+  179: (178.959122, 0.0),
+  180: (179.959229, 0.0),
+  181: (180.957625, 0.0),
+  182: (181.958076, 0.0),
+  183: (182.956846, 0.0),
+  184: (183.95748, 0.0),
+  185: (184.9567, 0.0),
+  186: (185.957946, 0.0),
+  187: (186.957363, 0.0),
+  188: (187.958853, 0.0),
+  189: (188.958719, 0.0),
+  190: (189.960546, 0.0),
+  191: (190.960594, 0.373),
+  192: (191.962605, 0.0),
+  193: (192.9629264, 0.627),
+  194: (193.9650784, 0.0),
+  195: (194.9659796, 0.0),
+  196: (195.9684, 0.0),
+  197: (196.969653, 0.0),
+  198: (197.97228, 0.0),
+  199: (198.9738, 0.0)},
+ 'K': {0: (38.96370668, 1.0),
+  32: (32.02192, 0.0),
+  33: (33.00726, 0.0),
+  34: (33.99841, 0.0),
+  35: (34.98801, 0.0),
+  36: (35.981292, 0.0),
+  37: (36.97337589, 0.0),
+  38: (37.9690812, 0.0),
+  39: (38.96370668, 0.932581),
+  40: (39.96399848, 0.000117),
+  41: (40.96182576, 0.067302),
+  42: (41.96240281, 0.0),
+  43: (42.960716, 0.0),
+  44: (43.96156, 0.0),
+  45: (44.960699, 0.0),
+  46: (45.961977, 0.0),
+  47: (46.961678, 0.0),
+  48: (47.965514, 0.0),
+  49: (48.96745, 0.0),
+  50: (49.97278, 0.0),
+  51: (50.97638, 0.0),
+  52: (51.98261, 0.0),
+  53: (52.98712, 0.0),
+  54: (53.9942, 0.0),
+  55: (54.99971, 0.0)},
+ 'Kr': {0: (83.911507, 1.0),
+  69: (68.96518, 0.0),
+  70: (69.95526, 0.0),
+  71: (70.94963, 0.0),
+  72: (71.942092, 0.0),
+  73: (72.939289, 0.0),
+  74: (73.9330844, 0.0),
+  75: (74.930946, 0.0),
+  76: (75.92591, 0.0),
+  77: (76.92467, 0.0),
+  78: (77.9203648, 0.00355),
+  79: (78.920082, 0.0),
+  80: (79.916379, 0.02286),
+  81: (80.916592, 0.0),
+  82: (81.9134836, 0.11593),
+  83: (82.914136, 0.115),
+  84: (83.911507, 0.56987),
+  85: (84.9125273, 0.0),
+  86: (85.91061073, 0.17279),
+  87: (86.91335486, 0.0),
+  88: (87.914447, 0.0),
+  89: (88.91763, 0.0),
+  90: (89.919517, 0.0),
+  91: (90.92345, 0.0),
+  92: (91.926156, 0.0),
+  93: (92.93127, 0.0),
+  94: (93.93436, 0.0),
+  95: (94.93984, 0.0),
+  96: (95.94307, 0.0),
+  97: (96.94856, 0.0),
+  98: (97.95191, 0.0),
+  99: (98.9576, 0.0),
+  100: (99.96114, 0.0)},
+ 'La': {0: (138.9063533, 1.0),
+  117: (116.95007, 0.0),
+  118: (117.94673, 0.0),
+  119: (118.94099, 0.0),
+  120: (119.93807, 0.0),
+  121: (120.93301, 0.0),
+  122: (121.93071, 0.0),
+  123: (122.92624, 0.0),
+  124: (123.92457, 0.0),
+  125: (124.920816, 0.0),
+  126: (125.91951, 0.0),
+  127: (126.916375, 0.0),
+  128: (127.91559, 0.0),
+  129: (128.912693, 0.0),
+  130: (129.912369, 0.0),
+  131: (130.91007, 0.0),
+  132: (131.9101, 0.0),
+  133: (132.90822, 0.0),
+  134: (133.908514, 0.0),
+  135: (134.906977, 0.0),
+  136: (135.90764, 0.0),
+  137: (136.906494, 0.0),
+  138: (137.907112, 0.0009),
+  139: (138.9063533, 0.9991),
+  140: (139.9094776, 0.0),
+  141: (140.910962, 0.0),
+  142: (141.914079, 0.0),
+  143: (142.916063, 0.0),
+  144: (143.9196, 0.0),
+  145: (144.92165, 0.0),
+  146: (145.92579, 0.0),
+  147: (146.92824, 0.0),
+  148: (147.93223, 0.0),
+  149: (148.93473, 0.0),
+  150: (149.93877, 0.0),
+  151: (150.94172, 0.0),
+  152: (151.94625, 0.0),
+  153: (152.94962, 0.0),
+  154: (153.9545, 0.0),
+  155: (154.95835, 0.0)},
+ 'Li': {0: (7.01600455, 1.0),
+  3: (3.03078, 0.0),
+  4: (4.02719, 0.0),
+  5: (5.01254, 0.0),
+  6: (6.015122795, 0.0759),
+  7: (7.01600455, 0.9241),
+  8: (8.02248736, 0.0),
+  9: (9.0267895, 0.0),
+  10: (10.035481, 0.0),
+  11: (11.043798, 0.0),
+  12: (12.05378, 0.0)},
+ 'Lr': {0: (262, 1.0),
+  251: (251.09436, 0.0),
+  252: (252.09537, 0.0),
+  253: (253.09521, 0.0),
+  254: (254.09645, 0.0),
+  255: (255.09668, 0.0),
+  256: (256.09863, 0.0),
+  257: (257.09956, 0.0),
+  258: (258.10181, 0.0),
+  259: (259.1029, 0.0),
+  260: (260.1055, 0.0),
+  261: (261.10688, 0.0),
+  262: (262.10963, 0.0),
+  263: (263.11129, 0.0),
+  264: (264.11404, 0.0),
+  265: (265.11584, 0.0),
+  266: (266.11931, 0.0)},
+ 'Lu': {0: (174.9407718, 1.0),
+  150: (149.97323, 0.0),
+  151: (150.96758, 0.0),
+  152: (151.96412, 0.0),
+  153: (152.95877, 0.0),
+  154: (153.95752, 0.0),
+  155: (154.954316, 0.0),
+  156: (155.95303, 0.0),
+  157: (156.950098, 0.0),
+  158: (157.949313, 0.0),
+  159: (158.94663, 0.0),
+  160: (159.94603, 0.0),
+  161: (160.94357, 0.0),
+  162: (161.94328, 0.0),
+  163: (162.94118, 0.0),
+  164: (163.94134, 0.0),
+  165: (164.939407, 0.0),
+  166: (165.93986, 0.0),
+  167: (166.93827, 0.0),
+  168: (167.93874, 0.0),
+  169: (168.937651, 0.0),
+  170: (169.938475, 0.0),
+  171: (170.9379131, 0.0),
+  172: (171.939086, 0.0),
+  173: (172.9389306, 0.0),
+  174: (173.9403375, 0.0),
+  175: (174.9407718, 0.9741),
+  176: (175.9426863, 0.0259),
+  177: (176.9437581, 0.0),
+  178: (177.945955, 0.0),
+  179: (178.947327, 0.0),
+  180: (179.94988, 0.0),
+  181: (180.95197, 0.0),
+  182: (181.95504, 0.0),
+  183: (182.95757, 0.0),
+  184: (183.96091, 0.0)},
+ 'Md': {0: (258, 1.0),
+  245: (245.08083, 0.0),
+  246: (246.08189, 0.0),
+  247: (247.08164, 0.0),
+  248: (248.08282, 0.0),
+  249: (249.08301, 0.0),
+  250: (250.08442, 0.0),
+  251: (251.08484, 0.0),
+  252: (252.08656, 0.0),
+  253: (253.08728, 0.0),
+  254: (254.08966, 0.0),
+  255: (255.091083, 0.0),
+  256: (256.09406, 0.0),
+  257: (257.095541, 0.0),
+  258: (258.098431, 0.0),
+  259: (259.10051, 0.0),
+  260: (260.10365, 0.0),
+  261: (261.10572, 0.0),
+  262: (262.10887, 0.0)},
+ 'Mg': {0: (23.9850417, 1.0),
+  19: (19.03547, 0.0),
+  20: (20.018863, 0.0),
+  21: (21.011713, 0.0),
+  22: (21.9995738, 0.0),
+  23: (22.9941237, 0.0),
+  24: (23.9850417, 0.7899),
+  25: (24.98583692, 0.1),
+  26: (25.982592929, 0.1101),
+  27: (26.98434059, 0.0),
+  28: (27.9838768, 0.0),
+  29: (28.9886, 0.0),
+  30: (29.990434, 0.0),
+  31: (30.996546, 0.0),
+  32: (31.998975, 0.0),
+  33: (33.005254, 0.0),
+  34: (34.00946, 0.0),
+  35: (35.01734, 0.0),
+  36: (36.023, 0.0),
+  37: (37.0314, 0.0),
+  38: (38.03757, 0.0),
+  39: (39.04677, 0.0),
+  40: (40.05393, 0.0)},
+ 'Mn': {0: (54.9380451, 1.0),
+  44: (44.00687, 0.0),
+  45: (44.99451, 0.0),
+  46: (45.98672, 0.0),
+  47: (46.9761, 0.0),
+  48: (47.96852, 0.0),
+  49: (48.959618, 0.0),
+  50: (49.9542382, 0.0),
+  51: (50.9482108, 0.0),
+  52: (51.9455655, 0.0),
+  53: (52.9412901, 0.0),
+  54: (53.9403589, 0.0),
+  55: (54.9380451, 1.0),
+  56: (55.9389049, 0.0),
+  57: (56.9382854, 0.0),
+  58: (57.93998, 0.0),
+  59: (58.94044, 0.0),
+  60: (59.94291, 0.0),
+  61: (60.94465, 0.0),
+  62: (61.94843, 0.0),
+  63: (62.95024, 0.0),
+  64: (63.95425, 0.0),
+  65: (64.95634, 0.0),
+  66: (65.96108, 0.0),
+  67: (66.96414, 0.0),
+  68: (67.9693, 0.0),
+  69: (68.97284, 0.0)},
+ 'Mo': {0: (97.9054082, 1.0),
+  83: (82.94874, 0.0),
+  84: (83.94009, 0.0),
+  85: (84.93655, 0.0),
+  86: (85.9307, 0.0),
+  87: (86.92733, 0.0),
+  88: (87.921953, 0.0),
+  89: (88.91948, 0.0),
+  90: (89.913937, 0.0),
+  91: (90.91175, 0.0),
+  92: (91.906811, 0.1477),
+  93: (92.906813, 0.0),
+  94: (93.9050883, 0.0923),
+  95: (94.9058421, 0.159),
+  96: (95.9046795, 0.1668),
+  97: (96.9060215, 0.0956),
+  98: (97.9054082, 0.2419),
+  99: (98.9077119, 0.0),
+  100: (99.907477, 0.0967),
+  101: (100.910347, 0.0),
+  102: (101.910297, 0.0),
+  103: (102.91321, 0.0),
+  104: (103.91376, 0.0),
+  105: (104.91697, 0.0),
+  106: (105.918137, 0.0),
+  107: (106.92169, 0.0),
+  108: (107.92345, 0.0),
+  109: (108.92781, 0.0),
+  110: (109.92973, 0.0),
+  111: (110.93441, 0.0),
+  112: (111.93684, 0.0),
+  113: (112.94188, 0.0),
+  114: (113.94492, 0.0),
+  115: (114.95029, 0.0)},
+ 'Mt': {0: (276, 1.0),
+  265: (265.13615, 0.0),
+  266: (266.1373, 0.0),
+  267: (267.13731, 0.0),
+  268: (268.13873, 0.0),
+  269: (269.13906, 0.0),
+  270: (270.14066, 0.0),
+  271: (271.14114, 0.0),
+  272: (272.14374, 0.0),
+  273: (273.14491, 0.0),
+  274: (274.14749, 0.0),
+  275: (275.14865, 0.0),
+  276: (276.15116, 0.0),
+  277: (277.15242, 0.0),
+  278: (278.15481, 0.0),
+  279: (279.15619, 0.0)},
+ 'N': {0: (14.0030740048, 1.0),
+  10: (10.04165, 0.0),
+  11: (11.02609, 0.0),
+  12: (12.0186132, 0.0),
+  13: (13.00573861, 0.0),
+  14: (14.0030740048, 0.99636),
+  15: (15.0001088982, 0.00364),
+  16: (16.0061017, 0.0),
+  17: (17.00845, 0.0),
+  18: (18.014079, 0.0),
+  19: (19.017029, 0.0),
+  20: (20.02337, 0.0),
+  21: (21.02711, 0.0),
+  22: (22.03439, 0.0),
+  23: (23.04122, 0.0),
+  24: (24.05104, 0.0),
+  25: (25.06066, 0.0)},
+ 'Na': {0: (22.9897692809, 1.0),
+  18: (18.02597, 0.0),
+  19: (19.013877, 0.0),
+  20: (20.007351, 0.0),
+  21: (20.9976552, 0.0),
+  22: (21.9944364, 0.0),
+  23: (22.9897692809, 1.0),
+  24: (23.99096278, 0.0),
+  25: (24.989954, 0.0),
+  26: (25.992633, 0.0),
+  27: (26.994077, 0.0),
+  28: (27.998938, 0.0),
+  29: (29.002861, 0.0),
+  30: (30.008976, 0.0),
+  31: (31.01359, 0.0),
+  32: (32.02047, 0.0),
+  33: (33.02672, 0.0),
+  34: (34.03517, 0.0),
+  35: (35.04249, 0.0),
+  36: (36.05148, 0.0),
+  37: (37.05934, 0.0)},
+ 'Nb': {0: (92.9063781, 1.0),
+  81: (80.94903, 0.0),
+  82: (81.94313, 0.0),
+  83: (82.93671, 0.0),
+  84: (83.93357, 0.0),
+  85: (84.92791, 0.0),
+  86: (85.92504, 0.0),
+  87: (86.92036, 0.0),
+  88: (87.91833, 0.0),
+  89: (88.913418, 0.0),
+  90: (89.911265, 0.0),
+  91: (90.906996, 0.0),
+  92: (91.907194, 0.0),
+  93: (92.9063781, 1.0),
+  94: (93.9072839, 0.0),
+  95: (94.9068358, 0.0),
+  96: (95.908101, 0.0),
+  97: (96.9080986, 0.0),
+  98: (97.910328, 0.0),
+  99: (98.911618, 0.0),
+  100: (99.914182, 0.0),
+  101: (100.915252, 0.0),
+  102: (101.91804, 0.0),
+  103: (102.91914, 0.0),
+  104: (103.92246, 0.0),
+  105: (104.92394, 0.0),
+  106: (105.92797, 0.0),
+  107: (106.93031, 0.0),
+  108: (107.93484, 0.0),
+  109: (108.93763, 0.0),
+  110: (109.94244, 0.0),
+  111: (110.94565, 0.0),
+  112: (111.95083, 0.0),
+  113: (112.9547, 0.0)},
+ 'Nd': {0: (141.9077233, 1.0),
+  124: (123.95223, 0.0),
+  125: (124.94888, 0.0),
+  126: (125.94322, 0.0),
+  127: (126.9405, 0.0),
+  128: (127.93539, 0.0),
+  129: (128.93319, 0.0),
+  130: (129.92851, 0.0),
+  131: (130.92725, 0.0),
+  132: (131.923321, 0.0),
+  133: (132.92235, 0.0),
+  134: (133.91879, 0.0),
+  135: (134.918181, 0.0),
+  136: (135.914976, 0.0),
+  137: (136.914567, 0.0),
+  138: (137.91195, 0.0),
+  139: (138.911978, 0.0),
+  140: (139.90955, 0.0),
+  141: (140.90961, 0.0),
+  142: (141.9077233, 0.272),
+  143: (142.9098143, 0.122),
+  144: (143.9100873, 0.238),
+  145: (144.9125736, 0.083),
+  146: (145.9131169, 0.172),
+  147: (146.9161004, 0.0),
+  148: (147.916893, 0.057),
+  149: (148.920149, 0.0),
+  150: (149.920891, 0.056),
+  151: (150.923829, 0.0),
+  152: (151.924682, 0.0),
+  153: (152.927698, 0.0),
+  154: (153.92948, 0.0),
+  155: (154.93293, 0.0),
+  156: (155.93502, 0.0),
+  157: (156.93903, 0.0),
+  158: (157.9416, 0.0),
+  159: (158.94609, 0.0),
+  160: (159.94909, 0.0),
+  161: (160.95388, 0.0)},
+ 'Ne': {0: (19.9924401754, 1.0),
+  16: (16.025761, 0.0),
+  17: (17.017672, 0.0),
+  18: (18.0057082, 0.0),
+  19: (19.0018802, 0.0),
+  20: (19.9924401754, 0.9048),
+  21: (20.99384668, 0.0027),
+  22: (21.991385114, 0.0925),
+  23: (22.9944669, 0.0),
+  24: (23.9936108, 0.0),
+  25: (24.997737, 0.0),
+  26: (26.000461, 0.0),
+  27: (27.00759, 0.0),
+  28: (28.01207, 0.0),
+  29: (29.01939, 0.0),
+  30: (30.0248, 0.0),
+  31: (31.03311, 0.0),
+  32: (32.04002, 0.0),
+  33: (33.04938, 0.0),
+  34: (34.05703, 0.0)},
+ 'Ni': {0: (57.9353429, 1.0),
+  48: (48.01975, 0.0),
+  49: (49.00966, 0.0),
+  50: (49.99593, 0.0),
+  51: (50.98772, 0.0),
+  52: (51.97568, 0.0),
+  53: (52.96847, 0.0),
+  54: (53.95791, 0.0),
+  55: (54.95133, 0.0),
+  56: (55.942132, 0.0),
+  57: (56.9397935, 0.0),
+  58: (57.9353429, 0.680769),
+  59: (58.9343467, 0.0),
+  60: (59.9307864, 0.262231),
+  61: (60.931056, 0.011399),
+  62: (61.9283451, 0.036345),
+  63: (62.9296694, 0.0),
+  64: (63.927966, 0.009256),
+  65: (64.9300843, 0.0),
+  66: (65.9291393, 0.0),
+  67: (66.931569, 0.0),
+  68: (67.931869, 0.0),
+  69: (68.93561, 0.0),
+  70: (69.9365, 0.0),
+  71: (70.94074, 0.0),
+  72: (71.94209, 0.0),
+  73: (72.94647, 0.0),
+  74: (73.94807, 0.0),
+  75: (74.95287, 0.0),
+  76: (75.95533, 0.0),
+  77: (76.96055, 0.0),
+  78: (77.96318, 0.0)},
+ 'No': {0: (259, 1.0),
+  248: (248.0866, 0.0),
+  249: (249.08783, 0.0),
+  250: (250.08751, 0.0),
+  251: (251.08901, 0.0),
+  252: (252.088977, 0.0),
+  253: (253.09068, 0.0),
+  254: (254.090955, 0.0),
+  255: (255.093241, 0.0),
+  256: (256.094283, 0.0),
+  257: (257.096877, 0.0),
+  258: (258.09821, 0.0),
+  259: (259.10103, 0.0),
+  260: (260.10264, 0.0),
+  261: (261.10575, 0.0),
+  262: (262.1073, 0.0),
+  263: (263.11055, 0.0),
+  264: (264.11235, 0.0)},
+ 'Np': {0: (237, 1.0),
+  225: (225.03391, 0.0),
+  226: (226.03515, 0.0),
+  227: (227.03496, 0.0),
+  228: (228.03618, 0.0),
+  229: (229.03626, 0.0),
+  230: (230.03783, 0.0),
+  231: (231.03825, 0.0),
+  232: (232.04011, 0.0),
+  233: (233.04074, 0.0),
+  234: (234.042895, 0.0),
+  235: (235.0440633, 0.0),
+  236: (236.04657, 0.0),
+  237: (237.0481734, 0.0),
+  238: (238.0509464, 0.0),
+  239: (239.052939, 0.0),
+  240: (240.056162, 0.0),
+  241: (241.05825, 0.0),
+  242: (242.06164, 0.0),
+  243: (243.06428, 0.0),
+  244: (244.06785, 0.0)},
+ 'O': {0: (15.99491461956, 1.0),
+  12: (12.034405, 0.0),
+  13: (13.024812, 0.0),
+  14: (14.00859625, 0.0),
+  15: (15.0030656, 0.0),
+  16: (15.99491461956, 0.99757),
+  17: (16.9991317, 0.00038),
+  18: (17.999161, 0.00205),
+  19: (19.00358, 0.0),
+  20: (20.0040767, 0.0),
+  21: (21.008656, 0.0),
+  22: (22.00997, 0.0),
+  23: (23.01569, 0.0),
+  24: (24.02047, 0.0),
+  25: (25.02946, 0.0),
+  26: (26.03834, 0.0),
+  27: (27.04826, 0.0),
+  28: (28.05781, 0.0)},
+ 'Os': {0: (191.9614807, 1.0),
+  162: (161.98443, 0.0),
+  163: (162.98269, 0.0),
+  164: (163.97804, 0.0),
+  165: (164.97676, 0.0),
+  166: (165.972691, 0.0),
+  167: (166.97155, 0.0),
+  168: (167.967804, 0.0),
+  169: (168.967019, 0.0),
+  170: (169.963577, 0.0),
+  171: (170.963185, 0.0),
+  172: (171.960023, 0.0),
+  173: (172.959808, 0.0),
+  174: (173.957062, 0.0),
+  175: (174.956946, 0.0),
+  176: (175.95481, 0.0),
+  177: (176.954965, 0.0),
+  178: (177.953251, 0.0),
+  179: (178.953816, 0.0),
+  180: (179.952379, 0.0),
+  181: (180.95324, 0.0),
+  182: (181.95211, 0.0),
+  183: (182.95313, 0.0),
+  184: (183.9524891, 0.0002),
+  185: (184.9540423, 0.0),
+  186: (185.9538382, 0.0159),
+  187: (186.9557505, 0.0196),
+  188: (187.9558382, 0.1324),
+  189: (188.9581475, 0.1615),
+  190: (189.958447, 0.2626),
+  191: (190.9609297, 0.0),
+  192: (191.9614807, 0.4078),
+  193: (192.9641516, 0.0),
+  194: (193.9651821, 0.0),
+  195: (194.96813, 0.0),
+  196: (195.96964, 0.0)},
+ 'P': {0: (30.97376163, 1.0),
+  24: (24.03435, 0.0),
+  25: (25.02026, 0.0),
+  26: (26.01178, 0.0),
+  27: (26.99923, 0.0),
+  28: (27.992315, 0.0),
+  29: (28.9818006, 0.0),
+  30: (29.9783138, 0.0),
+  31: (30.97376163, 1.0),
+  32: (31.97390727, 0.0),
+  33: (32.9717255, 0.0),
+  34: (33.973636, 0.0),
+  35: (34.9733141, 0.0),
+  36: (35.97826, 0.0),
+  37: (36.97961, 0.0),
+  38: (37.98416, 0.0),
+  39: (38.98618, 0.0),
+  40: (39.9913, 0.0),
+  41: (40.99434, 0.0),
+  42: (42.00101, 0.0),
+  43: (43.00619, 0.0),
+  44: (44.01299, 0.0),
+  45: (45.01922, 0.0),
+  46: (46.02738, 0.0)},
+ 'Pa': {0: (231.035884, 1.0),
+  212: (212.0232, 0.0),
+  213: (213.02111, 0.0),
+  214: (214.02092, 0.0),
+  215: (215.01919, 0.0),
+  216: (216.01911, 0.0),
+  217: (217.01832, 0.0),
+  218: (218.020042, 0.0),
+  219: (219.01988, 0.0),
+  220: (220.02188, 0.0),
+  221: (221.02188, 0.0),
+  222: (222.02374, 0.0),
+  223: (223.02396, 0.0),
+  224: (224.025626, 0.0),
+  225: (225.02613, 0.0),
+  226: (226.027948, 0.0),
+  227: (227.028805, 0.0),
+  228: (228.031051, 0.0),
+  229: (229.0320968, 0.0),
+  230: (230.034541, 0.0),
+  231: (231.035884, 1.0),
+  232: (232.038592, 0.0),
+  233: (233.0402473, 0.0),
+  234: (234.043308, 0.0),
+  235: (235.04544, 0.0),
+  236: (236.04868, 0.0),
+  237: (237.05115, 0.0),
+  238: (238.0545, 0.0),
+  239: (239.05726, 0.0),
+  240: (240.06098, 0.0)},
+ 'Pb': {0: (207.9766521, 1.0),
+  178: (178.00383, 0.0),
+  179: (179.00215, 0.0),
+  180: (179.997918, 0.0),
+  181: (180.99662, 0.0),
+  182: (181.992672, 0.0),
+  183: (182.99187, 0.0),
+  184: (183.988142, 0.0),
+  185: (184.98761, 0.0),
+  186: (185.984239, 0.0),
+  187: (186.983918, 0.0),
+  188: (187.980874, 0.0),
+  189: (188.98081, 0.0),
+  190: (189.978082, 0.0),
+  191: (190.97827, 0.0),
+  192: (191.975785, 0.0),
+  193: (192.97617, 0.0),
+  194: (193.974012, 0.0),
+  195: (194.974542, 0.0),
+  196: (195.972774, 0.0),
+  197: (196.973431, 0.0),
+  198: (197.972034, 0.0),
+  199: (198.972917, 0.0),
+  200: (199.971827, 0.0),
+  201: (200.972885, 0.0),
+  202: (201.972159, 0.0),
+  203: (202.973391, 0.0),
+  204: (203.9730436, 0.014),
+  205: (204.9744818, 0.0),
+  206: (205.9744653, 0.241),
+  207: (206.9758969, 0.221),
+  208: (207.9766521, 0.524),
+  209: (208.9810901, 0.0),
+  210: (209.9841885, 0.0),
+  211: (210.988737, 0.0),
+  212: (211.9918975, 0.0),
+  213: (212.996581, 0.0),
+  214: (213.9998054, 0.0),
+  215: (215.00481, 0.0)},
+ 'Pd': {0: (105.903486, 1.0),
+  91: (90.94911, 0.0),
+  92: (91.94042, 0.0),
+  93: (92.93591, 0.0),
+  94: (93.92877, 0.0),
+  95: (94.92469, 0.0),
+  96: (95.91816, 0.0),
+  97: (96.91648, 0.0),
+  98: (97.912721, 0.0),
+  99: (98.911768, 0.0),
+  100: (99.908506, 0.0),
+  101: (100.908289, 0.0),
+  102: (101.905609, 0.0102),
+  103: (102.906087, 0.0),
+  104: (103.904036, 0.1114),
+  105: (104.905085, 0.2233),
+  106: (105.903486, 0.2733),
+  107: (106.905133, 0.0),
+  108: (107.903892, 0.2646),
+  109: (108.90595, 0.0),
+  110: (109.905153, 0.1172),
+  111: (110.907671, 0.0),
+  112: (111.907314, 0.0),
+  113: (112.91015, 0.0),
+  114: (113.910363, 0.0),
+  115: (114.91368, 0.0),
+  116: (115.91416, 0.0),
+  117: (116.91784, 0.0),
+  118: (117.91898, 0.0),
+  119: (118.92311, 0.0),
+  120: (119.92469, 0.0),
+  121: (120.92887, 0.0),
+  122: (121.93055, 0.0),
+  123: (122.93493, 0.0),
+  124: (123.93688, 0.0)},
+ 'Pm': {0: (145, 1.0),
+  126: (125.95752, 0.0),
+  127: (126.95163, 0.0),
+  128: (127.94842, 0.0),
+  129: (128.94316, 0.0),
+  130: (129.94045, 0.0),
+  131: (130.93587, 0.0),
+  132: (131.93375, 0.0),
+  133: (132.92978, 0.0),
+  134: (133.92835, 0.0),
+  135: (134.92488, 0.0),
+  136: (135.92357, 0.0),
+  137: (136.920479, 0.0),
+  138: (137.919548, 0.0),
+  139: (138.916804, 0.0),
+  140: (139.91604, 0.0),
+  141: (140.913555, 0.0),
+  142: (141.912874, 0.0),
+  143: (142.910933, 0.0),
+  144: (143.912591, 0.0),
+  145: (144.912749, 0.0),
+  146: (145.914696, 0.0),
+  147: (146.9151385, 0.0),
+  148: (147.917475, 0.0),
+  149: (148.918334, 0.0),
+  150: (149.920984, 0.0),
+  151: (150.921207, 0.0),
+  152: (151.923497, 0.0),
+  153: (152.924117, 0.0),
+  154: (153.92646, 0.0),
+  155: (154.9281, 0.0),
+  156: (155.93106, 0.0),
+  157: (156.93304, 0.0),
+  158: (157.93656, 0.0),
+  159: (158.93897, 0.0),
+  160: (159.94299, 0.0),
+  161: (160.94586, 0.0),
+  162: (161.95029, 0.0),
+  163: (162.95368, 0.0)},
+ 'Po': {0: (209, 1.0),
+  188: (187.999422, 0.0),
+  189: (188.998481, 0.0),
+  190: (189.995101, 0.0),
+  191: (190.994574, 0.0),
+  192: (191.991335, 0.0),
+  193: (192.99103, 0.0),
+  194: (193.988186, 0.0),
+  195: (194.98811, 0.0),
+  196: (195.985535, 0.0),
+  197: (196.98566, 0.0),
+  198: (197.983389, 0.0),
+  199: (198.983666, 0.0),
+  200: (199.981799, 0.0),
+  201: (200.98226, 0.0),
+  202: (201.980758, 0.0),
+  203: (202.98142, 0.0),
+  204: (203.980318, 0.0),
+  205: (204.981203, 0.0),
+  206: (205.980481, 0.0),
+  207: (206.981593, 0.0),
+  208: (207.9812457, 0.0),
+  209: (208.9824304, 0.0),
+  210: (209.9828737, 0.0),
+  211: (210.9866532, 0.0),
+  212: (211.988868, 0.0),
+  213: (212.992857, 0.0),
+  214: (213.9952014, 0.0),
+  215: (214.99942, 0.0),
+  216: (216.001915, 0.0),
+  217: (217.006335, 0.0),
+  218: (218.008973, 0.0),
+  219: (219.01374, 0.0),
+  220: (220.0166, 0.0)},
+ 'Pr': {0: (140.9076528, 1.0),
+  121: (120.95536, 0.0),
+  122: (121.95181, 0.0),
+  123: (122.94596, 0.0),
+  124: (123.94296, 0.0),
+  125: (124.93783, 0.0),
+  126: (125.93531, 0.0),
+  127: (126.93083, 0.0),
+  128: (127.92879, 0.0),
+  129: (128.9251, 0.0),
+  130: (129.92359, 0.0),
+  131: (130.92026, 0.0),
+  132: (131.91926, 0.0),
+  133: (132.916331, 0.0),
+  134: (133.91571, 0.0),
+  135: (134.913112, 0.0),
+  136: (135.912692, 0.0),
+  137: (136.910705, 0.0),
+  138: (137.910755, 0.0),
+  139: (138.908938, 0.0),
+  140: (139.909076, 0.0),
+  141: (140.9076528, 1.0),
+  142: (141.9100448, 0.0),
+  143: (142.9108169, 0.0),
+  144: (143.913305, 0.0),
+  145: (144.914512, 0.0),
+  146: (145.91764, 0.0),
+  147: (146.918996, 0.0),
+  148: (147.922135, 0.0),
+  149: (148.92372, 0.0),
+  150: (149.926673, 0.0),
+  151: (150.928319, 0.0),
+  152: (151.9315, 0.0),
+  153: (152.93384, 0.0),
+  154: (153.93752, 0.0),
+  155: (154.94012, 0.0),
+  156: (155.94427, 0.0),
+  157: (156.94743, 0.0),
+  158: (157.95198, 0.0),
+  159: (158.9555, 0.0)},
+ 'Pt': {0: (194.9647911, 1.0),
+  166: (165.99486, 0.0),
+  167: (166.99298, 0.0),
+  168: (167.98815, 0.0),
+  169: (168.98672, 0.0),
+  170: (169.982495, 0.0),
+  171: (170.98124, 0.0),
+  172: (171.977347, 0.0),
+  173: (172.97644, 0.0),
+  174: (173.972819, 0.0),
+  175: (174.972421, 0.0),
+  176: (175.968945, 0.0),
+  177: (176.968469, 0.0),
+  178: (177.965649, 0.0),
+  179: (178.965363, 0.0),
+  180: (179.963031, 0.0),
+  181: (180.963097, 0.0),
+  182: (181.961171, 0.0),
+  183: (182.961597, 0.0),
+  184: (183.959922, 0.0),
+  185: (184.96062, 0.0),
+  186: (185.959351, 0.0),
+  187: (186.96059, 0.0),
+  188: (187.959395, 0.0),
+  189: (188.960834, 0.0),
+  190: (189.959932, 0.00014),
+  191: (190.961677, 0.0),
+  192: (191.961038, 0.00782),
+  193: (192.9629874, 0.0),
+  194: (193.9626803, 0.32967),
+  195: (194.9647911, 0.33832),
+  196: (195.9649515, 0.25242),
+  197: (196.9673402, 0.0),
+  198: (197.967893, 0.07163),
+  199: (198.970593, 0.0),
+  200: (199.971441, 0.0),
+  201: (200.97451, 0.0),
+  202: (201.97574, 0.0)},
+ 'Pu': {0: (244, 1.0),
+  228: (228.03874, 0.0),
+  229: (229.04015, 0.0),
+  230: (230.03965, 0.0),
+  231: (231.041101, 0.0),
+  232: (232.041187, 0.0),
+  233: (233.043, 0.0),
+  234: (234.043317, 0.0),
+  235: (235.045286, 0.0),
+  236: (236.046058, 0.0),
+  237: (237.0484097, 0.0),
+  238: (238.0495599, 0.0),
+  239: (239.0521634, 0.0),
+  240: (240.0538135, 0.0),
+  241: (241.0568515, 0.0),
+  242: (242.0587426, 0.0),
+  243: (243.062003, 0.0),
+  244: (244.064204, 0.0),
+  245: (245.067747, 0.0),
+  246: (246.070205, 0.0),
+  247: (247.07407, 0.0)},
+ 'Ra': {0: (226, 1.0),
+  202: (202.00989, 0.0),
+  203: (203.00927, 0.0),
+  204: (204.0065, 0.0),
+  205: (205.00627, 0.0),
+  206: (206.003827, 0.0),
+  207: (207.0038, 0.0),
+  208: (208.00184, 0.0),
+  209: (209.00199, 0.0),
+  210: (210.000495, 0.0),
+  211: (211.000898, 0.0),
+  212: (211.999794, 0.0),
+  213: (213.000384, 0.0),
+  214: (214.000108, 0.0),
+  215: (215.00272, 0.0),
+  216: (216.003533, 0.0),
+  217: (217.00632, 0.0),
+  218: (218.00714, 0.0),
+  219: (219.010085, 0.0),
+  220: (220.011028, 0.0),
+  221: (221.013917, 0.0),
+  222: (222.015375, 0.0),
+  223: (223.0185022, 0.0),
+  224: (224.0202118, 0.0),
+  225: (225.023612, 0.0),
+  226: (226.0254098, 0.0),
+  227: (227.0291778, 0.0),
+  228: (228.0310703, 0.0),
+  229: (229.034958, 0.0),
+  230: (230.037056, 0.0),
+  231: (231.04122, 0.0),
+  232: (232.04364, 0.0),
+  233: (233.04806, 0.0),
+  234: (234.0507, 0.0)},
+ 'Rb': {0: (84.911789738, 1.0),
+  71: (70.96532, 0.0),
+  72: (71.95908, 0.0),
+  73: (72.95056, 0.0),
+  74: (73.944265, 0.0),
+  75: (74.93857, 0.0),
+  76: (75.9350722, 0.0),
+  77: (76.930408, 0.0),
+  78: (77.928141, 0.0),
+  79: (78.923989, 0.0),
+  80: (79.922519, 0.0),
+  81: (80.918996, 0.0),
+  82: (81.9182086, 0.0),
+  83: (82.91511, 0.0),
+  84: (83.914385, 0.0),
+  85: (84.911789738, 0.7217),
+  86: (85.91116742, 0.0),
+  87: (86.909180527, 0.2783),
+  88: (87.91131559, 0.0),
+  89: (88.912278, 0.0),
+  90: (89.914802, 0.0),
+  91: (90.916537, 0.0),
+  92: (91.919729, 0.0),
+  93: (92.922042, 0.0),
+  94: (93.926405, 0.0),
+  95: (94.929303, 0.0),
+  96: (95.93427, 0.0),
+  97: (96.93735, 0.0),
+  98: (97.94179, 0.0),
+  99: (98.94538, 0.0),
+  100: (99.94987, 0.0),
+  101: (100.9532, 0.0),
+  102: (101.95887, 0.0)},
+ 'Re': {0: (186.9557531, 1.0),
+  160: (159.98212, 0.0),
+  161: (160.97759, 0.0),
+  162: (161.976, 0.0),
+  163: (162.972081, 0.0),
+  164: (163.97032, 0.0),
+  165: (164.967089, 0.0),
+  166: (165.96581, 0.0),
+  167: (166.9626, 0.0),
+  168: (167.96157, 0.0),
+  169: (168.95879, 0.0),
+  170: (169.95822, 0.0),
+  171: (170.95572, 0.0),
+  172: (171.95542, 0.0),
+  173: (172.95324, 0.0),
+  174: (173.95312, 0.0),
+  175: (174.95138, 0.0),
+  176: (175.95162, 0.0),
+  177: (176.95033, 0.0),
+  178: (177.95099, 0.0),
+  179: (178.949988, 0.0),
+  180: (179.950789, 0.0),
+  181: (180.950068, 0.0),
+  182: (181.95121, 0.0),
+  183: (182.95082, 0.0),
+  184: (183.952521, 0.0),
+  185: (184.952955, 0.374),
+  186: (185.9549861, 0.0),
+  187: (186.9557531, 0.626),
+  188: (187.9581144, 0.0),
+  189: (188.959229, 0.0),
+  190: (189.96182, 0.0),
+  191: (190.963125, 0.0),
+  192: (191.96596, 0.0),
+  193: (192.96747, 0.0),
+  194: (193.97042, 0.0)},
+ 'Rf': {0: (265, 1.0),
+  253: (253.10069, 0.0),
+  254: (254.10018, 0.0),
+  255: (255.10134, 0.0),
+  256: (256.101166, 0.0),
+  257: (257.10299, 0.0),
+  258: (258.10349, 0.0),
+  259: (259.10564, 0.0),
+  260: (260.10644, 0.0),
+  261: (261.10877, 0.0),
+  262: (262.10993, 0.0),
+  263: (263.11255, 0.0),
+  264: (264.11399, 0.0),
+  265: (265.1167, 0.0),
+  266: (266.11796, 0.0),
+  267: (267.12153, 0.0),
+  268: (268.12364, 0.0)},
+ 'Rg': {0: (280, 1.0),
+  272: (272.15362, 0.0),
+  273: (273.15368, 0.0),
+  274: (274.15571, 0.0),
+  275: (275.15614, 0.0),
+  276: (276.15849, 0.0),
+  277: (277.15952, 0.0),
+  278: (278.1616, 0.0),
+  279: (279.16247, 0.0),
+  280: (280.16447, 0.0),
+  281: (281.16537, 0.0),
+  282: (282.16749, 0.0),
+  283: (283.16842, 0.0)},
+ 'Rh': {0: (102.905504, 1.0),
+  89: (88.94884, 0.0),
+  90: (89.94287, 0.0),
+  91: (90.93655, 0.0),
+  92: (91.93198, 0.0),
+  93: (92.92574, 0.0),
+  94: (93.9217, 0.0),
+  95: (94.9159, 0.0),
+  96: (95.914461, 0.0),
+  97: (96.91134, 0.0),
+  98: (97.910708, 0.0),
+  99: (98.908132, 0.0),
+  100: (99.908122, 0.0),
+  101: (100.906164, 0.0),
+  102: (101.906843, 0.0),
+  103: (102.905504, 1.0),
+  104: (103.906656, 0.0),
+  105: (104.905694, 0.0),
+  106: (105.907287, 0.0),
+  107: (106.906748, 0.0),
+  108: (107.90873, 0.0),
+  109: (108.908737, 0.0),
+  110: (109.91114, 0.0),
+  111: (110.91159, 0.0),
+  112: (111.91439, 0.0),
+  113: (112.91553, 0.0),
+  114: (113.91881, 0.0),
+  115: (114.92033, 0.0),
+  116: (115.92406, 0.0),
+  117: (116.92598, 0.0),
+  118: (117.93007, 0.0),
+  119: (118.93211, 0.0),
+  120: (119.93641, 0.0),
+  121: (120.93872, 0.0),
+  122: (121.94321, 0.0)},
+ 'Rn': {0: (222, 1.0),
+  195: (195.00544, 0.0),
+  196: (196.002115, 0.0),
+  197: (197.00158, 0.0),
+  198: (197.998679, 0.0),
+  199: (198.99837, 0.0),
+  200: (199.995699, 0.0),
+  201: (200.99563, 0.0),
+  202: (201.993263, 0.0),
+  203: (202.993387, 0.0),
+  204: (203.991429, 0.0),
+  205: (204.99172, 0.0),
+  206: (205.990214, 0.0),
+  207: (206.990734, 0.0),
+  208: (207.989642, 0.0),
+  209: (208.990415, 0.0),
+  210: (209.989696, 0.0),
+  211: (210.990601, 0.0),
+  212: (211.990704, 0.0),
+  213: (212.993883, 0.0),
+  214: (213.995363, 0.0),
+  215: (214.998745, 0.0),
+  216: (216.000274, 0.0),
+  217: (217.003928, 0.0),
+  218: (218.0056013, 0.0),
+  219: (219.0094802, 0.0),
+  220: (220.011394, 0.0),
+  221: (221.015537, 0.0),
+  222: (222.0175777, 0.0),
+  223: (223.02179, 0.0),
+  224: (224.02409, 0.0),
+  225: (225.02844, 0.0),
+  226: (226.03089, 0.0),
+  227: (227.03541, 0.0),
+  228: (228.03799, 0.0)},
+ 'Ru': {0: (101.9043493, 1.0),
+  87: (86.94918, 0.0),
+  88: (87.94026, 0.0),
+  89: (88.93611, 0.0),
+  90: (89.92989, 0.0),
+  91: (90.92629, 0.0),
+  92: (91.92012, 0.0),
+  93: (92.91705, 0.0),
+  94: (93.91136, 0.0),
+  95: (94.910413, 0.0),
+  96: (95.907598, 0.0554),
+  97: (96.907555, 0.0),
+  98: (97.905287, 0.0187),
+  99: (98.9059393, 0.1276),
+  100: (99.9042195, 0.126),
+  101: (100.9055821, 0.1706),
+  102: (101.9043493, 0.3155),
+  103: (102.9063238, 0.0),
+  104: (103.905433, 0.1862),
+  105: (104.907753, 0.0),
+  106: (105.907329, 0.0),
+  107: (106.90991, 0.0),
+  108: (107.91017, 0.0),
+  109: (108.9132, 0.0),
+  110: (109.91414, 0.0),
+  111: (110.9177, 0.0),
+  112: (111.91897, 0.0),
+  113: (112.92249, 0.0),
+  114: (113.92428, 0.0),
+  115: (114.92869, 0.0),
+  116: (115.93081, 0.0),
+  117: (116.93558, 0.0),
+  118: (117.93782, 0.0),
+  119: (118.94284, 0.0),
+  120: (119.94531, 0.0)},
+ 'S': {0: (31.972071, 1.0),
+  26: (26.02788, 0.0),
+  27: (27.01883, 0.0),
+  28: (28.00437, 0.0),
+  29: (28.99661, 0.0),
+  30: (29.984903, 0.0),
+  31: (30.9795547, 0.0),
+  32: (31.972071, 0.9499),
+  33: (32.97145876, 0.0075),
+  34: (33.9678669, 0.0425),
+  35: (34.96903216, 0.0),
+  36: (35.96708076, 0.0001),
+  37: (36.97112557, 0.0),
+  38: (37.971163, 0.0),
+  39: (38.97513, 0.0),
+  40: (39.97545, 0.0),
+  41: (40.97958, 0.0),
+  42: (41.98102, 0.0),
+  43: (42.98715, 0.0),
+  44: (43.99021, 0.0),
+  45: (44.99651, 0.0),
+  46: (46.00075, 0.0),
+  47: (47.00859, 0.0),
+  48: (48.01417, 0.0),
+  49: (49.02362, 0.0)},
+ 'Sb': {0: (120.9038157, 1.0),
+  103: (102.93969, 0.0),
+  104: (103.93647, 0.0),
+  105: (104.93149, 0.0),
+  106: (105.92879, 0.0),
+  107: (106.92415, 0.0),
+  108: (107.92216, 0.0),
+  109: (108.918132, 0.0),
+  110: (109.91675, 0.0),
+  111: (110.91316, 0.0),
+  112: (111.912398, 0.0),
+  113: (112.909372, 0.0),
+  114: (113.90927, 0.0),
+  115: (114.906598, 0.0),
+  116: (115.906794, 0.0),
+  117: (116.904836, 0.0),
+  118: (117.905529, 0.0),
+  119: (118.903942, 0.0),
+  120: (119.905072, 0.0),
+  121: (120.9038157, 0.5721),
+  122: (121.9051737, 0.0),
+  123: (122.904214, 0.4279),
+  124: (123.9059357, 0.0),
+  125: (124.9052538, 0.0),
+  126: (125.90725, 0.0),
+  127: (126.906924, 0.0),
+  128: (127.909169, 0.0),
+  129: (128.909148, 0.0),
+  130: (129.911656, 0.0),
+  131: (130.911982, 0.0),
+  132: (131.914467, 0.0),
+  133: (132.915252, 0.0),
+  134: (133.92038, 0.0),
+  135: (134.92517, 0.0),
+  136: (135.93035, 0.0),
+  137: (136.93531, 0.0),
+  138: (137.94079, 0.0),
+  139: (138.94598, 0.0)},
+ 'Sc': {0: (44.9559119, 1.0),
+  36: (36.01492, 0.0),
+  37: (37.00305, 0.0),
+  38: (37.9947, 0.0),
+  39: (38.98479, 0.0),
+  40: (39.977967, 0.0),
+  41: (40.96925113, 0.0),
+  42: (41.96551643, 0.0),
+  43: (42.9611507, 0.0),
+  44: (43.9594028, 0.0),
+  45: (44.9559119, 1.0),
+  46: (45.9551719, 0.0),
+  47: (46.9524075, 0.0),
+  48: (47.952231, 0.0),
+  49: (48.950024, 0.0),
+  50: (49.952188, 0.0),
+  51: (50.953603, 0.0),
+  52: (51.95668, 0.0),
+  53: (52.95961, 0.0),
+  54: (53.96326, 0.0),
+  55: (54.96824, 0.0),
+  56: (55.97287, 0.0),
+  57: (56.97779, 0.0),
+  58: (57.98371, 0.0),
+  59: (58.98922, 0.0),
+  60: (59.99571, 0.0)},
+ 'Se': {0: (79.9165213, 1.0),
+  65: (64.96466, 0.0),
+  66: (65.95521, 0.0),
+  67: (66.95009, 0.0),
+  68: (67.9418, 0.0),
+  69: (68.93956, 0.0),
+  70: (69.93339, 0.0),
+  71: (70.93224, 0.0),
+  72: (71.927112, 0.0),
+  73: (72.926765, 0.0),
+  74: (73.9224764, 0.0089),
+  75: (74.9225234, 0.0),
+  76: (75.9192136, 0.0937),
+  77: (76.919914, 0.0763),
+  78: (77.9173091, 0.2377),
+  79: (78.9184991, 0.0),
+  80: (79.9165213, 0.4961),
+  81: (80.9179925, 0.0),
+  82: (81.9166994, 0.0873),
+  83: (82.919118, 0.0),
+  84: (83.918462, 0.0),
+  85: (84.92225, 0.0),
+  86: (85.924272, 0.0),
+  87: (86.92852, 0.0),
+  88: (87.93142, 0.0),
+  89: (88.93645, 0.0),
+  90: (89.93996, 0.0),
+  91: (90.94596, 0.0),
+  92: (91.94992, 0.0),
+  93: (92.95629, 0.0),
+  94: (93.96049, 0.0)},
+ 'Sg': {0: (271, 1.0),
+  258: (258.11317, 0.0),
+  259: (259.1145, 0.0),
+  260: (260.11442, 0.0),
+  261: (261.11612, 0.0),
+  262: (262.1164, 0.0),
+  263: (263.11832, 0.0),
+  264: (264.11893, 0.0),
+  265: (265.12111, 0.0),
+  266: (266.12207, 0.0),
+  267: (267.12443, 0.0),
+  268: (268.12561, 0.0),
+  269: (269.12876, 0.0),
+  270: (270.13033, 0.0),
+  271: (271.13347, 0.0),
+  272: (272.13516, 0.0),
+  273: (273.13822, 0.0)},
+ 'Si': {0: (27.9769265325, 1.0),
+  22: (22.03453, 0.0),
+  23: (23.02552, 0.0),
+  24: (24.011546, 0.0),
+  25: (25.004106, 0.0),
+  26: (25.99233, 0.0),
+  27: (26.98670491, 0.0),
+  28: (27.9769265325, 0.92223),
+  29: (28.9764947, 0.04685),
+  30: (29.97377017, 0.03092),
+  31: (30.97536323, 0.0),
+  32: (31.97414808, 0.0),
+  33: (32.978, 0.0),
+  34: (33.978576, 0.0),
+  35: (34.98458, 0.0),
+  36: (35.9866, 0.0),
+  37: (36.99294, 0.0),
+  38: (37.99563, 0.0),
+  39: (39.00207, 0.0),
+  40: (40.00587, 0.0),
+  41: (41.01456, 0.0),
+  42: (42.01979, 0.0),
+  43: (43.02866, 0.0),
+  44: (44.03526, 0.0)},
+ 'Sm': {0: (151.9197324, 1.0),
+  128: (127.95808, 0.0),
+  129: (128.95464, 0.0),
+  130: (129.94892, 0.0),
+  131: (130.94611, 0.0),
+  132: (131.94069, 0.0),
+  133: (132.93867, 0.0),
+  134: (133.93397, 0.0),
+  135: (134.93252, 0.0),
+  136: (135.928276, 0.0),
+  137: (136.92697, 0.0),
+  138: (137.923244, 0.0),
+  139: (138.922297, 0.0),
+  140: (139.918995, 0.0),
+  141: (140.918476, 0.0),
+  142: (141.915198, 0.0),
+  143: (142.914628, 0.0),
+  144: (143.911999, 0.0307),
+  145: (144.91341, 0.0),
+  146: (145.913041, 0.0),
+  147: (146.9148979, 0.1499),
+  148: (147.9148227, 0.1124),
+  149: (148.9171847, 0.1382),
+  150: (149.9172755, 0.0738),
+  151: (150.9199324, 0.0),
+  152: (151.9197324, 0.2675),
+  153: (152.9220974, 0.0),
+  154: (153.9222093, 0.2275),
+  155: (154.9246402, 0.0),
+  156: (155.925528, 0.0),
+  157: (156.92836, 0.0),
+  158: (157.92999, 0.0),
+  159: (158.93321, 0.0),
+  160: (159.93514, 0.0),
+  161: (160.93883, 0.0),
+  162: (161.94122, 0.0),
+  163: (162.94536, 0.0),
+  164: (163.94828, 0.0),
+  165: (164.95298, 0.0)},
+ 'Sn': {0: (119.9021947, 1.0),
+  99: (98.94933, 0.0),
+  100: (99.93904, 0.0),
+  101: (100.93606, 0.0),
+  102: (101.9303, 0.0),
+  103: (102.9281, 0.0),
+  104: (103.92314, 0.0),
+  105: (104.92135, 0.0),
+  106: (105.91688, 0.0),
+  107: (106.91564, 0.0),
+  108: (107.911925, 0.0),
+  109: (108.911283, 0.0),
+  110: (109.907843, 0.0),
+  111: (110.907734, 0.0),
+  112: (111.904818, 0.0097),
+  113: (112.905171, 0.0),
+  114: (113.902779, 0.0066),
+  115: (114.903342, 0.0034),
+  116: (115.901741, 0.1454),
+  117: (116.902952, 0.0768),
+  118: (117.901603, 0.2422),
+  119: (118.903308, 0.0859),
+  120: (119.9021947, 0.3258),
+  121: (120.9042355, 0.0),
+  122: (121.903439, 0.0463),
+  123: (122.9057208, 0.0),
+  124: (123.9052739, 0.0579),
+  125: (124.9077841, 0.0),
+  126: (125.907653, 0.0),
+  127: (126.91036, 0.0),
+  128: (127.910537, 0.0),
+  129: (128.91348, 0.0),
+  130: (129.913967, 0.0),
+  131: (130.917, 0.0),
+  132: (131.917816, 0.0),
+  133: (132.92383, 0.0),
+  134: (133.92829, 0.0),
+  135: (134.93473, 0.0),
+  136: (135.93934, 0.0),
+  137: (136.94599, 0.0)},
+ 'Sr': {0: (87.9056121, 1.0),
+  73: (72.96597, 0.0),
+  74: (73.95631, 0.0),
+  75: (74.94995, 0.0),
+  76: (75.94177, 0.0),
+  77: (76.937945, 0.0),
+  78: (77.93218, 0.0),
+  79: (78.929708, 0.0),
+  80: (79.924521, 0.0),
+  81: (80.923212, 0.0),
+  82: (81.918402, 0.0),
+  83: (82.917557, 0.0),
+  84: (83.913425, 0.0056),
+  85: (84.912933, 0.0),
+  86: (85.9092602, 0.0986),
+  87: (86.9088771, 0.07),
+  88: (87.9056121, 0.8258),
+  89: (88.9074507, 0.0),
+  90: (89.907738, 0.0),
+  91: (90.910203, 0.0),
+  92: (91.911038, 0.0),
+  93: (92.914026, 0.0),
+  94: (93.915361, 0.0),
+  95: (94.919359, 0.0),
+  96: (95.921697, 0.0),
+  97: (96.926153, 0.0),
+  98: (97.928453, 0.0),
+  99: (98.93324, 0.0),
+  100: (99.93535, 0.0),
+  101: (100.94052, 0.0),
+  102: (101.94302, 0.0),
+  103: (102.94895, 0.0),
+  104: (103.95233, 0.0),
+  105: (104.95858, 0.0)},
+ 'Ta': {0: (180.9479958, 1.0),
+  155: (154.97459, 0.0),
+  156: (155.9723, 0.0),
+  157: (156.96819, 0.0),
+  158: (157.9667, 0.0),
+  159: (158.963018, 0.0),
+  160: (159.96149, 0.0),
+  161: (160.95842, 0.0),
+  162: (161.95729, 0.0),
+  163: (162.95433, 0.0),
+  164: (163.95353, 0.0),
+  165: (164.950773, 0.0),
+  166: (165.95051, 0.0),
+  167: (166.94809, 0.0),
+  168: (167.94805, 0.0),
+  169: (168.94601, 0.0),
+  170: (169.94618, 0.0),
+  171: (170.94448, 0.0),
+  172: (171.9449, 0.0),
+  173: (172.94375, 0.0),
+  174: (173.94445, 0.0),
+  175: (174.94374, 0.0),
+  176: (175.94486, 0.0),
+  177: (176.944472, 0.0),
+  178: (177.945778, 0.0),
+  179: (178.9459295, 0.0),
+  180: (179.9474648, 0.00012),
+  181: (180.9479958, 0.99988),
+  182: (181.9501518, 0.0),
+  183: (182.9513726, 0.0),
+  184: (183.954008, 0.0),
+  185: (184.955559, 0.0),
+  186: (185.95855, 0.0),
+  187: (186.96053, 0.0),
+  188: (187.9637, 0.0),
+  189: (188.96583, 0.0),
+  190: (189.96923, 0.0)},
+ 'Tb': {0: (158.9253468, 1.0),
+  136: (135.96138, 0.0),
+  137: (136.95598, 0.0),
+  138: (137.95316, 0.0),
+  139: (138.94829, 0.0),
+  140: (139.94581, 0.0),
+  141: (140.94145, 0.0),
+  142: (141.93874, 0.0),
+  143: (142.93512, 0.0),
+  144: (143.93305, 0.0),
+  145: (144.92927, 0.0),
+  146: (145.92725, 0.0),
+  147: (146.924045, 0.0),
+  148: (147.924272, 0.0),
+  149: (148.923246, 0.0),
+  150: (149.92366, 0.0),
+  151: (150.923103, 0.0),
+  152: (151.92407, 0.0),
+  153: (152.923435, 0.0),
+  154: (153.92468, 0.0),
+  155: (154.923505, 0.0),
+  156: (155.924747, 0.0),
+  157: (156.9240246, 0.0),
+  158: (157.9254131, 0.0),
+  159: (158.9253468, 1.0),
+  160: (159.9271676, 0.0),
+  161: (160.9275699, 0.0),
+  162: (161.92949, 0.0),
+  163: (162.930648, 0.0),
+  164: (163.93335, 0.0),
+  165: (164.93488, 0.0),
+  166: (165.93799, 0.0),
+  167: (166.94005, 0.0),
+  168: (167.94364, 0.0),
+  169: (168.94622, 0.0),
+  170: (169.95025, 0.0),
+  171: (170.9533, 0.0)},
+ 'Tc': {0: (98, 1.0),
+  85: (84.94883, 0.0),
+  86: (85.94288, 0.0),
+  87: (86.93653, 0.0),
+  88: (87.93268, 0.0),
+  89: (88.92717, 0.0),
+  90: (89.92356, 0.0),
+  91: (90.91843, 0.0),
+  92: (91.91526, 0.0),
+  93: (92.910249, 0.0),
+  94: (93.909657, 0.0),
+  95: (94.907657, 0.0),
+  96: (95.907871, 0.0),
+  97: (96.906365, 0.0),
+  98: (97.907216, 0.0),
+  99: (98.9062547, 0.0),
+  100: (99.9076578, 0.0),
+  101: (100.907315, 0.0),
+  102: (101.909215, 0.0),
+  103: (102.909181, 0.0),
+  104: (103.91145, 0.0),
+  105: (104.91166, 0.0),
+  106: (105.914358, 0.0),
+  107: (106.91508, 0.0),
+  108: (107.91846, 0.0),
+  109: (108.91998, 0.0),
+  110: (109.92382, 0.0),
+  111: (110.92569, 0.0),
+  112: (111.92915, 0.0),
+  113: (112.93159, 0.0),
+  114: (113.93588, 0.0),
+  115: (114.93869, 0.0),
+  116: (115.94337, 0.0),
+  117: (116.94648, 0.0),
+  118: (117.95148, 0.0)},
+ 'Te': {0: (129.9062244, 1.0),
+  105: (104.94364, 0.0),
+  106: (105.9375, 0.0),
+  107: (106.93501, 0.0),
+  108: (107.92944, 0.0),
+  109: (108.92742, 0.0),
+  110: (109.92241, 0.0),
+  111: (110.92111, 0.0),
+  112: (111.91701, 0.0),
+  113: (112.91589, 0.0),
+  114: (113.91209, 0.0),
+  115: (114.9119, 0.0),
+  116: (115.90846, 0.0),
+  117: (116.908645, 0.0),
+  118: (117.905828, 0.0),
+  119: (118.906404, 0.0),
+  120: (119.90402, 0.0009),
+  121: (120.904936, 0.0),
+  122: (121.9030439, 0.0255),
+  123: (122.90427, 0.0089),
+  124: (123.9028179, 0.0474),
+  125: (124.9044307, 0.0707),
+  126: (125.9033117, 0.1884),
+  127: (126.9052263, 0.0),
+  128: (127.9044631, 0.3174),
+  129: (128.9065982, 0.0),
+  130: (129.9062244, 0.3408),
+  131: (130.9085239, 0.0),
+  132: (131.908553, 0.0),
+  133: (132.910955, 0.0),
+  134: (133.911369, 0.0),
+  135: (134.91645, 0.0),
+  136: (135.9201, 0.0),
+  137: (136.92532, 0.0),
+  138: (137.92922, 0.0),
+  139: (138.93473, 0.0),
+  140: (139.93885, 0.0),
+  141: (140.94465, 0.0),
+  142: (141.94908, 0.0)},
+ 'Th': {0: (232.0380553, 1.0),
+  209: (209.01772, 0.0),
+  210: (210.015075, 0.0),
+  211: (211.01493, 0.0),
+  212: (212.01298, 0.0),
+  213: (213.01301, 0.0),
+  214: (214.0115, 0.0),
+  215: (215.01173, 0.0),
+  216: (216.011062, 0.0),
+  217: (217.013114, 0.0),
+  218: (218.013284, 0.0),
+  219: (219.01554, 0.0),
+  220: (220.015748, 0.0),
+  221: (221.018184, 0.0),
+  222: (222.018468, 0.0),
+  223: (223.020811, 0.0),
+  224: (224.021467, 0.0),
+  225: (225.023951, 0.0),
+  226: (226.024903, 0.0),
+  227: (227.0277041, 0.0),
+  228: (228.0287411, 0.0),
+  229: (229.031762, 0.0),
+  230: (230.0331338, 0.0),
+  231: (231.0363043, 0.0),
+  232: (232.0380553, 1.0),
+  233: (233.0415818, 0.0),
+  234: (234.043601, 0.0),
+  235: (235.04751, 0.0),
+  236: (236.04987, 0.0),
+  237: (237.05389, 0.0),
+  238: (238.0565, 0.0)},
+ 'Ti': {0: (47.9479463, 1.0),
+  38: (38.00977, 0.0),
+  39: (39.00161, 0.0),
+  40: (39.9905, 0.0),
+  41: (40.98315, 0.0),
+  42: (41.973031, 0.0),
+  43: (42.968522, 0.0),
+  44: (43.9596901, 0.0),
+  45: (44.9581256, 0.0),
+  46: (45.9526316, 0.0825),
+  47: (46.9517631, 0.0744),
+  48: (47.9479463, 0.7372),
+  49: (48.94787, 0.0541),
+  50: (49.9447912, 0.0518),
+  51: (50.946615, 0.0),
+  52: (51.946897, 0.0),
+  53: (52.94973, 0.0),
+  54: (53.95105, 0.0),
+  55: (54.95527, 0.0),
+  56: (55.9582, 0.0),
+  57: (56.96399, 0.0),
+  58: (57.96697, 0.0),
+  59: (58.97293, 0.0),
+  60: (59.97676, 0.0),
+  61: (60.9832, 0.0),
+  62: (61.98749, 0.0),
+  63: (62.99442, 0.0)},
+ 'Tl': {0: (204.9744275, 1.0),
+  176: (176.00059, 0.0),
+  177: (176.996427, 0.0),
+  178: (177.9949, 0.0),
+  179: (178.99109, 0.0),
+  180: (179.98991, 0.0),
+  181: (180.986257, 0.0),
+  182: (181.98567, 0.0),
+  183: (182.982193, 0.0),
+  184: (183.98187, 0.0),
+  185: (184.97879, 0.0),
+  186: (185.97833, 0.0),
+  187: (186.975906, 0.0),
+  188: (187.97601, 0.0),
+  189: (188.973588, 0.0),
+  190: (189.97388, 0.0),
+  191: (190.971786, 0.0),
+  192: (191.97223, 0.0),
+  193: (192.97067, 0.0),
+  194: (193.9712, 0.0),
+  195: (194.969774, 0.0),
+  196: (195.970481, 0.0),
+  197: (196.969575, 0.0),
+  198: (197.97048, 0.0),
+  199: (198.96988, 0.0),
+  200: (199.970963, 0.0),
+  201: (200.970819, 0.0),
+  202: (201.972106, 0.0),
+  203: (202.9723442, 0.2952),
+  204: (203.9738635, 0.0),
+  205: (204.9744275, 0.7048),
+  206: (205.9761103, 0.0),
+  207: (206.977419, 0.0),
+  208: (207.9820187, 0.0),
+  209: (208.985359, 0.0),
+  210: (209.990074, 0.0),
+  211: (210.99348, 0.0),
+  212: (211.99823, 0.0)},
+ 'Tm': {0: (168.9342133, 1.0),
+  145: (144.97007, 0.0),
+  146: (145.96643, 0.0),
+  147: (146.96096, 0.0),
+  148: (147.95784, 0.0),
+  149: (148.95272, 0.0),
+  150: (149.94996, 0.0),
+  151: (150.945483, 0.0),
+  152: (151.94442, 0.0),
+  153: (152.942012, 0.0),
+  154: (153.941568, 0.0),
+  155: (154.939199, 0.0),
+  156: (155.93898, 0.0),
+  157: (156.93697, 0.0),
+  158: (157.93698, 0.0),
+  159: (158.93498, 0.0),
+  160: (159.93526, 0.0),
+  161: (160.93355, 0.0),
+  162: (161.933995, 0.0),
+  163: (162.932651, 0.0),
+  164: (163.93356, 0.0),
+  165: (164.932435, 0.0),
+  166: (165.933554, 0.0),
+  167: (166.9328516, 0.0),
+  168: (167.934173, 0.0),
+  169: (168.9342133, 1.0),
+  170: (169.9358014, 0.0),
+  171: (170.9364294, 0.0),
+  172: (171.9384, 0.0),
+  173: (172.939604, 0.0),
+  174: (173.94217, 0.0),
+  175: (174.94384, 0.0),
+  176: (175.94699, 0.0),
+  177: (176.94904, 0.0),
+  178: (177.95264, 0.0),
+  179: (178.95534, 0.0)},
+ 'U': {0: (238.0507882, 1.0),
+  217: (217.02437, 0.0),
+  218: (218.02354, 0.0),
+  219: (219.02492, 0.0),
+  220: (220.02472, 0.0),
+  221: (221.0264, 0.0),
+  222: (222.02609, 0.0),
+  223: (223.02774, 0.0),
+  224: (224.027605, 0.0),
+  225: (225.029391, 0.0),
+  226: (226.029339, 0.0),
+  227: (227.031156, 0.0),
+  228: (228.031374, 0.0),
+  229: (229.033506, 0.0),
+  230: (230.03394, 0.0),
+  231: (231.036294, 0.0),
+  232: (232.0371562, 0.0),
+  233: (233.0396352, 0.0),
+  234: (234.0409521, 5.4e-05),
+  235: (235.0439299, 0.007204),
+  236: (236.045568, 0.0),
+  237: (237.0487302, 0.0),
+  238: (238.0507882, 0.992742),
+  239: (239.0542933, 0.0),
+  240: (240.056592, 0.0),
+  241: (241.06033, 0.0),
+  242: (242.06293, 0.0)},
+ 'Uuh': {0: (293, 1.0),
+  289: (289.19886, 0.0),
+  290: (290.19859, 0.0),
+  291: (291.20001, 0.0),
+  292: (292.19979, 0.0)},
+ 'Uuo': {0: (294, 1.0), 293: (293.21467, 0.0)},
+ 'Uup': {0: (288, 1.0),
+  287: (287.19119, 0.0),
+  288: (288.19249, 0.0),
+  289: (289.19272, 0.0),
+  290: (290.19414, 0.0),
+  291: (291.19438, 0.0)},
+ 'Uuq': {0: (289, 1.0),
+  285: (285.1837, 0.0),
+  286: (286.18386, 0.0),
+  287: (287.1856, 0.0),
+  288: (288.18569, 0.0),
+  289: (289.18728, 0.0)},
+ 'Uus': {0: (292, 1.0), 291: (291.20656, 0.0), 292: (292.20755, 0.0)},
+ 'Uut': {0: (284, 1.0),
+  283: (283.17645, 0.0),
+  284: (284.17808, 0.0),
+  285: (285.17873, 0.0),
+  286: (286.18048, 0.0),
+  287: (287.18105, 0.0)},
+ 'V': {0: (50.9439595, 1.0),
+  40: (40.01109, 0.0),
+  41: (40.99978, 0.0),
+  42: (41.99123, 0.0),
+  43: (42.98065, 0.0),
+  44: (43.97411, 0.0),
+  45: (44.965776, 0.0),
+  46: (45.9602005, 0.0),
+  47: (46.9549089, 0.0),
+  48: (47.9522537, 0.0),
+  49: (48.9485161, 0.0),
+  50: (49.9471585, 0.0025),
+  51: (50.9439595, 0.9975),
+  52: (51.9447755, 0.0),
+  53: (52.944338, 0.0),
+  54: (53.94644, 0.0),
+  55: (54.94723, 0.0),
+  56: (55.95053, 0.0),
+  57: (56.95256, 0.0),
+  58: (57.95683, 0.0),
+  59: (58.96021, 0.0),
+  60: (59.96503, 0.0),
+  61: (60.96848, 0.0),
+  62: (61.97378, 0.0),
+  63: (62.97755, 0.0),
+  64: (63.98347, 0.0),
+  65: (64.98792, 0.0)},
+ 'W': {0: (183.9509312, 1.0),
+  158: (157.97456, 0.0),
+  159: (158.97292, 0.0),
+  160: (159.96848, 0.0),
+  161: (160.96736, 0.0),
+  162: (161.963497, 0.0),
+  163: (162.96252, 0.0),
+  164: (163.958954, 0.0),
+  165: (164.95828, 0.0),
+  166: (165.955027, 0.0),
+  167: (166.954816, 0.0),
+  168: (167.951808, 0.0),
+  169: (168.951779, 0.0),
+  170: (169.949228, 0.0),
+  171: (170.94945, 0.0),
+  172: (171.94729, 0.0),
+  173: (172.94769, 0.0),
+  174: (173.94608, 0.0),
+  175: (174.94672, 0.0),
+  176: (175.94563, 0.0),
+  177: (176.94664, 0.0),
+  178: (177.945876, 0.0),
+  179: (178.94707, 0.0),
+  180: (179.946704, 0.0012),
+  181: (180.948197, 0.0),
+  182: (181.9482042, 0.265),
+  183: (182.950223, 0.1431),
+  184: (183.9509312, 0.3064),
+  185: (184.9534193, 0.0),
+  186: (185.9543641, 0.2843),
+  187: (186.9571605, 0.0),
+  188: (187.958489, 0.0),
+  189: (188.96191, 0.0),
+  190: (189.96318, 0.0),
+  191: (190.9666, 0.0),
+  192: (191.96817, 0.0)},
+ 'Xe': {0: (131.9041535, 1.0),
+  110: (109.94428, 0.0),
+  111: (110.9416, 0.0),
+  112: (111.93562, 0.0),
+  113: (112.93334, 0.0),
+  114: (113.92798, 0.0),
+  115: (114.926294, 0.0),
+  116: (115.921581, 0.0),
+  117: (116.920359, 0.0),
+  118: (117.916179, 0.0),
+  119: (118.915411, 0.0),
+  120: (119.911784, 0.0),
+  121: (120.911462, 0.0),
+  122: (121.908368, 0.0),
+  123: (122.908482, 0.0),
+  124: (123.905893, 0.000952),
+  125: (124.9063955, 0.0),
+  126: (125.904274, 0.00089),
+  127: (126.905184, 0.0),
+  128: (127.9035313, 0.019102),
+  129: (128.9047794, 0.264006),
+  130: (129.903508, 0.04071),
+  131: (130.9050824, 0.212324),
+  132: (131.9041535, 0.269086),
+  133: (132.9059107, 0.0),
+  134: (133.9053945, 0.104357),
+  135: (134.907227, 0.0),
+  136: (135.907219, 0.088573),
+  137: (136.911562, 0.0),
+  138: (137.91395, 0.0),
+  139: (138.918793, 0.0),
+  140: (139.92164, 0.0),
+  141: (140.92665, 0.0),
+  142: (141.92971, 0.0),
+  143: (142.93511, 0.0),
+  144: (143.93851, 0.0),
+  145: (144.94407, 0.0),
+  146: (145.94775, 0.0),
+  147: (146.95356, 0.0)},
+ 'Y': {0: (88.9058483, 1.0),
+  76: (75.95845, 0.0),
+  77: (76.94965, 0.0),
+  78: (77.94361, 0.0),
+  79: (78.93735, 0.0),
+  80: (79.93428, 0.0),
+  81: (80.92913, 0.0),
+  82: (81.92679, 0.0),
+  83: (82.92235, 0.0),
+  84: (83.92039, 0.0),
+  85: (84.916433, 0.0),
+  86: (85.914886, 0.0),
+  87: (86.9108757, 0.0),
+  88: (87.9095011, 0.0),
+  89: (88.9058483, 1.0),
+  90: (89.9071519, 0.0),
+  91: (90.907305, 0.0),
+  92: (91.908949, 0.0),
+  93: (92.909583, 0.0),
+  94: (93.911595, 0.0),
+  95: (94.912821, 0.0),
+  96: (95.915891, 0.0),
+  97: (96.918134, 0.0),
+  98: (97.922203, 0.0),
+  99: (98.924636, 0.0),
+  100: (99.92776, 0.0),
+  101: (100.93031, 0.0),
+  102: (101.93356, 0.0),
+  103: (102.93673, 0.0),
+  104: (103.94105, 0.0),
+  105: (104.94487, 0.0),
+  106: (105.94979, 0.0),
+  107: (106.95414, 0.0),
+  108: (107.95948, 0.0)},
+ 'Yb': {0: (173.9388621, 1.0),
+  148: (147.96742, 0.0),
+  149: (148.96404, 0.0),
+  150: (149.95842, 0.0),
+  151: (150.9554, 0.0),
+  152: (151.95029, 0.0),
+  153: (152.94948, 0.0),
+  154: (153.946394, 0.0),
+  155: (154.945782, 0.0),
+  156: (155.942818, 0.0),
+  157: (156.942628, 0.0),
+  158: (157.939866, 0.0),
+  159: (158.94005, 0.0),
+  160: (159.937552, 0.0),
+  161: (160.937902, 0.0),
+  162: (161.935768, 0.0),
+  163: (162.936334, 0.0),
+  164: (163.934489, 0.0),
+  165: (164.93528, 0.0),
+  166: (165.933882, 0.0),
+  167: (166.93495, 0.0),
+  168: (167.933897, 0.0013),
+  169: (168.93519, 0.0),
+  170: (169.9347618, 0.0304),
+  171: (170.9363258, 0.1428),
+  172: (171.9363815, 0.2183),
+  173: (172.9382108, 0.1613),
+  174: (173.9388621, 0.3183),
+  175: (174.9412765, 0.0),
+  176: (175.9425717, 0.1276),
+  177: (176.9452608, 0.0),
+  178: (177.946647, 0.0),
+  179: (178.95017, 0.0),
+  180: (179.95233, 0.0),
+  181: (180.95615, 0.0)},
+ 'Zn': {0: (63.9291422, 1.0),
+  54: (53.99295, 0.0),
+  55: (54.98398, 0.0),
+  56: (55.97238, 0.0),
+  57: (56.96479, 0.0),
+  58: (57.95459, 0.0),
+  59: (58.94926, 0.0),
+  60: (59.941827, 0.0),
+  61: (60.939511, 0.0),
+  62: (61.93433, 0.0),
+  63: (62.9332116, 0.0),
+  64: (63.9291422, 0.48268),
+  65: (64.929241, 0.0),
+  66: (65.9260334, 0.27975),
+  67: (66.9271273, 0.04102),
+  68: (67.9248442, 0.19024),
+  69: (68.9265503, 0.0),
+  70: (69.9253193, 0.00631),
+  71: (70.927722, 0.0),
+  72: (71.926858, 0.0),
+  73: (72.92978, 0.0),
+  74: (73.92946, 0.0),
+  75: (74.93294, 0.0),
+  76: (75.93329, 0.0),
+  77: (76.93696, 0.0),
+  78: (77.93844, 0.0),
+  79: (78.94265, 0.0),
+  80: (79.94434, 0.0),
+  81: (80.95048, 0.0),
+  82: (81.95442, 0.0),
+  83: (82.96103, 0.0)},
+ 'Zr': {0: (89.9047044, 1.0),
+  78: (77.95523, 0.0),
+  79: (78.94916, 0.0),
+  80: (79.9404, 0.0),
+  81: (80.93721, 0.0),
+  82: (81.93109, 0.0),
+  83: (82.92865, 0.0),
+  84: (83.92325, 0.0),
+  85: (84.92147, 0.0),
+  86: (85.91647, 0.0),
+  87: (86.914816, 0.0),
+  88: (87.910227, 0.0),
+  89: (88.90889, 0.0),
+  90: (89.9047044, 0.5145),
+  91: (90.9056458, 0.1122),
+  92: (91.9050408, 0.1715),
+  93: (92.906476, 0.0),
+  94: (93.9063152, 0.1738),
+  95: (94.9080426, 0.0),
+  96: (95.9082734, 0.028),
+  97: (96.9109531, 0.0),
+  98: (97.912735, 0.0),
+  99: (98.916512, 0.0),
+  100: (99.91776, 0.0),
+  101: (100.92114, 0.0),
+  102: (101.92298, 0.0),
+  103: (102.9266, 0.0),
+  104: (103.92878, 0.0),
+  105: (104.93305, 0.0),
+  106: (105.93591, 0.0),
+  107: (106.94075, 0.0),
+  108: (107.94396, 0.0),
+  109: (108.94924, 0.0),
+  110: (109.95287, 0.0)},
+ 'e*': {0: (0.00054857990943, 1.0)}}
diff --git a/pyteomics/auxiliary/file_helpers.py b/pyteomics/auxiliary/file_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..d29e106532c78eb9ae04ed193b01c4f1a4fae064
--- /dev/null
+++ b/pyteomics/auxiliary/file_helpers.py
@@ -0,0 +1,1250 @@
+import sys
+import codecs
+import re
+from functools import wraps
+from contextlib import contextmanager
+from collections import OrderedDict, defaultdict
+import json
+import multiprocessing as mp
+import threading
+import warnings
+import os
+from abc import ABCMeta
+
+try:
+    basestring
+except NameError:
+    basestring = (str, bytes)
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+try:
+    import dill
+except ImportError:
+    dill = None
+    try:
+        import cPickle as pickle
+    except ImportError:
+        import pickle
+    serializer = pickle
+else:
+    serializer = dill
+
+try:
+    from queue import Empty
+except ImportError:
+    from Queue import Empty
+
+try:
+    from collections.abc import Sequence
+except ImportError:
+    from collections import Sequence
+
+from .structures import PyteomicsError
+from .utils import add_metaclass
+
+
+def _keepstate(func):
+    """Decorator to help keep the position in open files passed as
+    positional arguments to functions."""
+    @wraps(func)
+    def wrapped(*args, **kwargs):
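+        # Record the current position for each argument that looks like an open
+        # file: without a 'seek' attribute the `and` short-circuits to None;
+        # with it, tell() is called (falling back to NoneType(), i.e. None).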
+        positions = [getattr(arg, 'seek', None) and getattr(arg, 'tell', type(None))() for arg in args]
+        for arg, pos in zip(args, positions):
+            if pos is not None:
+                arg.seek(0)
+        res = func(*args, **kwargs)
+        for arg, pos in zip(args, positions):
+            if pos is not None:
+                try:
+                    arg.seek(pos)
+                except ValueError:
+                    pass
+        return res
+    return wrapped
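+
+# A minimal usage sketch of `_keepstate` (the helper below is hypothetical,
+# for illustration only): the decorated function consumes the open file from
+# the beginning, and the caller gets the file back at its original position.
+#
+#     @_keepstate
+#     def count_lines(source):
+#         return sum(1 for _ in source)
+#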
+
+
+def _keepstate_method(func):
+    """Decorator for :py:class:`FileReader` methods to help keep the position
+    in the underlying file.
+    """
+    @wraps(func)
+    def wrapped(self, *args, **kwargs):
+        position = self.tell()
+        self.seek(0)
+        try:
+            return func(self, *args, **kwargs)
+        finally:
+            self.seek(position)
+    return wrapped
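+
+# A sketch of the intended use of `_keepstate_method` (illustrative only):
+# inside a :py:class:`FileReader` subclass, a method that reads from the start
+# of the file can be decorated so the reader's position is restored afterwards.
+#
+#     class MyReader(FileReader):
+#         @_keepstate_method
+#         def peek(self):
+#             return self.read(1)  # reads from the start; position is restored
+#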
+
+
+class _file_obj(object):
+    """Check if `f` is a file name and open the file in `mode`.
+    A context manager."""
+
+    def __init__(self, f, mode, encoding=None):
+        self._file_spec = None
+        self.mode = mode
+        if f is None:
+            self.file = {'r': sys.stdin, 'a': sys.stdout, 'w': sys.stdout}[mode[0]]
+            self._file_spec = None
+        elif isinstance(f, basestring):
+            self.file = codecs.open(f, mode, encoding)
+            self._file_spec = f
+        else:
+            self._file_spec = f
+            self.file = f
+        self.encoding = getattr(self.file, 'encoding', encoding)
+        self.close_file = (self.file is not f)
+
+    def __enter__(self):
+        return self
+
+    def __reduce_ex__(self, protocol):
+        return self.__class__, (self._file_spec, self.mode, self.encoding)
+
+    def __exit__(self, *args, **kwargs):
+        if (not self.close_file) or self._file_spec is None:
+            return  # do nothing
+        # clean up
+        exit = getattr(self.file, '__exit__', None)
+        if exit is not None:
+            return exit(*args, **kwargs)
+        else:
+            exit = getattr(self.file, 'close', None)
+            if exit is not None:
+                exit()
+
+    def __getattr__(self, attr):
+        return getattr(self.file, attr)
+
+    def __iter__(self):
+        return iter(self.file)
+
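+# A usage sketch of `_file_obj` (the file name is hypothetical, for
+# illustration only): both a path and an already-open file object are accepted.
+#
+#     with _file_obj('spectra.txt', 'r', encoding='utf-8') as f:
+#         first = f.readline()  # the underlying file is closed on exit
+#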
+
+class NoOpBaseReader(object):
+    """A do-nothing base class that absorbs constructor arguments,
+    terminating the cooperative ``__init__`` chain of the reader mixins."""
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+
+class IteratorContextManager(NoOpBaseReader):
+    def __init__(self, *args, **kwargs):
+        self._func = kwargs.pop('parser_func')
+        self._args = args
+        self._kwargs = kwargs
+        if type(self) is IteratorContextManager:
+            self.reset()
+        super(IteratorContextManager, self).__init__(*args, **kwargs)
+
+    def __getstate__(self):
+        state = {}
+        state['_iterator_args'] = self._args
+        state['_iterator_kwargs'] = self._kwargs
+        return state
+
+    def __setstate__(self, state):
+        self._args = state['_iterator_args']
+        self._kwargs = state['_iterator_kwargs']
+
+    def reset(self):
+        """Resets the iterator to its initial state."""
+        try:
+            self._reader = self._func(*self._args, **self._kwargs)
+        except Exception:
+            self.__exit__(*sys.exc_info())
+            raise
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        pass
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        return next(self._reader)
+
+    next = __next__
+
+
+@add_metaclass(ABCMeta)
+class FileReader(IteratorContextManager):
+    """Abstract class implementing context manager protocol
+    for file readers.
+    """
+
+    def __init__(self, source, **kwargs):
+        func = kwargs['parser_func']
+        super(FileReader, self).__init__(*kwargs['args'], parser_func=func, **kwargs['kwargs'])
+        self._pass_file = kwargs['pass_file']
+        self._source_init = source
+        self._mode = kwargs['mode']
+        self._encoding = kwargs.get('encoding')
+        self.reset()
+
+    def reset(self):
+        if hasattr(self, '_source'):
+            self._source.__exit__(None, None, None)
+        self._source = _file_obj(self._source_init, self._mode, self._encoding)
+        try:
+            if self._pass_file:
+                self._reader = self._func(
+                    self._source, *self._args, **self._kwargs)
+            else:
+                self._reader = self._func(*self._args, **self._kwargs)
+        except Exception:  # clean up on any error
+            self.__exit__(*sys.exc_info())
+            raise
+
+    def __exit__(self, *args, **kwargs):
+        self._source.__exit__(*args, **kwargs)
+
+    # delegate everything else to file object
+    def __getattr__(self, attr):
+        if attr == '_source':
+            raise AttributeError
+        return getattr(self._source, attr)
+
+
+def remove_bom(bstr):
+    return bstr.replace(codecs.BOM_LE, b'').lstrip(b"\x00")
+
+
+class IndexedReaderMixin(NoOpBaseReader):
+    """Common interface for :py:class:`IndexedTextReader` and :py:class:`IndexedXML`."""
+    @property
+    def index(self):
+        return self._offset_index
+
+    @property
+    def default_index(self):
+        return self._offset_index
+
+    def __len__(self):
+        return len(self._offset_index)
+
+    def __contains__(self, key):
+        return key in self._offset_index
+
+    def _item_from_offsets(self, offsets):
+        raise NotImplementedError
+
+    def get_by_id(self, elem_id):
+        index = self.default_index
+        if index is None:
+            raise PyteomicsError('Access by ID requires building an offset index.')
+        offsets = index[elem_id]
+        return self._item_from_offsets(offsets)
+
+    def get_by_ids(self, ids):
+        return [self.get_by_id(key) for key in ids]
+
+    def get_by_index(self, i):
+        try:
+            key = self.default_index.from_index(i, False)
+        except AttributeError:
+            raise PyteomicsError('Positional access requires building an offset index.')
+        return self.get_by_id(key)
+
+    def get_by_indexes(self, indexes):
+        return [self.get_by_index(i) for i in indexes]
+
+    def get_by_index_slice(self, s):
+        try:
+            keys = self.default_index.from_slice(s, False)
+        except AttributeError:
+            raise PyteomicsError('Positional access requires building an offset index.')
+        return self.get_by_ids(keys)
+
+    def get_by_key_slice(self, s):
+        keys = self.default_index.between(s.start, s.stop)
+        if s.step:
+            keys = keys[::s.step]
+        return self.get_by_ids(keys)
+
+    def __getitem__(self, key):
+        if isinstance(key, basestring):
+            return self.get_by_id(key)
+        if isinstance(key, int):
+            return self.get_by_index(key)
+        if isinstance(key, Sequence):
+            if not key:
+                return []
+            if isinstance(key[0], int):
+                return self.get_by_indexes(key)
+            if isinstance(key[0], basestring):
+                return self.get_by_ids(key)
+        if isinstance(key, slice):
+            for item in (key.start, key.stop, key.step):
+                if item is not None:
+                    break
+            if isinstance(item, int):
+                return self.get_by_index_slice(key)
+            if isinstance(item, basestring):
+                return self.get_by_key_slice(key)
+            if item is None:
+                return list(self)
+        raise PyteomicsError('Unsupported query key: {}'.format(key))
+
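+# A sketch of the lookup styles the mixin's __getitem__ above supports,
+# assuming a hypothetical indexed reader `reader` built with use_index=True:
+#
+#     reader['scan=1']      # a single entry, by ID
+#     reader[0]             # a single entry, by position
+#     reader[[0, 1, 2]]     # a list of entries, by positions
+#     reader[:5]            # a list of entries, by an index slice
+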
+
+class RTLocator(object):
+    """Helper that looks up scans by retention time through a binary
+    search over the reader's index."""
+    def __init__(self, reader):
+        self._reader = reader
+
+    def _get_scan_by_time(self, time):
+        """Retrieve the scan object for the specified scan time.
+
+        Parameters
+        ----------
+        time : float
+            The time to get the nearest scan from
+        Returns
+        -------
+        tuple: (scan_id, scan, scan_time)
+        """
+        if not self._reader.default_index:
+            raise PyteomicsError("This method requires the index. Please pass `use_index=True` during initialization")
+
+        scan_ids = tuple(self._reader.default_index)
+        lo = 0
+        hi = len(scan_ids)
+
+        best_match = None
+        best_error = float('inf')
+        best_time = None
+        best_id = None
+
+        if time == float('inf'):
+            scan = self._reader.get_by_id(scan_ids[-1])
+            return scan_ids[-1], scan, self._reader._get_time(scan)
+
+        while hi != lo:
+            mid = (hi + lo) // 2
+            sid = scan_ids[mid]
+            scan = self._reader.get_by_id(sid)
+            scan_time = self._reader._get_time(scan)
+            err = abs(scan_time - time)
+            if err < best_error:
+                best_error = err
+                best_match = scan
+                best_time = scan_time
+                best_id = sid
+            if scan_time == time:
+                return sid, scan, scan_time
+            elif (hi - lo) == 1:
+                return best_id, best_match, best_time
+            elif scan_time > time:
+                hi = mid
+            else:
+                lo = mid
+
+    def __getitem__(self, key):
+        if isinstance(key, (int, float)):
+            return self._get_scan_by_time(key)[1]
+        if isinstance(key, Sequence):
+            return [self._get_scan_by_time(t)[1] for t in key]
+        if isinstance(key, slice):
+            if key.start is None:
+                start_index = self._reader.default_index.from_index(0)
+            else:
+                start_index = self._get_scan_by_time(key.start)[0]
+            if key.stop is None:
+                stop_index = self._reader.default_index.from_index(-1)
+            else:
+                stop_index = self._get_scan_by_time(key.stop)[0]
+            return self._reader[start_index:stop_index:key.step]
+
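+# Sketch of retention-time lookup through RTLocator (values are placeholders;
+# time units follow the underlying file):
+#
+#     scan = reader.time[600.0]           # the scan nearest to RT 600
+#     scans = reader.time[300.0:600.0]    # all scans in an RT window
+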
+
+class TimeOrderedIndexedReaderMixin(IndexedReaderMixin):
+    @property
+    def time(self):
+        return self._time
+
+    def __init__(self, *args, **kwargs):
+        super(TimeOrderedIndexedReaderMixin, self).__init__(*args, **kwargs)
+        self._time = RTLocator(self)
+
+    @staticmethod
+    def _get_time(scan):
+        raise NotImplementedError
+
+
+class IndexedTextReader(IndexedReaderMixin, FileReader):
+    """Abstract class for text file readers that keep an index of records for random access.
+    This requires reading the file in binary mode."""
+
+    delimiter = None
+    label = None
+    block_size = 1000000
+    label_group = 1
+    _kw_keys = ['delimiter', 'label', 'block_size', 'label_group']
+
+    def __init__(self, source, **kwargs):
+        # the underlying _file_obj gets None as encoding
+        # to avoid transparent decoding of StreamReader on read() calls
+        encoding = kwargs.pop('encoding', 'utf-8')
+        super(IndexedTextReader, self).__init__(source, mode='rb', encoding=None, **kwargs)
+        self.encoding = encoding
+        for attr in self._kw_keys:
+            if attr in kwargs:
+                setattr(self, attr, kwargs.pop(attr))
+        self._offset_index = None
+        if not kwargs.pop('_skip_index', False):
+            self._offset_index = self.build_byte_index()
+
+    def __getstate__(self):
+        state = super(IndexedTextReader, self).__getstate__()
+        state['offset_index'] = self._offset_index
+        for key in self._kw_keys:
+            state[key] = getattr(self, key)
+        return state
+
+    def __setstate__(self, state):
+        super(IndexedTextReader, self).__setstate__(state)
+        self._offset_index = state['offset_index']
+        for key in self._kw_keys:
+            if key in state:
+                setattr(self, key, state[key])
+
+    def _chunk_iterator(self):
+        fh = self._source.file
+        delim = remove_bom(self.delimiter.encode(self.encoding))
+        buff = fh.read(self.block_size)
+        parts = buff.split(delim)
+        started_with_delim = buff.startswith(delim)
+        tail = parts[-1]
+        front = parts[:-1]
+        i = 0
+        for part in front:
+            i += 1
+            if part == b"":
+                continue
+            if i == 1:
+                if started_with_delim:
+                    yield delim + part
+                else:
+                    yield part
+            else:
+                yield delim + part
+        running = True
+        while running:
+            buff = fh.read(self.block_size)
+            if len(buff) == 0:
+                running = False
+                buff = tail
+            else:
+                buff = tail + buff
+            parts = buff.split(delim)
+            tail = parts[-1]
+            front = parts[:-1]
+            for part in front:
+                yield delim + part
+        yield delim + tail
+
+    def _generate_offsets(self):
+        i = 0
+        pattern = re.compile(remove_bom(self.label.encode(self.encoding)))
+        for chunk in self._chunk_iterator():
+            match = pattern.search(chunk)
+            if match:
+                label = match.group(self.label_group)
+                yield i, label.decode(self.encoding), match
+            i += len(chunk)
+        yield i, None, None
+
+    def build_byte_index(self):
+        index = OffsetIndex()
+        g = self._generate_offsets()
+        last_offset = 0
+        last_label = None
+        for offset, label, keyline in g:
+            if last_label is not None:
+                index[last_label] = (last_offset, offset)
+            last_label = label
+            last_offset = offset
+        assert last_label is None
+        return index
+
+    def _read_lines_from_offsets(self, start, end):
+        self._source.seek(start)
+        lines = self._source.read(end - start).decode(self.encoding).split('\n')
+        return lines
+
+
+class IndexSavingMixin(NoOpBaseReader):
+    """Common interface for :py:class:`IndexSavingXML` and :py:class:`IndexSavingTextReader`."""
+    _index_class = NotImplemented
+
+    @property
+    def _byte_offset_filename(self):
+        try:
+            path = self._source.name
+        except AttributeError:
+            return None
+        name, ext = os.path.splitext(path)
+        byte_offset_filename = '{}-{}-byte-offsets.json'.format(name, ext[1:])
+        return byte_offset_filename
+
+    def _check_has_byte_offset_file(self):
+        """Check if the file at :attr:`_byte_offset_filename` exists
+
+        Returns
+        -------
+        bool
+            Whether the file exists
+        """
+        path = self._byte_offset_filename
+        if path is None:
+            return False
+        return os.path.exists(path)
+
+    @classmethod
+    def prebuild_byte_offset_file(cls, path):
+        """Construct a new XML reader, build its byte offset index and
+        write it to file
+
+        Parameters
+        ----------
+        path : str
+            The path to the file to parse
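+
+        Examples
+        --------
+        A hypothetical one-off indexing pass (``'data.mzML'`` is a
+        placeholder path; any concrete subclass works)::
+
+            SomeIndexSavingReader.prebuild_byte_offset_file('data.mzML')
+            # writes 'data-mzML-byte-offsets.json' next to the data file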
+        """
+        with cls(path) as inst:
+            inst.write_byte_offsets()
+
+    def write_byte_offsets(self):
+        """Write the byte offsets in :attr:`_offset_index` to the file
+        at :attr:`_byte_offset_filename`
+        """
+        with open(self._byte_offset_filename, 'w') as f:
+            self._offset_index.save(f)
+
+    @_keepstate_method
+    def _build_index(self):
+        """Build the byte offset index by either reading these offsets
+        from the file at :attr:`_byte_offset_filename`, or falling back
+        to the method used by :class:`IndexedXML` if this operation fails
+        due to an IOError
+        """
+        if not self._use_index: return
+        try:
+            self._read_byte_offsets()
+        except (IOError, AttributeError, TypeError):
+            super(IndexSavingMixin, self)._build_index()
+
+    def _read_byte_offsets(self):
+        """Read the byte offset index JSON file at :attr:`_byte_offset_filename`
+        and populate :attr:`_offset_index`
+        """
+        with open(self._byte_offset_filename, 'r') as f:
+            index = self._index_class.load(f)
+            self._offset_index = index
+
+
+def _file_reader(_mode='r'):
+    # a lot of the code below is borrowed from
+    # http://stackoverflow.com/a/14095585/1258041
+    def decorator(_func):
+        """A decorator implementing the context manager protocol for functions
+        that read files.
+
+        Note: 'close' must be in kwargs! Otherwise it won't be respected.
+        """
+        @wraps(_func)
+        def helper(*args, **kwargs):
+            if args:
+                return FileReader(args[0], mode=_mode, parser_func=_func, pass_file=True, args=args[1:], kwargs=kwargs,
+                    encoding=kwargs.pop('encoding', None))
+            source = kwargs.pop('source', None)
+            return FileReader(source, mode=_mode, parser_func=_func, pass_file=True, args=(), kwargs=kwargs, encoding=kwargs.pop('encoding', None))
+        return helper
+    return decorator
+
+
+def _file_writer(_mode='w'):
+    def decorator(_func):
+        """A decorator that opens output files for writer functions.
+        """
+        @wraps(_func)
+        def helper(*args, **kwargs):
+            m = kwargs.pop('file_mode', _mode)
+            enc = kwargs.pop('encoding', None)
+            if len(args) > 1:
+                out_arg = args[1]
+            else:
+                out_arg = kwargs.pop('output', None)
+
+            with _file_obj(out_arg, m, encoding=enc) as out:
+                if len(args) > 1:
+                    call_args = (args[0], out) + args[2:]
+                    call_kwargs = kwargs
+                else:
+                    call_args = args
+                    call_kwargs = dict(output=out, **kwargs)
+                return _func(*call_args, **call_kwargs)
+        return helper
+    return decorator
+
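+# A sketch of how the two decorators above are typically applied (function
+# names are illustrative):
+#
+#     @_file_reader('r')
+#     def read_records(source):
+#         for line in source:
+#             yield line.strip()
+#
+#     @_file_writer('w')
+#     def write_records(records, output=None):
+#         for rec in records:
+#             output.write(rec + '\n')
+#
+# `read_records('in.txt')` then returns a FileReader usable as an iterator
+# and a context manager; `write_records(data, output='out.txt')` opens the
+# output file on the function's behalf.
+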
+
+class WritableIndex(object):
+    schema_version = (1, 0, 0)
+    _schema_version_tag_key = "@pyteomics_schema_version"
+
+    def _serializable_container(self):
+        container = {'index': list(self.items())}
+        return container
+
+    def save(self, fp):
+        container = self._serializable_container()
+        container[self._schema_version_tag_key] = self.schema_version
+        json.dump(container, fp)
+
+    @classmethod
+    def load(cls, fp):
+        container = json.load(fp, object_hook=OrderedDict)
+        version_tag = container.get(cls._schema_version_tag_key)
+        if version_tag is None:
+            # The legacy case, no special processing yet
+            inst = cls()
+            inst.schema_version = None
+            return inst
+        version_tag = tuple(version_tag)
+        index = container.get("index")
+        if version_tag < cls.schema_version:
+            # schema upgrade case, no special processing yet
+            inst = cls(index)
+            inst.schema_version = version_tag
+            return inst
+        # no need to upgrade
+        return cls(index)
+
+
+class OffsetIndex(OrderedDict, WritableIndex):
+    '''An augmented OrderedDict that formally wraps getting items by index
+    '''
+
+    def __init__(self, *args, **kwargs):
+        super(OffsetIndex, self).__init__(*args, **kwargs)
+        self._index_sequence = None
+
+    def _invalidate(self):
+        self._index_sequence = None
+
+    @property
+    def index_sequence(self):
+        """Keeps a cached copy of the :meth:`items` sequence
+        stored as a :class:`tuple` to avoid repeatedly copying
+        the sequence over many method calls.
+
+        Returns
+        -------
+        :class:`tuple`
+        """
+        if self._index_sequence is None:
+            self._index_sequence = tuple(self.items())
+        return self._index_sequence
+
+    def __setitem__(self, key, value):
+        self._invalidate()
+        return super(OffsetIndex, self).__setitem__(key, value)
+
+    def pop(self, *args, **kwargs):
+        self._invalidate()
+        return super(OffsetIndex, self).pop(*args, **kwargs)
+
+    def find(self, key, *args, **kwargs):
+        return self[key]
+
+    def from_index(self, index, include_value=False):
+        '''Get an entry by its integer index in the ordered sequence
+        of this mapping.
+
+        Parameters
+        ----------
+        index: int
+            The index to retrieve.
+        include_value: bool
+            Whether to return both the key and the value or just the key.
+            Defaults to :const:`False`.
+
+        Returns
+        -------
+        object:
+            If ``include_value`` is :const:`True`, a tuple of (key, value) at ``index``
+            else just the key at ``index``.
+        '''
+        items = self.index_sequence
+        if include_value:
+            return items[index]
+        else:
+            return items[index][0]
+
+    def from_slice(self, spec, include_value=False):
+        '''Get a slice along index in the ordered sequence
+        of this mapping.
+
+        Parameters
+        ----------
+        spec: slice
+            The slice over the range of indices to retrieve
+        include_value: bool
+            Whether to return both the key and the value or just the key.
+            Defaults to :const:`False`
+
+        Returns
+        -------
+        list:
+            If ``include_value`` is :const:`True`, a list of (key, value)
+            tuples, otherwise a list of keys, for the positions selected by ``spec``
+        '''
+        items = self.index_sequence
+        return [(k, v) if include_value else k for k, v in items[spec]]
+
+    def between(self, start, stop, include_value=False):
+        keys = list(self)
+        if start is not None:
+            try:
+                start_index = keys.index(start)
+            except ValueError:
+                raise KeyError(start)
+        else:
+            start_index = 0
+        if stop is not None:
+            try:
+                stop_index = keys.index(stop)
+            except ValueError:
+                raise KeyError(stop)
+        else:
+            stop_index = len(keys) - 1
+        if start is None or stop is None:
+            pass  # won't switch indices
+        else:
+            start_index, stop_index = min(start_index, stop_index), max(start_index, stop_index)
+
+        if include_value:
+            return [(k, self[k]) for k in keys[start_index:stop_index + 1]]
+        return keys[start_index:stop_index + 1]
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({items})"
+        return template.format(self=self, items=list(self.items()))
+
+    def _integrity_check(self):
+        indices = list(self.values())
+        sorted_indices = sorted(self.values())
+        return indices == sorted_indices
+
+    def sort(self):
+        sorted_pairs = sorted(self.items(), key=lambda x: x[1])
+        self.clear()
+        self._invalidate()
+        for key, value in sorted_pairs:
+            self[key] = value
+        return self
+
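+# A small sketch of OffsetIndex behavior (the offsets are made up):
+#
+#     >>> idx = OffsetIndex([('scan=1', (0, 100)), ('scan=2', (100, 250))])
+#     >>> idx.from_index(1)
+#     'scan=2'
+#     >>> idx.from_slice(slice(0, 2))
+#     ['scan=1', 'scan=2']
+#     >>> idx.between('scan=1', 'scan=2')
+#     ['scan=1', 'scan=2']
+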
+
+class IndexSavingTextReader(IndexSavingMixin, IndexedTextReader):
+    _index_class = OffsetIndex
+
+
+class HierarchicalOffsetIndex(WritableIndex):
+    _inner_type = OffsetIndex
+
+    def __init__(self, base=None):
+        self.mapping = defaultdict(self._inner_type)
+        for key, value in (base or {}).items():
+            self.mapping[key] = self._inner_type(value)
+
+    def _integrity_check(self):
+        for key, value in self.items():
+            if not value._integrity_check():
+                return False
+        return True
+
+    def sort(self):
+        for key, value in self.items():
+            value.sort()
+        return self
+
+    def __getitem__(self, key):
+        return self.mapping[key]
+
+    def __setitem__(self, key, value):
+        self.mapping[key] = value
+
+    def __iter__(self):
+        return iter(self.mapping)
+
+    def __len__(self):
+        return sum(len(group) for key, group in self.items())
+
+    def __contains__(self, key):
+        return key in self.mapping
+
+    def find(self, key, element_type=None):
+        if element_type is None:
+            for element_type in self.keys():
+                try:
+                    return self.find(key, element_type)
+                except KeyError:
+                    continue
+            raise KeyError(key)
+        else:
+            return self[element_type][key]
+
+    def find_no_type(self, key):
+        """Try to find `key` in each of the lower-level indexes, returning both
+        value and the element type that match the key."""
+        for element_type in self.keys():
+            try:
+                return self.find(key, element_type), element_type
+            except KeyError:
+                continue
+        raise KeyError(key)
+
+    def update(self, *args, **kwargs):
+        self.mapping.update(*args, **kwargs)
+
+    def pop(self, key, default=None):
+        return self.mapping.pop(key, default)
+
+    def keys(self):
+        return self.mapping.keys()
+
+    def values(self):
+        return self.mapping.values()
+
+    def items(self):
+        return self.mapping.items()
+
+    def _serializable_container(self):
+        encoded_index = {}
+        container = {
+            'keys': list(self.keys())
+        }
+        for key, offset in self.items():
+            encoded_index[key] = list(offset.items())
+        container['index'] = encoded_index
+        return container
+
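+# Sketch: a two-level index mapping element types to OffsetIndex instances
+# (all values are made up):
+#
+#     >>> h = HierarchicalOffsetIndex({'spectrum': {'scan=1': (0, 100)}})
+#     >>> h.find('scan=1')
+#     (0, 100)
+#     >>> h.find_no_type('scan=1')
+#     ((0, 100), 'spectrum')
+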
+
+def _make_chain(reader, readername, full_output=False):
+
+    def concat_results(*args, **kwargs):
+        results = [reader(arg, **kwargs) for arg in args]
+        if pd is not None and all(isinstance(a, pd.DataFrame) for a in results):
+            return pd.concat(results)
+        return np.concatenate(results)
+
+    def _iter(files, kwargs):
+        for f in files:
+            with reader(f, **kwargs) as r:
+                for item in r:
+                    yield item
+
+    def chain(*files, **kwargs):
+        return _iter(files, kwargs)
+
+    def from_iterable(files, **kwargs):
+        return _iter(files, kwargs)
+
+    @contextmanager
+    def _chain(*files, **kwargs):
+        yield chain(*files, **kwargs)
+
+    @contextmanager
+    def _from_iterable(files, **kwargs):
+        yield from_iterable(files, **kwargs)
+
+    def dispatch(*args, **kwargs):
+        return dispatch_from_iterable(args, **kwargs)
+
+    def dispatch_from_iterable(args, **kwargs):
+        if kwargs.get('full_output', full_output):
+            return concat_results(*args, **kwargs)
+        return _chain(*args, **kwargs)
+
+    dispatch.__doc__ = """Chain :py:func:`{0}` for several files.
+        Positional arguments should be file names or file objects.
+        Keyword arguments are passed to the :py:func:`{0}` function.
+        """.format(readername)
+    dispatch_from_iterable.__doc__ = """Chain :py:func:`{0}` for several files.
+        Keyword arguments are passed to the :py:func:`{0}` function.
+
+        Parameters
+        ----------
+        files : iterable
+            Iterable of file names or file objects.
+        """.format(readername)
+    dispatch.from_iterable = dispatch_from_iterable
+
+    return dispatch
+
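+# Sketch of how _make_chain is used downstream (names are illustrative):
+# a format module defines a `read` function and then exposes
+# `chain = _make_chain(read, 'read')`, after which
+#
+#     with chain('a.mzML', 'b.mzML') as items:
+#         for item in items:
+#             process(item)
+#
+# iterates over both files in sequence.
+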
+
+def _check_use_index(source, use_index, default):
+    try:
+        if use_index is not None:
+            use_index = bool(use_index)
+
+        # if a file name is given, do not override anything; short-circuit
+        if isinstance(source, basestring):
+            return use_index if use_index is not None else default
+
+        # collect information on source
+        if hasattr(source, 'seekable'):
+            seekable = source.seekable()
+        else:
+            seekable = None
+
+        if hasattr(source, 'mode'):
+            binary = 'b' in source.mode
+        else:
+            binary = None
+
+        # now check for conflicts
+        if seekable is False:
+            if binary:
+                raise PyteomicsError('Cannot work with non-seekable file in binary mode: {}.'.format(source))
+            if use_index:
+                warnings.warn('Cannot use indexing as {} is not seekable. Setting `use_index` to False.'.format(source))
+                use_index = False
+        elif binary is not None:
+            if use_index is not None and binary != use_index:
+                warnings.warn('use_index is {}, but the file mode is {}. '
+                    'Setting `use_index` to {}'.format(use_index, source.mode, binary))
+            use_index = binary
+        elif use_index is None:
+            warnings.warn('Could not check mode on {}. Specify `use_index` explicitly to avoid errors.'.format(source))
+
+        if use_index is not None:
+            return use_index
+
+        return default
+
+    except PyteomicsError:
+        raise
+    except Exception as e:
+        if use_index is None:
+            warnings.warn('Could not check mode on {}. Reason: {!r}. '
+                'Specify `use_index` explicitly to avoid errors.'.format(source, e))
+            return default
+        return use_index
+
+
+class FileReadingProcess(mp.Process):
+    """Process that does a share of distributed work on entries read from file.
+    Reconstructs a reader object, parses entries at the given indexes,
+    optionally does additional processing, and sends the results back.
+
+    The reader class must support the :py:meth:`__getitem__` dict-like lookup.
+    """
+
+    def __init__(self, reader_spec, target_spec, qin, qout, args_spec, kwargs_spec):
+        super(FileReadingProcess, self).__init__(name='pyteomics-map-worker')
+        self.reader_spec = reader_spec
+        self.target_spec = target_spec
+        self.args_spec = args_spec
+        self.kwargs_spec = kwargs_spec
+        self._qin = qin
+        self._qout = qout
+        self._done_flag = mp.Event()
+        self.daemon = True
+
+    def run(self):
+        reader = serializer.loads(self.reader_spec)
+        target = serializer.loads(self.target_spec)
+        args = serializer.loads(self.args_spec)
+        kwargs = serializer.loads(self.kwargs_spec)
+        for key in iter(self._qin.get, None):
+            item = reader[key]
+            if target is not None:
+                result = target(item, *args, **kwargs)
+            else:
+                result = item
+            self._qout.put(result)
+        self._done_flag.set()
+
+    def is_done(self):
+        return self._done_flag.is_set()
+
+
+try:
+    _NPROC = mp.cpu_count()
+except NotImplementedError:
+    _NPROC = 4
+_QUEUE_TIMEOUT = 4
+_QUEUE_SIZE = int(1e7)
+
+
+class TaskMappingMixin(NoOpBaseReader):
+    def __init__(self, *args, **kwargs):
+        '''
+        Instantiate a :py:class:`TaskMappingMixin` object, set default parameters for IPC.
+
+        Parameters
+        ----------
+
+        queue_timeout : float, keyword only, optional
+            The number of seconds to block, waiting for a result before checking to see if
+            all workers are done.
+        queue_size : int, keyword only, optional
+            The length of IPC queue used.
+        processes : int, keyword only, optional
+            Number of worker processes to spawn when :py:meth:`map` is called. This can also be
+            specified in the :py:meth:`map` call.
+        '''
+        self._queue_size = kwargs.pop('queue_size', _QUEUE_SIZE)
+        self._queue_timeout = kwargs.pop('queue_timeout', _QUEUE_TIMEOUT)
+        self._nproc = kwargs.pop('processes', _NPROC)
+        super(TaskMappingMixin, self).__init__(*args, **kwargs)
+
+    def _get_reader_for_worker_spec(self):
+        return self
+
+    def _build_worker_spec(self, target, args, kwargs):
+        serialized = []
+        for obj, objname in [(self._get_reader_for_worker_spec(), 'reader'), (target, 'target'), (args, 'args'),
+                             (kwargs, 'kwargs')]:
+            try:
+                serialized.append(serializer.dumps(obj))
+            except serializer.PicklingError:
+                msg = 'Could not serialize {0} {1} with {2.__name__}.'.format(objname, obj, serializer)
+                if serializer is not dill:
+                    msg += ' Try installing `dill`.'
+                raise PyteomicsError(msg)
+        return serialized
+
+    def _spawn_workers(self, specifications, in_queue, out_queue, processes):
+        reader_spec, target_spec, args_spec, kwargs_spec = specifications
+        workers = []
+        for _ in range(processes):
+            worker = FileReadingProcess(
+                reader_spec, target_spec, in_queue, out_queue, args_spec, kwargs_spec)
+            workers.append(worker)
+        return workers
+
+    def _spawn_feeder_thread(self, in_queue, iterator, processes):
+        def feeder():
+            for key in iterator:
+                in_queue.put(key)
+            for _ in range(processes):
+                in_queue.put(None)
+
+        feeder_thread = threading.Thread(target=feeder)
+        feeder_thread.daemon = True
+        feeder_thread.start()
+        return feeder_thread
+
+    def map(self, target=None, processes=-1, args=None, kwargs=None, **_kwargs):
+        """Execute the ``target`` function over entries of this object across up to ``processes``
+        processes.
+
+        Results will be returned out of order.
+
+        Parameters
+        ----------
+        target : :class:`Callable`, optional
+            The function to execute over each entry. It will be given a single object yielded by
+            the wrapped iterator as well as all of the values in ``args`` and ``kwargs``
+        processes : int, optional
+            The number of worker processes to use. If 0 or negative,
+            the value configured at reader creation is used
+            (by default, the number of available CPUs).
+        args : :class:`Sequence`, optional
+            Additional positional arguments to be passed to the target function
+        kwargs : :class:`Mapping`, optional
+            Additional keyword arguments to be passed to the target function
+        **_kwargs
+            Additional keyword arguments to be passed to the target function
+
+        Yields
+        ------
+        object
+            The work item returned by the target function.
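+
+        Examples
+        --------
+        A hypothetical sketch (``reader`` stands for any indexed reader
+        using this mixin; the key ``'m/z array'`` is illustrative)::
+
+            def peak_count(entry):
+                return len(entry['m/z array'])
+
+            for n in reader.map(peak_count, processes=4):
+                print(n)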
+        """
+        if self._offset_index is None:
+            raise PyteomicsError('The reader needs an index for map() calls. Create the reader with `use_index=True`.')
+
+        if processes < 1:
+            processes = self._nproc
+        iterator = self._task_map_iterator()
+
+        if args is None:
+            args = tuple()
+        else:
+            args = tuple(args)
+        if kwargs is None:
+            kwargs = dict()
+        else:
+            kwargs = dict(kwargs)
+        kwargs.update(_kwargs)
+
+        serialized = self._build_worker_spec(target, args, kwargs)
+
+        in_queue = mp.Queue(self._queue_size)
+        out_queue = mp.Queue(self._queue_size)
+
+        workers = self._spawn_workers(serialized, in_queue, out_queue, processes)
+        feeder_thread = self._spawn_feeder_thread(in_queue, iterator, processes)
+        for worker in workers:
+            worker.start()
+
+        def iterate():
+            while True:
+                try:
+                    result = out_queue.get(True, self._queue_timeout)
+                    yield result
+                except Empty:
+                    if all(w.is_done() for w in workers):
+                        break
+                    else:
+                        continue
+
+            feeder_thread.join()
+            for worker in workers:
+                worker.join()
+        return iterate()
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iteratable` to use when dealing work items onto the input IPC
+        queue used by :meth:`map`
+
+        Returns
+        -------
+        :class:`Iteratable`
+        """
+
+        return iter(self._offset_index.keys())
+
+
+class ChainBase(object):
+    """Chain :meth:`sequence_maker` for several sources into a
+    single iterable. Positional arguments should be sources like
+    file names or file objects. Keyword arguments are passed to
+    the :meth:`sequence_maker` function.
+
+    Parameters
+    ----------
+    sources : :class:`Iterable`
+        Sources for creating new sequences from, such as paths or
+        file-like objects
+    kwargs : :class:`Mapping`
+        Additional arguments used to instantiate each sequence
+    """
+
+    def __init__(self, *sources, **kwargs):
+        self.sources = sources
+        self.kwargs = kwargs
+        self._iterator = None
+
+    @classmethod
+    def from_iterable(cls, sources, **kwargs):
+        return cls(*sources, **kwargs)
+
+    @classmethod
+    def _make_chain(cls, sequence_maker):
+        if isinstance(sequence_maker, type):
+            tp = type('%sChain' % sequence_maker.__name__, (cls,), {
+                'sequence_maker': sequence_maker,
+                '__doc__': cls.__doc__.replace(':meth:`sequence_maker`', ':class:`{}`'.format(sequence_maker.__name__))
+            })
+        else:
+            tp = type('FunctionChain', (cls,), {
+                'sequence_maker': staticmethod(sequence_maker),
+                '__doc__': cls.__doc__.replace(':meth:`sequence_maker`', ':func:`{}`'.format(sequence_maker.__name__))
+            })
+        return tp
+
+    def sequence_maker(self, file):
+        raise NotImplementedError()
+
+    def _create_sequence(self, file):
+        return self.sequence_maker(file, **self.kwargs)
+
+    def _iterate_over_series(self):
+        for f in self.sources:
+            with self._create_sequence(f) as r:
+                for item in r:
+                    yield item
+
+    def __enter__(self):
+        self._iterator = iter(self._iterate_over_series())
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self._iterator = None
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self._iterator is None:
+            self._iterator = self._iterate_over_series()
+        return next(self._iterator)
+
+    def next(self):
+        return self.__next__()
+
+    def map(self, target=None, processes=-1, queue_timeout=_QUEUE_TIMEOUT, args=None, kwargs=None, **_kwargs):
+        """Execute the ``target`` function over entries of this object across up to ``processes``
+        processes.
+
+        Results will be returned out of order.
+
+        Parameters
+        ----------
+        target : :class:`Callable`, optional
+            The function to execute over each entry. It will be given a single object yielded by
+            the wrapped iterator as well as all of the values in ``args`` and ``kwargs``
+        processes : int, optional
+            The number of worker processes to use. If negative, the number of processes
+            will match the number of available CPUs.
+        queue_timeout : float, optional
+            The number of seconds to block, waiting for a result before checking to see if
+            all workers are done.
+        args : :class:`Sequence`, optional
+            Additional positional arguments to be passed to the target function
+        kwargs : :class:`Mapping`, optional
+            Additional keyword arguments to be passed to the target function
+        **_kwargs
+            Additional keyword arguments to be passed to the target function
+
+        Yields
+        ------
+        object
+            The work item returned by the target function.
+        """
+        for f in self.sources:
+            with self._create_sequence(f) as r:
+                # `queue_timeout` is not forwarded: the underlying `map` does not
+                # accept it positionally and takes its timeout from reader creation
+                for result in r.map(target, processes, args=args, kwargs=kwargs, **_kwargs):
+                    yield result
+
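+# Sketch: binding ChainBase to a reader class (names are placeholders):
+#
+#     ReaderChain = ChainBase._make_chain(SomeReader)
+#     with ReaderChain('a.mzML', 'b.mzML') as items:
+#         for item in items:
+#             process(item)
+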
+
+class TableJoiner(ChainBase):
+    def concatenate(self, results):
+        if pd is not None and all(isinstance(a, pd.DataFrame) for a in results):
+            return pd.concat(results)
+        if isinstance(results[0], np.ndarray):
+            return np.concatenate(results)
+        else:
+            return np.array([b for a in results for b in a])
+
+    def _iterate_over_series(self):
+        results = [self._create_sequence(f) for f in self.sources]
+        return self.concatenate(results)
diff --git a/pyteomics/auxiliary/math.py b/pyteomics/auxiliary/math.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f1f46a72bba707d928b5c3d96a119aa6e1a125e
--- /dev/null
+++ b/pyteomics/auxiliary/math.py
@@ -0,0 +1,97 @@
+from .structures import PyteomicsError
+
+
+def linear_regression_vertical(x, y=None, a=None, b=None):
+    """Calculate coefficients of a linear regression y = a * x + b.
+    The fit minimizes *vertical* distances between the points and the line.
+
+    Requires :py:mod:`numpy`.
+
+    Parameters
+    ----------
+    x, y : array_like of float
+        1-D arrays of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
+    a : float, optional
+        If specified then the slope coefficient is fixed and equals a.
+    b : float, optional
+        If specified then the free term is fixed and equals b.
+
+    Returns
+    -------
+    out : 4-tuple of float
+        The structure is (a, b, r, stderr), where
+        a -- slope coefficient,
+        b -- free term,
+        r -- Pearson correlation coefficient,
+        stderr -- standard deviation of the residuals.
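+
+    Examples
+    --------
+    A sketch on perfectly collinear input, where the fit is exact:
+
+    >>> a, b, r, stderr = linear_regression_vertical([0., 1., 2.], [1., 3., 5.])
+    >>> abs(a - 2.) < 1e-10 and abs(b - 1.) < 1e-10
+    True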
+    """
+
+    import numpy as np
+    x = np.array(x, copy=False)
+    if y is not None:
+        y = np.array(y, copy=False)
+    else:
+        if len(x.shape) != 2 or x.shape[-1] != 2:
+            raise PyteomicsError(
+                'If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
+        y = x[:, 1]
+        x = x[:, 0]
+    if (a is not None and b is None):
+        b = (y - a * x).mean()
+    elif (a is not None and b is not None):
+        pass
+    else:
+        a, b = np.polyfit(x, y, 1)
+
+    r = np.corrcoef(x, y)[0, 1]
+    stderr = (y - a * x - b).std()
+
+    return a, b, r, stderr
+
+
+def linear_regression(x, y=None, a=None, b=None):
+    """Alias of :py:func:`linear_regression_vertical`."""
+    return linear_regression_vertical(x, y, a, b)
+
+
+def linear_regression_perpendicular(x, y=None):
+    """Calculate coefficients of a linear regression y = a * x + b.
+    The fit minimizes *perpendicular* distances between the points and the line.
+
+    Requires :py:mod:`numpy`.
+
+    Parameters
+    ----------
+    x, y : array_like of float
+        1-D arrays of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
+
+    Returns
+    -------
+    out : 4-tuple of float
+        The structure is (a, b, r, stderr), where
+        a -- slope coefficient,
+        b -- free term,
+        r -- Pearson correlation coefficient,
+        stderr -- standard deviation of the residuals.
+    """
+
+    import numpy as np
+    x = np.array(x, copy=False)
+    if y is not None:
+        y = np.array(y, copy=False)
+        data = np.hstack((x.reshape((-1, 1)), y.reshape((-1, 1))))
+    else:
+        if len(x.shape) != 2 or x.shape[-1] != 2:
+            raise PyteomicsError(
+                'If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape))
+        data = x
+    mu = data.mean(axis=0)
+    eigenvectors, eigenvalues, V = np.linalg.svd((data - mu).T, full_matrices=False)
+    a = eigenvectors[0][1] / eigenvectors[0][0]
+    xm, ym = data.mean(axis=0)
+    b = ym - a * xm
+
+    r = np.corrcoef(data[:, 0], data[:, 1])[0, 1]
+    stderr = ((data[:, 1] - a * data[:, 0] - b) / np.sqrt(a**2 + 1)).std()
+
+    return a, b, r, stderr
diff --git a/pyteomics/auxiliary/patch.py b/pyteomics/auxiliary/patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pyteomics/auxiliary/structures.py b/pyteomics/auxiliary/structures.py
new file mode 100644
index 0000000000000000000000000000000000000000..56e56e15eb13a4434f6440ade7e73c8cad8d248b
--- /dev/null
+++ b/pyteomics/auxiliary/structures.py
@@ -0,0 +1,504 @@
+import re
+from collections import defaultdict, Counter
+import warnings
+
+try:
+    basestring
+    PY2 = True
+except NameError:
+    basestring = (str, bytes)
+    PY2 = False
+
+
+_UNIT_CV_INTERN_TABLE = dict()
+
+
+def clear_unit_cv_table():
+    """Clear the module-level unit name and
+    controlled vocabulary accession table.
+    """
+    _UNIT_CV_INTERN_TABLE.clear()
+
+
+def _intern_unit_or_cv(unit_or_cv):
+    """Intern `unit_or_cv` in :const:`~._UNIT_CV_INTERN_TABLE`, potentially
+    keeping a reference to the object stored for the duration of the program.
+
+    Parameters
+    ----------
+    unit_or_cv : object
+        The value to intern
+
+    Returns
+    -------
+    object:
+        The object which `unit_or_cv` hash-equals in :const:`~._UNIT_CV_INTERN_TABLE`.
+    """
+    if unit_or_cv is None:
+        return None
+    try:
+        return _UNIT_CV_INTERN_TABLE[unit_or_cv]
+    except KeyError:
+        _UNIT_CV_INTERN_TABLE[unit_or_cv] = unit_or_cv
+        return _UNIT_CV_INTERN_TABLE[unit_or_cv]
+
+
+class PyteomicsError(Exception):
+    """Exception raised for errors in Pyteomics library.
+
+    Attributes
+    ----------
+    message : str
+        Error message.
+    """
+
+    def __init__(self, msg, *values):
+        self.message = msg
+        self.values = values
+
+    def __str__(self):
+        if not self.values:
+            return "Pyteomics error, message: %s" % (repr(self.message),)
+        else:
+            return "Pyteomics error, message: %s %r" % (repr(self.message), self.values)
+
+
+class Charge(int):
+    """A subclass of :py:class:`int`. Can be constructed from strings in "N+"
+    or "N-" format, and the string representation of a :py:class:`Charge` is
+    also in that format.
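+
+    Examples
+    --------
+    >>> Charge('2+')
+    2
+    >>> str(Charge(-3))
+    '3-'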
+    """
+    def __new__(cls, *args, **kwargs):
+        try:
+            return super(Charge, cls).__new__(cls, *args)
+        except ValueError as e:
+            if isinstance(args[0], basestring):
+                try:
+                    num, sign = re.match(r'^(\d+)(\+|-)$', args[0]).groups()
+                    return super(Charge, cls).__new__(cls, sign + num, *args[1:], **kwargs)
+                except Exception:
+                    pass
+            raise PyteomicsError(*e.args)
+
+    def __str__(self):
+        return str(abs(self)) + '+-'[self < 0]
+
+
+class Ion(str):
+    """Represents an Ion, right now just a subclass of String.
+    """
+    _pattern = r'([abcxyz]\d+(\-H2O|\-NH3)?)([\+|-]\d+)'  # "y2-H2O+1"
+
+    def __init__(self, *args, **kwargs):
+        if args and isinstance(args[0], basestring):
+            try:
+                self.ion_type, self.neutral_loss, self.charge = re.match(self._pattern, args[0]).groups()
+            except Exception:
+                raise PyteomicsError("Malformed ion string, must match the regex {!r}".format(self._pattern))
+
+
+class ChargeList(list):
+    """Just a list of :py:class:`Charge`s. When printed, looks like an
+    enumeration of the list contents. Can also be constructed from such
+    strings (e.g. "2+, 3+ and 4+").
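+
+    Examples
+    --------
+    >>> str(ChargeList('2+, 3+ and 4+'))
+    '2+, 3+ and 4+'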
+    """
+
+    def __init__(self, *args, **kwargs):
+        if args and isinstance(args[0], basestring):
+            delim = r'(?:,\s*)|(?:\s*and\s*)'
+            self.extend(map(Charge, re.split(delim, args[0])))
+        else:
+            try:
+                super(ChargeList, self).__init__(
+                    sorted(set(args[0])), *args[1:], **kwargs)
+            except Exception:
+                super(ChargeList, self).__init__(*args, **kwargs)
+            self[:] = map(Charge, self)
+
+    def __str__(self):
+        if len(self) > 1:
+            return ', '.join(map(str, self[:-1])) + ' and {}'.format(self[-1])
+        elif self:
+            return str(self[0])
+        return super(ChargeList, self).__str__()
+
+
+def _parse_charge(s, list_only=False):
+    if not list_only:
+        try:
+            return Charge(s)
+        except PyteomicsError:
+            pass
+    return ChargeList(s)
+
+
+def _parse_ion(ion_text):
+    try:
+        return Ion(ion_text)
+    except Exception as e:
+        warnings.warn('Could not parse ion string: {} ({})'.format(ion_text, e.args[0]))
+
+
+class BasicComposition(defaultdict, Counter):
+    """A generic dictionary for compositions.
+    Keys should be strings, values should be integers.
+    Allows simple arithmetics."""
+
+    def __init__(self, *args, **kwargs):
+        defaultdict.__init__(self, int)
+        Counter.__init__(self, *args, **kwargs)
+        for k, v in list(self.items()):
+            if not v:
+                del self[k]
+
+    def __str__(self):
+        return '{}({})'.format(type(self).__name__, dict.__repr__(self))
+
+    def __repr__(self):
+        return str(self)
+
+    def _repr_pretty_(self, p, cycle):
+        if cycle:  # should never happen
+            p.text('{} object with a cyclic reference'.format(type(self).__name__))
+        p.text(str(self))
+
+    def __add__(self, other):
+        result = self.copy()
+        for elem, cnt in other.items():
+            result[elem] += cnt
+        return result
+
+    def __iadd__(self, other):
+        for elem, cnt in other.items():
+            self[elem] += cnt
+        return self
+
+    def __radd__(self, other):
+        return self + other
+
+    def __sub__(self, other):
+        result = self.copy()
+        for elem, cnt in other.items():
+            result[elem] -= cnt
+        return result
+
+    def __isub__(self, other):
+        for elem, cnt in other.items():
+            self[elem] -= cnt
+        return self
+
+    def __rsub__(self, other):
+        return (self - other) * (-1)
+
+    def __mul__(self, other):
+        if not isinstance(other, int):
+            raise PyteomicsError('Cannot multiply Composition by non-integer',
+                                 other)
+        return type(self)({k: v * other for k, v in self.items()})
+
+    def __imul__(self, other):
+        if not isinstance(other, int):
+            raise PyteomicsError('Cannot multiply Composition by non-integer',
+                                 other)
+        for elem in self:
+            self[elem] *= other
+        return self
+
+    def __rmul__(self, other):
+        return self * other
+
+    def __eq__(self, other):
+        if not isinstance(other, dict):
+            return False
+        self_items = {i for i in self.items() if i[1]}
+        other_items = {i for i in other.items() if i[1]}
+        return self_items == other_items
+
+    # override default behavior:
+    # we don't want to add 0's to the dictionary
+    def __missing__(self, key):
+        return 0
+
+    def __setitem__(self, key, value):
+        if isinstance(value, float):
+            value = int(round(value))
+        elif not isinstance(value, int):
+            raise PyteomicsError('Only integers allowed as values in '
+                                 'Composition, got {}.'.format(type(value).__name__))
+        if value:  # reject 0's
+            super(BasicComposition, self).__setitem__(key, value)
+        elif key in self:
+            del self[key]
+
+    def copy(self):
+        return type(self)(self)
+
+    def __reduce__(self):
+        class_, args, state, list_iterator, dict_iterator = super(
+            BasicComposition, self).__reduce__()
+        # Override the reduce of defaultdict so we do not provide the
+        # `int` type as the first argument
+        # which prevents from correctly unpickling the object
+        args = ()
+        return class_, args, state, list_iterator, dict_iterator
+
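+# A small sketch of the supported arithmetic (the composition of water,
+# doubled and tripled):
+#
+#     >>> water = BasicComposition(H=2, O=1)
+#     >>> water + water
+#     BasicComposition({'H': 4, 'O': 2})
+#     >>> water * 3
+#     BasicComposition({'H': 6, 'O': 3})
+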
+
+class _MappingOverAttributeProxy(object):
+    '''A replacement for __dict__ for unpickling an object which once
+    has __slots__ now but did not before.'''
+
+    def __init__(self, obj):
+        self.obj = obj
+
+    def __getitem__(self, key):
+        return getattr(self.obj, key)
+
+    def __setitem__(self, key, value):
+        setattr(self.obj, key, value)
+
+    def __contains__(self, key):
+        return hasattr(self.obj, key)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.obj})".format(self=self)
+
+
+class unitint(int):
+    '''Represents an integer value with a unit name.
+
+    Behaves identically to a built-in :class:`int` type.
+
+    Attributes
+    ----------
+    unit_info : :class:`str`
+        The name of the unit this value possesses.
+    '''
+    def __new__(cls, value, unit_info=None):
+        inst = int.__new__(cls, value)
+        inst.unit_info = unit_info
+        return inst
+
+    def __reduce__(self):
+        return self.__class__, (int(self), self.unit_info)
+
+    def _repr_pretty_(self, p, cycle):
+        base = super(unitint, self).__repr__()
+        if self.unit_info:
+            string = "%s %s" % (base, self.unit_info)
+        else:
+            string = base
+        p.text(string)
+
+
+class unitfloat(float):
+    '''Represents a float value with a unit name.
+
+    Behaves identically to a built-in :class:`float` type.
+
+    Attributes
+    ----------
+    unit_info : :class:`str`
+        The name of the unit this value possesses.
+    '''
+    __slots__ = ('unit_info', )
+
+    def __new__(cls, value, unit_info=None):
+        inst = float.__new__(cls, value)
+        inst.unit_info = unit_info
+        return inst
+
+    @property
+    def __dict__(self):
+        return _MappingOverAttributeProxy(self)
+
+    def __reduce__(self):
+        return self.__class__, (float(self), self.unit_info)
+
+    def _repr_pretty_(self, p, cycle):
+        base = super(unitfloat, self).__repr__()
+        if self.unit_info:
+            string = "%s %s" % (base, self.unit_info)
+        else:
+            string = base
+        p.text(string)
+
+
+class unitstr(str):
+    '''Represents a string value with a unit name.
+
+    Behaves identically to a built-in :class:`str` type.
+
+    Attributes
+    ----------
+    unit_info : :class:`str`
+        The name of the unit this value possesses.
+    '''
+    if not PY2:
+        __slots__ = ("unit_info", )
+
+    def __new__(cls, value, unit_info=None):
+        if PY2 and isinstance(value, unicode):
+            value = value.encode('utf-8')
+        inst = str.__new__(cls, value)
+        inst.unit_info = unit_info
+        return inst
+
+    @property
+    def __dict__(self):
+        return _MappingOverAttributeProxy(self)
+
+    def __reduce__(self):
+        return self.__class__, (str(self), self.unit_info)
+
+    def _repr_pretty_(self, p, cycle):
+        base = super(unitstr, self).__repr__()
+        if self.unit_info:
+            string = "%s %s" % (base, self.unit_info)
+        else:
+            string = base
+        p.text(string)
+
+
+class cvstr(str):
+    '''A helper class to associate a controlled vocabulary accession
+    number with an otherwise plain :class:`str` object
+
+    Attributes
+    ----------
+    accession : str
+        The accession number for this parameter, e.g. MS:1000040
+    unit_accession : str
+        The accession number for the unit of the value, if any
+    '''
+
+    if not PY2:
+        __slots__ = ('accession', 'unit_accession')
+
+    _cache = {}
+
+    def __new__(cls, value, accession=None, unit_accession=None):
+        try:
+            inst = cls._cache[value]
+            if inst.accession == accession and inst.unit_accession == unit_accession:
+                return inst
+        except KeyError:
+            pass
+
+        if PY2 and isinstance(value, unicode):
+            value = value.encode('utf-8')
+        inst = str.__new__(cls, value)
+        inst.accession = _intern_unit_or_cv(accession)
+        inst.unit_accession = _intern_unit_or_cv(unit_accession)
+        cls._cache[value] = inst
+        return inst
+
+    @property
+    def __dict__(self):
+        return _MappingOverAttributeProxy(self)
+
+    def __reduce__(self):
+        return self.__class__, (str(self), self.accession, self.unit_accession)
+
+
+class CVQueryEngine(object):
+    '''Traverse an arbitrarily nested dictionary looking
+    for keys which are :class:`cvstr` instances, or objects
+    with an attribute called ``accession``.
+    '''
+
+    def _accession(self, key):
+        return getattr(key, 'accession', None)
+
+    def _query_dict(self, data, accession):
+        for key, value in data.items():
+            if self._accession(key) == accession:
+                if not isinstance(value, str) or value != '':
+                    return value
+                else:
+                    return key
+            elif isinstance(value, dict):
+                inner = self._query_dict(value, accession)
+                if inner is not None:
+                    return inner
+            elif isinstance(value, (list, tuple)):
+                inner = self._query_sequence(value, accession)
+                if inner is not None:
+                    return inner
+            elif self._accession(value) == accession:
+                return value
+
+    def _query_sequence(self, data, accession):
+        for value in data:
+            if isinstance(value, dict):
+                inner = self._query_dict(value, accession)
+                if inner is not None:
+                    return inner
+            elif isinstance(value, (list, tuple)):
+                inner = self._query_sequence(value, accession)
+                if inner is not None:
+                    return inner
+            elif self._accession(value) == accession:
+                return value
+
+    def query(self, data, accession):
+        '''Search ``data`` for a key with the accession
+        number ``accession``. Returns :const:`None` if
+        not found.
+        '''
+        if accession is None:
+            raise TypeError("`accession` cannot be None")
+        return self._query_dict(data, accession)
+
+    def _is_empty(self, value):
+        if isinstance(value, basestring):
+            return value == ''
+        return False
+
+    def _walk_dict(self, data, index):
+        for key, value in data.items():
+            accession = self._accession(key)
+            if accession:
+                if not self._is_empty(value):
+                    index[accession] = value
+                else:
+                    index[accession] = key
+            elif isinstance(value, dict):
+                self._walk_dict(value, index)
+            elif isinstance(value, (list, tuple)):
+                self._walk_sequence(value, index)
+            accession = self._accession(value)
+            if accession:
+                index[accession] = value
+        return index
+
+    def _walk_sequence(self, data, index):
+        for value in data:
+            if isinstance(value, dict):
+                self._walk_dict(value, index)
+            elif isinstance(value, (list, tuple)):
+                self._walk_sequence(value, index)
+            else:
+                accession = self._accession(value)
+                if accession:
+                    index[accession] = value
+
+    def index(self, data):
+        '''Construct a flat :class:`dict` whose keys are the
+        accession numbers for all qualified keys in ``data``
+        and whose values are the mapped values from ``data``.
+        '''
+        index = self._walk_dict(data, {})
+        return index
+
+    def __call__(self, data, accession=None):
+        '''If ``accession`` is :const:`None`, calls
+        :meth:`index` on ``data``, otherwise calls
+        :meth:`query` with ``data`` and ``accession``.
+        '''
+        if accession is None:
+            return self.index(data)
+        else:
+            return self.query(data, accession)
+
+cvquery = CVQueryEngine()
+'''A ready-to-use instance of :class:`~.CVQueryEngine`'''
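+
+# A minimal usage sketch (`cvstr` is the annotated string class defined
+# above; the accession value is illustrative):
+#
+#     >>> doc = {cvstr('scan start time', accession='MS:1000016'): 42.0}
+#     >>> cvquery(doc, 'MS:1000016')
+#     42.0
+#     >>> cvquery(doc)
+#     {'MS:1000016': 42.0}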
diff --git a/pyteomics/auxiliary/target_decoy.py b/pyteomics/auxiliary/target_decoy.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c563deafdc810b67d2d084f8b105b846dbea932
--- /dev/null
+++ b/pyteomics/auxiliary/target_decoy.py
@@ -0,0 +1,997 @@
+from __future__ import absolute_import
+import re
+import operator as op
+import math
+
+try:
+    basestring
+except NameError:
+    basestring = (str, bytes)
+
+try:
+    from collections.abc import Container, Sized
+except ImportError:
+    from collections import Container, Sized
+from bisect import bisect_right
+from contextlib import contextmanager
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
+
+from .structures import PyteomicsError
+from .file_helpers import _keepstate, IteratorContextManager, _make_chain, ChainBase, TableJoiner
+
+
+def _fix_docstring(f, **defaults):
+    for argname, v in defaults.items():
+        if v is not None:
+            f.__doc__ = re.sub('{} : .*'.format(argname),
+                               lambda m: m.group() + ', optional', f.__doc__)
+
+
+def _calculate_qvalues(scores, isdecoy, peps=False, **kwargs):
+    """Actual q-value calculation.
+
+    Parameters
+    ----------
+    scores : numpy.ndarray
+        Sorted array of PSMs.
+    isdecoy : numpy.ndarray
+        Sorted array of bools (decoy/target) or floats (PEPs).
+
+    Returns
+    -------
+    out : numpy.ndarray
+        Calculated q-values.
+    """
+    correction = kwargs.pop('correction', 0)
+    ratio = kwargs.pop('ratio', 1)
+    if ratio == 0:
+        raise PyteomicsError('Size ratio cannot be zero!')
+    remove_decoy = kwargs.get('remove_decoy', False)
+    formula = kwargs.pop('formula', (2, 1)[bool(remove_decoy)])
+    if formula not in {1, 2}:
+        raise PyteomicsError('`formula` must be either 1 or 2')
+
+    # score_label = kwargs['score_label']
+    cumsum = isdecoy.cumsum(dtype=np.float64)
+    tfalse = cumsum.copy()
+    ind = np.arange(1., scores.shape[0] + 1., dtype=np.float64)
+
+    if peps:
+        q = cumsum / ind
+    else:
+        if isinstance(correction, int):
+            if correction == 1:
+                tfalse += 1
+            elif correction == 2:
+                p = 1. / (1. + ratio)
+                targ = ind - cumsum
+                for i in range(tfalse.size):
+                    tfalse[i] = _expectation(cumsum[i], targ[i], p)
+        elif 0 < correction < 1:
+            p = 1. / (1. + ratio)
+            targ = ind - cumsum
+            for i in range(tfalse.size):
+                tfalse[i] = _confidence_value(
+                    correction, cumsum[i], targ[i], p)
+        elif correction:
+            raise PyteomicsError('Invalid value for `correction`: {}.'.format(correction))
+
+        if formula == 1:
+            q = tfalse / (ind - cumsum) / ratio
+        else:
+            q = (cumsum + tfalse / ratio) / ind
+
+    # Make sure that q-values are equal for equal scores (conservatively)
+    # and that q-values are monotonic
+    for i in range(scores.size - 1, 0, -1):
+        if (scores[i] == scores[i - 1] or q[i - 1] > q[i]):
+            q[i - 1] = q[i]
+
+    return q
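+
+# Worked illustration (ratio = 1, no correction): with 1 decoy among the top
+# 10 PSMs, formula 1 gives q = 1 / (10 - 1) ~ 0.11 (decoys are excluded from
+# the reported set), while formula 2 gives q = (1 + 1) / 10 = 0.2.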
+
+
+def _qvalues_df(psms, keyf, isdecoy, **kwargs):
+    full = kwargs.get('full_output', False)
+    remove_decoy = kwargs.get('remove_decoy', False)
+    peps = kwargs.get('pep')
+    decoy_or_pep_label = _decoy_or_pep_label(**kwargs)
+    q_label = kwargs.setdefault('q_label', 'q')
+    score_label = kwargs.setdefault('score_label', 'score')
+    if callable(keyf):
+        keyf = psms.apply(keyf, axis=1)
+    if callable(isdecoy):
+        isdecoy = psms.apply(isdecoy, axis=1)
+    if not isinstance(keyf, basestring):
+        if psms.shape[0]:
+            psms[score_label] = keyf
+        else:
+            psms[score_label] = []
+        keyf = kwargs['score_label']
+    if not isinstance(isdecoy, basestring):
+        if psms.shape[0]:
+            psms[decoy_or_pep_label] = isdecoy
+        else:
+            psms[decoy_or_pep_label] = []
+        isdecoy = decoy_or_pep_label
+    reverse = kwargs.get('reverse', False)
+
+    if not full:  # create fields early
+        if peps is None:
+            fields = [(keyf, np.float64), (isdecoy, np.bool_),
+                      (q_label, np.float64)]
+        else:
+            fields = [(isdecoy, np.float64), (q_label, np.float64)]
+        dtype = np.dtype(fields)
+
+    psms.sort_values([keyf, isdecoy], ascending=[
+                     not reverse, True], inplace=True)
+
+    if not psms.shape[0]:
+        if full:
+            psms[q_label] = []
+            return psms
+        else:
+            return np.array([], dtype=dtype)
+
+    q = _calculate_qvalues(psms[keyf].values, psms[
+                           isdecoy].values, peps is not None, **kwargs)
+    if remove_decoy:
+        q = q[~psms[isdecoy].values]
+        psms = psms[~psms[isdecoy]].copy()
+    if not full:
+        psms_ = np.empty_like(q, dtype=dtype)
+        if peps is None:
+            psms_[keyf] = psms[keyf]
+        psms_[isdecoy] = psms[isdecoy]
+        psms_[q_label] = q
+        psms = psms_
+    else:
+        q_label = kwargs['q_label']
+        psms[q_label] = q
+    return psms
+
+
+def _decoy_or_pep_label(**kwargs):
+    peps = kwargs.get('pep')
+    return kwargs.get('decoy_label', 'is decoy') if peps is None else kwargs.get(
+        'pep_label', peps if isinstance(peps, basestring) else 'PEP')
+
+
+def _construct_dtype(*args, **kwargs):
+    full = kwargs.pop('full_output', False)
+    peps = kwargs.get('pep')
+    q_label = kwargs.setdefault('q_label', 'q')
+    score_label = kwargs.setdefault('score_label', 'score')
+
+    fields = [(score_label, np.float64),
+              (_decoy_or_pep_label(**kwargs),
+               np.bool_ if peps is None else np.float64),
+              (q_label, np.float64)]
+    # if all args are NumPy arrays with common dtype, use it in the output
+    if full:
+        dtypes = {getattr(arg, 'dtype', None) for arg in args}
+        if len(dtypes) == 1 and None not in dtypes:
+            psm_dtype = dtypes.pop()
+        else:
+            psm_dtype = np.object_
+        dtype = np.dtype(fields + [('psm', psm_dtype)])
+    else:
+        dtype = np.dtype(fields)
+    return dtype
+
+
+def _make_qvalues(read, is_decoy_prefix, is_decoy_suffix, key):
+    """Create a function that reads PSMs from a file and calculates q-values
+    for each value of `key`."""
+
+    def qvalues(*args, **kwargs):
+        """Read `args` and return a NumPy array with scores and q-values.
+        q-values are calculated either using TDA or based on provided values of PEP.
+
+        Requires :py:mod:`numpy` (and optionally :py:mod:`pandas`).
+
+        Parameters
+        ----------
+
+        positional args : file or str
+            Files to read PSMs from. All positional arguments are treated as
+            files. The rest of the arguments must be named.
+
+        key : callable / array-like / iterable / str, keyword only
+            If callable, a function used for sorting of PSMs. Should accept
+            exactly one argument (PSM) and return a number (the smaller the better).
+            If array-like, should contain scores for all given PSMs.
+            If string, it is used as a field name (PSMs must be in a record array
+            or a :py:class:`DataFrame`).
+
+            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        reverse : bool, keyword only, optional
+            If :py:const:`True`, then PSMs are sorted in descending order,
+            i.e. the value of the key function is higher for better PSMs.
+            Default is :py:const:`False`.
+
+        is_decoy : callable / array-like / iterable / str, keyword only
+            If callable, a function used to determine if the PSM is decoy or not.
+            Should accept exactly one argument (PSM) and return a truthy value if the
+            PSM should be considered decoy.
+            If array-like, should contain boolean values for all given PSMs.
+            If string, it is used as a field name (PSMs must be in a record array
+            or a :py:class:`DataFrame`).
+
+            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        decoy_prefix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name prefix to use to detect decoy matches. If you provide your own
+            `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+            Default is `"DECOY_"`.
+
+        decoy_suffix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name suffix to use to detect decoy matches. If you provide your own
+            `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.
+
+        pep : callable / array-like / iterable / str, keyword only, optional
+            If callable, a function used to determine the posterior error probability (PEP).
+            Should accept exactly one argument (PSM) and return a float.
+            If array-like, should contain float values for all given PSMs.
+            If string, it is used as a field name (PSMs must be in a record array
+            or a :py:class:`DataFrame`).
+
+            .. note:: If this parameter is given, then PEP values will be used to calculate
+               q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with:
+               `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`.
+               `key` can still be provided. Without `key`, PSMs will be sorted by PEP.
+
+        remove_decoy : bool, keyword only, optional
+            Defines whether decoy matches should be removed from the output.
+            Default is :py:const:`False`.
+
+            .. note:: If set to :py:const:`False`, then by default the decoy
+               PSMs will be taken into account when estimating FDR. Refer to the
+               documentation of :py:func:`fdr` for math; basically, if
+               `remove_decoy` is :py:const:`True`, then formula 1 is used
+               to control output FDR, otherwise it's formula 2. This can be
+               changed by overriding the `formula` argument.
+
+        formula : int, keyword only, optional
+            Can be either 1 or 2, defines which formula should be used for FDR
+            estimation. Default is 1 if `remove_decoy` is :py:const:`True`,
+            else 2 (see :py:func:`fdr` for definitions).
+
+        ratio : float, keyword only, optional
+            The size ratio between the decoy and target databases. Default is
+            1. In theory, the "size" of the database is the number of
+            theoretical peptides eligible for assignment to spectra that are
+            produced by *in silico* cleavage of that database.
+
+        correction : int or float, keyword only, optional
+            Possible values are 0, 1 and 2, or floating point numbers between 0 and 1.
+
+            0 (default): no correction;
+
+            1: enable "+1" correction. This accounts for the probability that a false
+            positive scores better than the first excluded decoy PSM;
+
+            2: this also corrects that probability for finite size of the sample,
+            so the correction will be slightly less than "+1".
+
+            If a floating point number
+            is given, then instead of the expectation value for the number of false PSMs,
+            the confidence value is used. The value of `correction` is then interpreted as
+            desired confidence level. E.g., if correction=0.95, then the calculated q-values
+            do not exceed the "real" q-values with 95% probability.
+
+            See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation.
+
+        q_label : str, optional
+            Field name for q-value in the output. Default is ``'q'``.
+
+        score_label : str, optional
+            Field name for score in the output. Default is ``'score'``.
+
+        decoy_label : str, optional
+            Field name for the decoy flag in the output. Default is ``'is decoy'``.
+
+        pep_label : str, optional
+            Field name for PEP in the output. Default is ``'PEP'``.
+
+        full_output : bool, keyword only, optional
+            If :py:const:`True`, then the returned array has PSM objects along
+            with scores and q-values. Default is :py:const:`False`.
+
+        **kwargs : passed to the :py:func:`chain` function.
+
+        Returns
+        -------
+        out : numpy.ndarray
+            A sorted array of records with the following fields:
+
+            - 'score': :py:class:`np.float64`
+            - 'is decoy': :py:class:`np.bool_`
+            - 'q': :py:class:`np.float64`
+            - 'psm': :py:class:`np.object_` (if `full_output` is :py:const:`True`)
+        """
+        import numpy as np
+
+        @_keepstate
+        def get_scores(*args, **kwargs):
+            scores = []
+            with read(*args, **kwargs) as f:
+                for i, psm in enumerate(f):
+                    row = []
+                    for func in (keyf, isdecoy):
+                        if callable(func):
+                            row.append(func(psm))
+                        elif isinstance(func, basestring):
+                            row.append(psm[func])
+                        else:
+                            row.append(func[i])
+                    row.append(None)
+                    if full:
+                        row.append(psm)
+                    scores.append(tuple(row))
+            return scores
+
+        peps = kwargs.get('pep', None)
+        if peps is not None:
+            x = {'is_decoy', 'remove_decoy', 'formula',
+                 'ratio', 'correction'}.intersection(kwargs)
+            if x:
+                raise PyteomicsError(
+                    "Can't use these parameters with `pep`: " + ', '.join(x))
+        keyf = kwargs.pop('key', key)
+        reverse = kwargs.get('reverse', False)
+        if keyf is None:
+            keyf = peps
+            if reverse:
+                raise PyteomicsError(
+                    'Cannot use reverse=True when sorting by PEP')
+
+        if not callable(keyf) and not isinstance(keyf, (Sized, Container)):
+            keyf = np.array(list(keyf))
+
+        if peps is None:
+            if 'is_decoy' not in kwargs:
+                if 'decoy_suffix' in kwargs:
+                    isdecoy = lambda x: is_decoy_suffix(x, kwargs['decoy_suffix'])
+                elif 'decoy_prefix' in kwargs:
+                    isdecoy = lambda x: is_decoy_prefix(x, kwargs['decoy_prefix'])
+                else:
+                    isdecoy = is_decoy_prefix
+            else:
+                isdecoy = kwargs['is_decoy']
+        else:
+            isdecoy = peps
+
+        if not callable(isdecoy) and not isinstance(isdecoy, (Sized, Container)):
+            isdecoy = np.array(list(isdecoy))
+
+        remove_decoy = kwargs.get('remove_decoy', False)
+        decoy_or_pep_label = _decoy_or_pep_label(**kwargs)
+        score_label = kwargs.setdefault('score_label', 'score')
+        q_label = kwargs.setdefault('q_label', 'q')
+        dtype = _construct_dtype(*args, **kwargs)
+        full = kwargs.get('full_output', False)
+        arr_flag = False
+        psms = None
+
+        # time to check arg type
+        if pd is not None and all(isinstance(arg, pd.DataFrame) for arg in args):
+            psms = pd.concat(args)
+            return _qvalues_df(psms, keyf, isdecoy, **kwargs)
+
+        if not all(isinstance(arg, np.ndarray) for arg in args):
+            if isinstance(keyf, basestring):
+                keyf = op.itemgetter(keyf)
+            if isinstance(isdecoy, basestring):
+                isdecoy = op.itemgetter(isdecoy)
+            if isinstance(peps, basestring):
+                peps = op.itemgetter(peps)
+
+        if callable(keyf) or callable(isdecoy):
+            kwargs.pop('full_output', None)
+            scores = np.array(get_scores(*args, **kwargs), dtype=dtype)
+        else:
+            if all(isinstance(arg, np.ndarray) for arg in args):
+                psms = np.concatenate(args)
+
+            if not isinstance(keyf, basestring):
+                keyf = np.array(keyf)
+                arr_flag = True
+            if not isinstance(isdecoy, basestring):
+                isdecoy = np.array(isdecoy)
+                arr_flag = True
+
+            if arr_flag:
+                scores = np.empty(keyf.size if hasattr(
+                    keyf, 'size') else isdecoy.size, dtype=dtype)
+                for func, label in zip((keyf, isdecoy), (score_label, decoy_or_pep_label)):
+                    if not isinstance(func, basestring):
+                        scores[label] = func
+                    else:
+                        scores[label] = psms[func]
+            else:
+                scores = np.empty(psms.shape[0], dtype=dtype)
+                scores[score_label] = psms[keyf]
+                scores[decoy_or_pep_label] = psms[isdecoy]
+
+        if not scores.size:
+            if full and psms is not None:
+                return psms
+            return scores
+
+        if not reverse:
+            keys = scores[decoy_or_pep_label], scores[score_label]
+        else:
+            keys = scores[decoy_or_pep_label], -scores[score_label]
+        lexsort = np.lexsort(keys)
+        scores = scores[lexsort]
+        if psms is not None:
+            psms = psms[lexsort]
+
+        scores[q_label] = _calculate_qvalues(scores[score_label], scores[
+                                             decoy_or_pep_label], peps is not None, **kwargs)
+        if remove_decoy:
+            if psms is not None:
+                psms = psms[~scores[decoy_or_pep_label]]
+            scores = scores[~scores[decoy_or_pep_label]]
+
+        if full and psms is not None:
+            if isinstance(psms, np.ndarray):
+                fields = sorted(psms.dtype.fields,
+                                key=lambda x: psms.dtype.fields[x][1])
+                extra = []
+                for func, label in zip((keyf, isdecoy), ('score', decoy_or_pep_label)):
+                    if not (isinstance(func, basestring) or label in psms.dtype.fields):
+                        extra.append(label)
+                    elif label in psms.dtype.fields:
+                        psms[label] = scores[label]
+                newdt = [(name, psms.dtype.fields[name][0]) for name in fields] + [
+                    (name, np.float64) for name in extra] + [(q_label, np.float64)]
+                psms_ = psms
+                psms = np.empty_like(psms_, dtype=newdt)
+                for f in fields:
+                    psms[f] = psms_[f]
+                for f in extra:
+                    psms[f] = scores[f]
+            else:
+                for func, label in zip((keyf, isdecoy), ('score', decoy_or_pep_label)):
+                    if not isinstance(label, basestring):
+                        psms[label] = scores[label]
+            psms[q_label] = scores[q_label]
+            return psms
+        return scores
+
+    _fix_docstring(qvalues, is_decoy=is_decoy_prefix, key=key)
+    if read is _iter:
+        qvalues.__doc__ = qvalues.__doc__.replace("""positional args : file or str
+            Files to read PSMs from. All positional arguments are treated as
+            files.""", """positional args : iterables
+            Iterables to read PSMs from. All positional arguments are chained."""
+                ).replace("""\n            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        decoy_prefix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name prefix to use to detect decoy matches. If you provide your own
+            `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+            Default is `"DECOY_"`.
+
+        decoy_suffix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name suffix to use to detect decoy matches. If you provide your own
+            `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "")
+
+    return qvalues
+
+
+def _make_filter(read, is_decoy_prefix, is_decoy_suffix, key, qvalues):
+    """Create a function that reads PSMs from a file and filters them to
+    the desired FDR level (estimated by TDA), returning the top PSMs
+    sorted by `key`.
+    """
+    def filter(*args, **kwargs):
+        try:
+            fdr = kwargs.pop('fdr')
+        except KeyError:
+            raise PyteomicsError('Keyword argument required: fdr')
+
+        args = [list(arg) if not isinstance(
+            arg, (Container, Sized)) else arg for arg in args]
+        peps = kwargs.get('pep')
+        if peps is None:
+            remove_decoy = kwargs.pop('remove_decoy', True)
+            scores = qvalues(*args, remove_decoy=remove_decoy, **kwargs)
+        else:
+            scores = qvalues(*args, **kwargs)
+        keyf = kwargs.pop('key', key)
+        if keyf is None:
+            keyf = peps
+        reverse = kwargs.pop('reverse', False)
+        better = [op.lt, op.gt][bool(reverse)]
+        if 'is_decoy' not in kwargs:
+            if 'decoy_suffix' in kwargs:
+                isdecoy = lambda x: is_decoy_suffix(x, kwargs['decoy_suffix'])
+            elif 'decoy_prefix' in kwargs:
+                isdecoy = lambda x: is_decoy_prefix(x, kwargs['decoy_prefix'])
+            else:
+                isdecoy = is_decoy_prefix
+        else:
+            isdecoy = kwargs['is_decoy']
+        kwargs.pop('formula', None)
+        decoy_or_pep_label = _decoy_or_pep_label(**kwargs)
+        score_label = kwargs.setdefault('score_label', 'score')
+        q_label = kwargs.get('q_label', 'q')
+
+        try:
+            i = scores[q_label].searchsorted(fdr, side='right')
+            if isinstance(i, Sized):
+                i = i[0]
+        except AttributeError:
+            i = bisect_right(scores[q_label], fdr)
+        if kwargs.pop('full_output', False):
+            if pd is not None and isinstance(scores, pd.DataFrame):
+                return scores.iloc[:i]
+            elif callable(keyf) or callable(isdecoy):
+                return scores['psm'][:i]
+            else:
+                return scores[:i]
+        elif not scores.size:
+            return (_ for _ in ())
+        if peps is None:
+            label = score_label
+        else:
+            label = decoy_or_pep_label
+        cutoff = scores[label][i] if i < scores.size else (
+            scores[label][-1] + (1, -1)[bool(reverse)])
+
+        def out():
+            with read(*args, **kwargs) as f:
+                for p, s in zip(f, scores):
+                    if peps is not None or not remove_decoy or not s[decoy_or_pep_label]:
+                        if better(s[label], cutoff):
+                            yield p
+        return out()
+
+    def _filter(*args, **kwargs):
+        """Read `args` and yield only the PSMs that form a set with
+        estimated false discovery rate (FDR) not exceeding `fdr`.
+
+        Requires :py:mod:`numpy` and, optionally, :py:mod:`pandas`.
+
+        Parameters
+        ----------
+        positional args : file or str
+            Files to read PSMs from. All positional arguments are treated as
+            files. The rest of the arguments must be named.
+
+        fdr : float, keyword only, 0 <= fdr <= 1
+            Desired FDR level.
+
+        key : callable / array-like / iterable / str, keyword only
+            A function used for sorting of PSMs. Should accept exactly one
+            argument (PSM) and return a number (the smaller the better). The
+            default is a function that tries to extract e-value from the PSM.
+
+            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        reverse : bool, keyword only, optional
+            If :py:const:`True`, then PSMs are sorted in descending order,
+            i.e. the value of the key function is higher for better PSMs.
+            Default is :py:const:`False`.
+
+        is_decoy : callable / array-like / iterable / str, keyword only
+            A function used to determine if the PSM is decoy or not. Should
+            accept exactly one argument (PSM) and return a truthy value if the
+            PSM should be considered decoy.
+
+            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        decoy_prefix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name prefix to use to detect decoy matches. If you provide your own
+            `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+            Default is `"DECOY_"`.
+
+        decoy_suffix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name suffix to use to detect decoy matches. If you provide your own
+            `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.
+
+        remove_decoy : bool, keyword only, optional
+            Defines whether decoy matches should be removed from the output.
+            Default is :py:const:`True`.
+
+            .. note:: If set to :py:const:`False`, then by default the decoy
+               PSMs will be taken into account when estimating FDR. Refer to the
+               documentation of :py:func:`fdr` for math; basically, if
+               `remove_decoy` is :py:const:`True`, then formula 1 is used
+               to control output FDR, otherwise it's formula 2. This can be
+               changed by overriding the `formula` argument.
+
+        formula : int, keyword only, optional
+            Can be either 1 or 2, defines which formula should be used for FDR
+            estimation. Default is 1 if `remove_decoy` is :py:const:`True`,
+            else 2 (see :py:func:`fdr` for definitions).
+
+        ratio : float, keyword only, optional
+            The size ratio between the decoy and target databases. Default is
+            1. In theory, the "size" of the database is the number of
+            theoretical peptides eligible for assignment to spectra that are
+            produced by *in silico* cleavage of that database.
+
+        correction : int or float, keyword only, optional
+            Possible values are 0, 1 and 2, or floating point numbers between 0 and 1.
+
+            0 (default): no correction;
+
+            1: enable "+1" correction. This accounts for the probability that a false
+            positive scores better than the first excluded decoy PSM;
+
+            2: this also corrects that probability for finite size of the sample,
+            so the correction will be slightly less than "+1".
+
+            If a floating point number
+            is given, then instead of the expectation value for the number of false PSMs,
+            the confidence value is used. The value of `correction` is then interpreted as
+            desired confidence level. E.g., if correction=0.95, then the calculated q-values
+            do not exceed the "real" q-values with 95% probability.
+
+            See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation.
+
+        pep : callable / array-like / iterable / str, keyword only, optional
+            If callable, a function used to determine the posterior error probability (PEP).
+            Should accept exactly one argument (PSM) and return a float.
+            If array-like, should contain float values for all given PSMs.
+            If string, it is used as a field name (PSMs must be in a record array
+            or a :py:class:`DataFrame`).
+
+            .. note:: If this parameter is given, then PEP values will be used to calculate
+               q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with:
+               `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`.
+               `key` can still be provided. Without `key`, PSMs will be sorted by PEP.
+
+        full_output : bool, keyword only, optional
+            If :py:const:`True`, then an array of PSM objects is returned.
+            Otherwise, an iterator / context manager object is returned, and the
+            files are parsed twice. This saves some RAM, but is ~2x slower.
+            Default is :py:const:`True`.
+
+            .. note:: The name for the parameter comes from the fact that it is
+                      internally passed to :py:func:`qvalues`.
+
+        q_label : str, optional
+            Field name for q-value in the output. Default is ``'q'``.
+
+        score_label : str, optional
+            Field name for score in the output. Default is ``'score'``.
+
+        decoy_label : str, optional
+            Field name for the decoy flag in the output. Default is ``'is decoy'``.
+
+        pep_label : str, optional
+            Field name for PEP in the output. Default is ``'PEP'``.
+
+        **kwargs : passed to the :py:func:`chain` function.
+
+        Returns
+        -------
+        out : iterator or :py:class:`numpy.ndarray` or :py:class:`pandas.DataFrame`
+        """
+        if kwargs.pop('full_output', True):
+            return filter(*args, full_output=True, **kwargs)
+        return IteratorContextManager(*args, parser_func=filter, **kwargs)
+
+    _fix_docstring(_filter, is_decoy=is_decoy_prefix, key=key)
+    if read is _iter:
+        _filter.__doc__ = _filter.__doc__.replace("""positional args : file or str
+            Files to read PSMs from. All positional arguments are treated as
+            files.""", """positional args : iterables
+            Iterables to read PSMs from. All positional arguments are chained.""").replace(
+                """\n            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        decoy_prefix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name prefix to use to detect decoy matches. If you provide your own
+            `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+            Default is `"DECOY_"`.
+
+        decoy_suffix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name suffix to use to detect decoy matches. If you provide your own
+            `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "")
+    return _filter
+
+
+@contextmanager
+def _itercontext(x, **kw):
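+    # The outermost iterable of a generator expression is evaluated eagerly,
+    # so for inputs without `iterrows` (anything but a DataFrame) the
+    # AttributeError is raised inside the `try`, and `x` is yielded as is.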
+    try:
+        yield (row for i, row in x.iterrows())
+    except AttributeError:
+        yield x
+
+
+# _iter = _make_chain(_itercontext, 'iter')
+_iter = ChainBase._make_chain(_itercontext)
+qvalues = _make_qvalues(_iter, None, None, None)
+
+filter = _make_filter(_iter, None, None, None, qvalues)
+filter.chain = _make_chain(filter, 'filter', True)
+# filter.chain = TableJoiner._make_chain(filter)
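+
+# A minimal usage sketch with in-memory PSMs (the 'score' and 'decoy' field
+# names are illustrative, not fixed by the API):
+#
+#     import operator as op
+#     psms = [{'score': 0.1, 'decoy': False}, {'score': 0.9, 'decoy': True}]
+#     out = qvalues(psms, key=op.itemgetter('score'),
+#                   is_decoy=op.itemgetter('decoy'))
+#     # `out` is a record array with 'score', 'is decoy' and 'q' fields.
+#     passed = filter(psms, fdr=0.5, key=op.itemgetter('score'),
+#                     is_decoy=op.itemgetter('decoy'))
+#     # `passed` is an array of the PSM objects within the 0.5 FDR threshold.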
+
+
+try:
+    import numpy as np
+    _precalc_fact = np.log([math.factorial(n) for n in range(20)])
+
+    def log_factorial(x):
+        x = np.array(x)
+        pf = _precalc_fact
+        m = (x >= pf.size)
+        out = np.empty(x.shape)
+        out[~m] = pf[x[~m].astype(int)]
+        x = x[m]
+        out[m] = x * np.log(x) - x + 0.5 * np.log(2 * np.pi * x)
+        return out
+
+    def _expectation(d, T, p=0.5):
+        if T is None:
+            return d + 1
+        T = np.array(T, dtype=int)
+        m = np.arange(T.max() + 1, dtype=int)
+        pi = np.exp(_log_pi(d, m, p))
+        return ((m * pi).cumsum() / pi.cumsum())[T]
+
+    def _confidence_value(conf, d, T, p=0.5):
+        if T is not None:
+            T = np.array(T, dtype=int)
+            m = np.arange(T.max() + 1, dtype=int)
+        else:
+            m = np.arange(max(50 * d, 10000))
+        log_pi = _log_pi(d, m, p)
+        pics = np.exp(log_pi).cumsum()
+        return np.searchsorted(pics, conf * (pics[T] if T is not None else 1))
+
+except ImportError:
+    def log_factorial(n):
+        if n > 10:
+            return n * math.log(n) - n + 0.5 * math.log(2 * math.pi * n)
+        else:
+            return math.log(math.factorial(n))
+
+    def _expectation(*a, **k):
+        raise NotImplementedError('NumPy required')
+
+    def _confidence_value(*a, **k):
+        raise NotImplementedError('NumPy required')
+
+
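+
+# `_log_pi` below is the log-PMF of a negative binomial distribution:
+# pi(k) = C(d + k, k) * p**k * (1 - p)**(d + 1), i.e. the probability of
+# observing k false target PSMs given d decoy PSMs. `_expectation` and
+# `_confidence_value` above aggregate this PMF into its mean and the
+# requested quantile, respectively.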
+def _log_pi_r(d, k, p=0.5):
+    return k * math.log(p) + log_factorial(k + d) - log_factorial(k) - log_factorial(d)
+
+
+def _log_pi(d, k, p=0.5):
+    return _log_pi_r(d, k, p) + (d + 1) * math.log(1 - p)
+
+
+def _count_psms(psms, is_decoy, pep, decoy_prefix, decoy_suffix, is_decoy_prefix, is_decoy_suffix):
+    total, decoy = 0, 0
+    if pep is not None:
+        is_decoy = pep
+    elif is_decoy is None:
+        if decoy_suffix is not None:
+            is_decoy = lambda x: is_decoy_suffix(x, decoy_suffix)
+        else:
+            is_decoy = lambda x: is_decoy_prefix(x, decoy_prefix)
+    if isinstance(is_decoy, basestring):
+        decoy = psms[is_decoy].sum()
+        total = psms.shape[0]
+    elif callable(is_decoy):
+        for psm in psms:
+            total += 1
+            d = is_decoy(psm)
+            decoy += d if pep is not None else bool(d)
+    else:
+        if not isinstance(is_decoy, (Sized, Container)):
+            is_decoy = list(is_decoy)
+        if pep is not None:
+            decoy = sum(is_decoy)
+        else:
+            decoy = sum(map(bool, is_decoy))
+        total = len(is_decoy)
+    return decoy, total
+
+
+def _make_fdr(is_decoy_prefix, is_decoy_suffix):
+    def fdr(psms=None, formula=1, is_decoy=None, ratio=1, correction=0, pep=None, decoy_prefix='DECOY_', decoy_suffix=None):
+        """Estimate FDR of a data set using TDA or given PEP values.
+        Two formulas can be used. The first one (default) is:
+
+        .. math::
+
+                FDR = \\frac{N_{decoy}}{N_{target} * ratio}
+
+        The second formula is:
+
+        .. math::
+
+                FDR = \\frac{N_{decoy} * (1 + \\frac{1}{ratio})}{N_{total}}
+
+        .. note::
+            This function is less versatile than :py:func:`qvalues`. To obtain FDR,
+            you can call :py:func:`qvalues` and take the last q-value. This function
+            can be used (with `correction = 0` or `1`) when :py:mod:`numpy` is not available.
+
+        Parameters
+        ----------
+        psms : iterable, optional
+            An iterable of PSMs, e.g. as returned by :py:func:`read`.
+            Not needed if `is_decoy` is an iterable.
+
+        formula : int, optional
+            Can be either 1 or 2, defines which formula should be used for FDR
+            estimation. Default is 1.
+
+        is_decoy : callable, iterable, or str
+            If callable, should accept exactly one argument (PSM) and return a truthy value
+            if the PSM is considered decoy. Default is :py:func:`is_decoy`.
+            If array-like, should contain float values for all given PSMs.
+            If string, it is used as a field name (PSMs must be in a record array
+            or a :py:class:`pandas.DataFrame`).
+
+            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        decoy_prefix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name prefix to use to detect decoy matches. If you provide your own
+            `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+            Default is `"DECOY_"`.
+
+        decoy_suffix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name suffix to use to detect decoy matches. If you provide your own
+            `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.
+
+        pep : callable, iterable, or str, optional
+            If callable, a function used to determine the posterior error probability (PEP).
+            Should accept exactly one argument (PSM) and return a float.
+            If array-like, should contain float values for all given PSMs.
+            If string, it is used as a field name (PSMs must be in a record array
+            or a :py:class:`pandas.DataFrame`).
+
+            .. note:: If this parameter is given, then PEP values will be used to calculate FDR.
+               Otherwise, decoy PSMs will be used instead. This option conflicts with:
+               `is_decoy`, `formula`, `ratio`, `correction`.
+
+        ratio : float, optional
+            The size ratio between the decoy and target databases. Default is 1.
+            In theory, the "size" of the database is the number of
+            theoretical peptides eligible for assignment to spectra that are
+            produced by *in silico* cleavage of that database.
+
+        correction : int or float, optional
+            Possible values are 0, 1 and 2, or floating point numbers between 0 and 1.
+
+            0 (default): no correction;
+
+            1: enable "+1" correction. This accounts for the probability that a false
+            positive scores better than the first excluded decoy PSM;
+
+            2: this also corrects that probability for finite size of the sample,
+            so the correction will be slightly less than "+1".
+
+            If a floating point number
+            is given, then instead of the expectation value for the number of false PSMs,
+            the confidence value is used. The value of `correction` is then interpreted as
+            desired confidence level. E.g., if correction=0.95, then the calculated q-values
+            do not exceed the "real" q-values with 95% probability.
+
+            See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation.
+
+            .. note::
+                Requires :py:mod:`numpy`, if `correction` is a float or 2.
+
+            .. note::
+                Correction is only needed if the PSM set at hand was obtained using TDA
+                filtering based on decoy counting (as done by using :py:func:`!filter` without
+                `correction`).
+
+        Returns
+        -------
+        out : float
+            The estimated FDR, (roughly) between 0 and 1.
+        """
+        if formula not in {1, 2}:
+            raise PyteomicsError('`formula` must be either 1 or 2.')
+        if ratio == 0:
+            raise PyteomicsError('Size ratio cannot be zero!')
+
+        decoy, total = _count_psms(psms, is_decoy, pep, decoy_prefix, decoy_suffix, is_decoy_prefix, is_decoy_suffix)
+        if pep is not None:
+            return float(decoy) / total
+        tfalse = decoy
+        if correction == 1 or (correction == 2 and (not decoy or total / decoy > 10)):
+            tfalse += 1
+        elif correction == 2:
+            p = 1. / (1. + ratio)
+            tfalse = _expectation(decoy, total - decoy, p)
+        elif 0 < correction < 1:
+            p = 1. / (1. + ratio)
+            tfalse = _confidence_value(correction, decoy, total - decoy, p)
+        if formula == 1:
+            if total == decoy:
+                raise PyteomicsError('Cannot compute FDR using formula 1: no target IDs found.')
+            return float(tfalse) / (total - decoy) / ratio
+        return (decoy + tfalse / ratio) / total
+
+    _fix_docstring(fdr, is_decoy=is_decoy_prefix)
+    if is_decoy_prefix is None:
+        fdr.__doc__ = fdr.__doc__.replace(
+            """\n            .. warning::
+                The default function may not work
+                with your files, because format flavours are diverse.
+
+        decoy_prefix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name prefix to use to detect decoy matches. If you provide your own
+            `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+            Default is `"DECOY_"`.
+
+        decoy_suffix : str, optional
+            If the default `is_decoy` function works for you, this parameter specifies which
+            protein name suffix to use to detect decoy matches. If you provide your own
+            `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "")
+    return fdr
+
+
+fdr = _make_fdr(None, None)
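+
+# Quick sanity check with an explicit decoy flag list (illustrative):
+#     >>> fdr(is_decoy=[False, False, False, True])  # 1 decoy, 3 targets
+#     0.3333333333333333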
+
+def _sigma_T(decoy, ratio):
+    return math.sqrt((decoy + 1) * (ratio + 1) / (ratio * ratio))
+
+def sigma_T(psms, is_decoy, ratio=1):
+    """Calculates the standard error for the number of false positive target PSMs.
+
+    The formula is:
+
+    .. math::
+
+        \\sigma(T) = \\sqrt{\\frac{(d + 1) \\cdot {p}}{(1 - p)^{2}}} = \\sqrt{\\frac{d+1}{r^{2}} \\cdot (r+1)}
+
+    This estimation is accurate for low FDRs.
+    See the `article <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for more details.
+    """
+    decoy, total = _count_psms(psms, is_decoy, None, None, None, None, None)
+    return _sigma_T(decoy, ratio)
+
+def sigma_fdr(psms=None, formula=1, is_decoy=None, ratio=1):
+    """Calculates the standard error of FDR using the formula for negative binomial distribution.
+    See :py:func:`sigma_T` for math. This estimation is accurate for low FDRs.
+    See also the `article <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for more details.
+    """
+
+    if formula not in {1, 2}:
+        raise PyteomicsError('`formula` must be either 1 or 2.')
+    decoy, total = _count_psms(psms, is_decoy, None, None, None, None, None)
+    sigmaT = _sigma_T(decoy, ratio)
+    if formula == 1:
+        return sigmaT / (total - decoy) / ratio
+    return sigmaT / total / ratio
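+
+# Illustration: with 1 decoy PSM and ratio = 1, the standard error of the
+# number of false positive targets is sqrt((1 + 1) * (1 + 1) / 1) = 2.0.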
diff --git a/pyteomics/auxiliary/utils.py b/pyteomics/auxiliary/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..728b38507f3845b2923833fe4f50c4e9e5b4de1b
--- /dev/null
+++ b/pyteomics/auxiliary/utils.py
@@ -0,0 +1,317 @@
+from __future__ import print_function
+
+import base64
+import zlib
+from functools import wraps
+from collections import namedtuple
+
+
+try:
+    basestring
+except NameError:
+    basestring = (str, bytes)
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+try:
+    import pynumpress
+except ImportError:
+    pynumpress = None
+
+from .structures import PyteomicsError
+
+def print_tree(d, indent_str=' -> ', indent_count=1):
+    """Read a nested dict (with strings as keys) and print its structure.
+    """
+    def structure(d):
+        out = {}
+        for k, v in d.items():
+            if isinstance(v, dict):
+                out[k] = structure(v)
+            elif isinstance(v, list) and v and isinstance(v[0], dict):
+                out['{} [list]'.format(k)] = structure(v[0])
+            else:
+                out[k] = None
+        return out
+
+    def _print(d, level=0):
+        for k, v in d.items():
+            print('{}{}'.format(indent_str * indent_count * level, k))
+            if v is not None:
+                _print(v, level + 1)
+    _print(structure(d))
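+
+# Example (dict insertion order determines the printed order):
+#     >>> print_tree({'a': {'b': 1, 'c': [{'d': 2}]}})
+#     a
+#      -> b
+#      -> c [list]
+#      ->  -> d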
+
+
+def memoize(maxsize=1000):
+    """Make a memoization decorator. A negative value of `maxsize` means
+    no size limit."""
+    def deco(f):
+        """Memoization decorator. Items of `kwargs` must be hashable."""
+        memo = {}
+
+        @wraps(f)
+        def func(*args, **kwargs):
+            key = (args, frozenset(kwargs.items()))
+            if key not in memo:
+                if len(memo) == maxsize:
+                    memo.popitem()
+                memo[key] = f(*args, **kwargs)
+            return memo[key]
+        return func
+    return deco
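+
+# Usage sketch (the decorated function is hypothetical):
+#     @memoize(100)
+#     def expensive(x):
+#         return x ** 2
+# Note that once the cache is full, `dict.popitem` evicts the most recently
+# inserted entry (on Python 3.7+), not the least recently used one.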
+
+
+def _decode_base64_data_array(source, dtype, is_compressed):
+    """Read a base64-encoded binary array.
+
+    Parameters
+    ----------
+    source : str
+        A binary array encoded with base64.
+    dtype : dtype
+        The type of the array in numpy dtype notation.
+    is_compressed : bool
+        If True then the array will be decompressed with zlib.
+
+    Returns
+    -------
+    out : numpy.array
+    """
+
+    decoded_source = base64.b64decode(source.encode('ascii'))
+    if is_compressed:
+        decoded_source = zlib.decompress(decoded_source)
+    output = np.frombuffer(bytearray(decoded_source), dtype=dtype)
+    return output
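+
+# Illustrative round trip (assumes numpy is installed):
+#     >>> raw = np.arange(3, dtype=np.float32).tobytes()
+#     >>> encoded = base64.b64encode(zlib.compress(raw)).decode('ascii')
+#     >>> _decode_base64_data_array(encoded, np.float32, True)
+#     array([0., 1., 2.], dtype=float32)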
+
+
+_default_compression_map = {
+        'no compression': lambda x: x,
+        'zlib compression': zlib.decompress,
+}
+
+
+def _pynumpressDecompress(decoder):
+    def decode(data):
+        return decoder(np.frombuffer(data, dtype=np.uint8))
+    return decode
+
+
+def _zlibNumpress(decoder):
+    def decode(data):
+        return decoder(np.frombuffer(zlib.decompress(data), dtype=np.uint8))
+    return decode
+
+
+if pynumpress:
+    _default_compression_map.update(
+        {
+            'MS-Numpress short logged float compression': _pynumpressDecompress(pynumpress.decode_slof),
+            'MS-Numpress positive integer compression':   _pynumpressDecompress(pynumpress.decode_pic),
+            'MS-Numpress linear prediction compression':  _pynumpressDecompress(pynumpress.decode_linear),
+            'MS-Numpress short logged float compression followed by zlib compression': _zlibNumpress(pynumpress.decode_slof),
+            'MS-Numpress positive integer compression followed by zlib compression':   _zlibNumpress(pynumpress.decode_pic),
+            'MS-Numpress linear prediction compression followed by zlib compression':  _zlibNumpress(pynumpress.decode_linear),
+        })
+
+
+class ArrayConversionMixin(object):
+    _dtype_dict = {}
+    _array_keys = ['m/z array', 'intensity array']
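+    # The `dtype` keyword accepted by __init__ may be a single numpy dtype,
+    # applied to every array in `_array_keys`, or a mapping from array name
+    # to dtype; the entry under the `None` key keeps the blanket default.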
+
+    def __init__(self, *args, **kwargs):
+        self._dtype_dict = {None: None}
+        dtype = kwargs.pop('dtype', None)
+        if isinstance(dtype, dict):
+            self._dtype_dict.update(dtype)
+        elif dtype:
+            self._dtype_dict = {k: dtype for k in self._array_keys}
+            self._dtype_dict[None] = dtype
+        self._convert_arrays = kwargs.pop('convert_arrays', 1)
+        if self._convert_arrays and np is None:
+            raise PyteomicsError('numpy is required for array conversion')
+        super(ArrayConversionMixin, self).__init__(*args, **kwargs)
+
+    def __getstate__(self):
+        state = super(ArrayConversionMixin, self).__getstate__()
+        state['_dtype_dict'] = self._dtype_dict
+        state['_convert_arrays'] = self._convert_arrays
+        state['_array_keys'] = self._array_keys
+        return state
+
+    def __setstate__(self, state):
+        super(ArrayConversionMixin, self).__setstate__(state)
+        self._dtype_dict = state['_dtype_dict']
+        self._convert_arrays = state['_convert_arrays']
+        self._array_keys = state['_array_keys']
+
+    def _build_array(self, k, data):
+        dtype = self._dtype_dict.get(k)
+        return np.array(data, dtype=dtype)
+
+    def _convert_array(self, k, array):
+        dtype = self._dtype_dict.get(k)
+        if dtype is not None:
+            return array.astype(dtype)
+        return array
+
+    def _build_all_arrays(self, info):
+        if self._convert_arrays:
+            for k in self._array_keys:
+                if k in info:
+                    info[k] = self._build_array(k, info[k])
+
+
+class MaskedArrayConversionMixin(ArrayConversionMixin):
+    _masked_array_keys = ['charge array']
+    _mask_value = 0
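+    # Values equal to `_mask_value` (0, i.e. "charge not assigned") are
+    # hidden behind numpy masked arrays so that they are excluded from
+    # downstream arithmetic.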
+
+    def __init__(self, *args, **kwargs):
+        self._convert_arrays = kwargs.pop('convert_arrays', 2)
+        kwargs['convert_arrays'] = self._convert_arrays
+        super(MaskedArrayConversionMixin, self).__init__(*args, **kwargs)
+
+    def __getstate__(self):
+        state = super(MaskedArrayConversionMixin, self).__getstate__()
+        state['_masked_array_keys'] = self._masked_array_keys
+        state['_mask_value'] = self._mask_value
+        return state
+
+    def __setstate__(self, state):
+        super(MaskedArrayConversionMixin, self).__setstate__(state)
+        self._masked_array_keys = state['_masked_array_keys']
+        self._mask_value = state['_mask_value']
+
+    def _build_masked_array(self, k, data):
+        array = self._build_array(k, data)
+        return self._convert_masked_array(k, array)
+
+    def _convert_masked_array(self, k, array):
+        return np.ma.masked_equal(array, self._mask_value)
+
+    def _ensure_masked_array(self, k, data):
+        if isinstance(data, np.ndarray):
+            return self._convert_masked_array(k, data)
+        return self._build_masked_array(k, data)
+
+    def _build_all_arrays(self, info):
+        super(MaskedArrayConversionMixin, self)._build_all_arrays(info)
+        if self._convert_arrays == 2:
+            for k in self._masked_array_keys:
+                if k in info:
+                    info[k] = self._ensure_masked_array(k, info[k])
+
+
+if np is not None:
+    class BinaryDataArrayTransformer(object):
+        """A base class that provides methods for reading
+        base64-encoded binary arrays.
+
+        Attributes
+        ----------
+        compression_type_map : dict
+            Maps compressor type name to decompression function
+        """
+
+        compression_type_map = _default_compression_map
+
+        class binary_array_record(namedtuple(
+                "binary_array_record", ("data", "compression", "dtype", "source", "key"))):
+            """Hold all of the information about a base64 encoded array needed to
+            decode the array.
+            """
+
+            def decode(self):
+                """Decode :attr:`data` into a numerical array
+
+                Returns
+                -------
+                np.ndarray
+                """
+                return self.source._decode_record(self)
+
+        def _make_record(self, data, compression, dtype, key=None):
+            return self.binary_array_record(data, compression, dtype, self, key)
+
+        def _decode_record(self, record):
+            array = self.decode_data_array(
+                record.data, record.compression, record.dtype)
+            return self._finalize_record_conversion(array, record)
+
+        def _finalize_record_conversion(self, array, record):
+            return array
+
+        def _base64_decode(self, source):
+            decoded_source = base64.b64decode(source.encode('ascii'))
+            return decoded_source
+
+        def _decompress(self, source, compression_type=None):
+            if compression_type is None:
+                return source
+            decompressor = self.compression_type_map.get(compression_type)
+            decompressed_source = decompressor(source)
+            return decompressed_source
+
+        def _transform_buffer(self, binary, dtype):
+            if isinstance(binary, np.ndarray):
+                return binary.astype(dtype, copy=False)
+            return np.frombuffer(binary, dtype=dtype)
+
+        def decode_data_array(self, source, compression_type=None, dtype=np.float64):
+            """Decode a base64-encoded, compressed bytestring into a numerical
+            array.
+
+            Parameters
+            ----------
+            source : str
+                A base64-encoded string representing a potentially compressed
+                numerical array.
+            compression_type : str, optional
+                The name of the compression method used before encoding the
+                array into base64.
+            dtype : type, optional
+                The data type to use to decode the binary array from the
+                decompressed bytes.
+
+            Returns
+            -------
+            np.ndarray
+            """
+            binary = self._base64_decode(source)
+            binary = self._decompress(binary, compression_type)
+            if isinstance(binary, bytes):
+                binary = bytearray(binary)
+            array = self._transform_buffer(binary, dtype)
+            return array
+
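+    # Minimal round trip (illustrative):
+    #     raw = np.array([1.0, 2.0]).tobytes()
+    #     s = base64.b64encode(zlib.compress(raw)).decode('ascii')
+    #     BinaryDataArrayTransformer().decode_data_array(s, 'zlib compression')
+    #     # -> array([1., 2.])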
+
+    class BinaryArrayConversionMixin(ArrayConversionMixin, BinaryDataArrayTransformer):
+        def _finalize_record_conversion(self, array, record):
+            key = record.key
+            return self._convert_array(key, array)
+
+
+else:
+    BinaryDataArrayTransformer = None
+    BinaryArrayConversionMixin = None
+
+
+def add_metaclass(metaclass):
+    """Class decorator for creating a class with a metaclass."""
+    def wrapper(cls):
+        orig_vars = cls.__dict__.copy()
+        slots = orig_vars.get('__slots__')
+        if slots is not None:
+            if isinstance(slots, str):
+                slots = [slots]
+            for slots_var in slots:
+                orig_vars.pop(slots_var)
+        orig_vars.pop('__dict__', None)
+        orig_vars.pop('__weakref__', None)
+        if hasattr(cls, '__qualname__'):
+            orig_vars['__qualname__'] = cls.__qualname__
+        return metaclass(cls.__name__, cls.__bases__, orig_vars)
+    return wrapper
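+
+# Usage sketch, mirroring six.add_metaclass (`Meta` is a hypothetical
+# metaclass):
+#     @add_metaclass(Meta)
+#     class Spam(object):
+#         pass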
diff --git a/pyteomics/electrochem.py b/pyteomics/electrochem.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fa2937d0cc3f7b96639a749054368d4e969295b
--- /dev/null
+++ b/pyteomics/electrochem.py
@@ -0,0 +1,499 @@
+"""
+electrochem - electrochemical properties of polypeptides
+========================================================
+
+Summary
+-------
+
+This module is used to calculate the
+electrochemical properties of polypeptide molecules.
+
+The theory behind most of this module is based on the Henderson-Hasselbalch
+equation and was thoroughly described in a number of sources [#Aronson]_,
+[#Moore]_.
+
+Briefly, the formula for the charge of a polypeptide in given pH is the following:
+
+.. math::
+
+   Q_{peptide} = \\sum{\\frac{Q_i}{1+10^{Q_i(pH-pK_i)}}},
+
+where the sum is taken over all ionizable groups of the polypeptide, and
+:math:`Q_i` is -1 and +1 for acidic and basic functional groups,
+respectively.
+
+Charge and pI functions
+-----------------------
+
+  :py:func:`charge` - calculate the charge of a polypeptide
+
+  :py:func:`pI` - calculate the isoelectric point of a polypeptide
+
+
+GRand AVerage of hYdropathicity (GRAVY)
+---------------------------------------
+
+  :py:func:`gravy` - calculate the GRAVY index of a polypeptide
+
+
+Data
+----
+
+  :py:data:`pK_lehninger` - a set of pK from [#Lehninger]_.
+
+  :py:data:`pK_sillero` - a set of pK from [#Sillero]_.
+
+  :py:data:`pK_dawson` - a set of pK from [#Dawson]_, the pK values for NH2-
+  and -OH are taken from [#Sillero]_.
+
+  :py:data:`pK_rodwell` - a set of pK from [#Rodwell]_.
+
+  :py:data:`pK_bjellqvist` - a set of pK from [#Bjellqvist]_.
+
+  :py:data:`pK_nterm_bjellqvist` - a set of N-terminal pK from [#Bjellqvist]_.
+
+  :py:data:`pK_cterm_bjellqvist` - a set of C-terminal pK from [#Bjellqvist]_.
+
+  :py:data:`hydropathicity_KD` - a set of hydropathicity indexes from [#Kyte]_.
+
+
+References
+----------
+
+.. [#Aronson] Aronson, J. N. The Henderson-Hasselbalch equation
+   revisited.  Biochemical Education, 1983, 11 (2), 68.
+   `Link. <http://dx.doi.org/10.1016/0307-4412(83)90046-8>`_
+
+.. [#Moore] Moore, D. S. Amino acid and peptide net charges: A
+   simple calculational procedure. Biochemical Education, 1986, 13 (1), 10-12.
+   `Link. <http://dx.doi.org/10.1016/0307-4412(85)90114-1>`_
+
+.. [#Lehninger] Nelson, D. L.; Cox, M. M. Lehninger Principles of
+   Biochemistry, Fourth Edition; W. H. Freeman, 2004; p. 1100.
+
+.. [#Sillero] Sillero, A.; Ribeiro, J. Isoelectric points of proteins:
+   Theoretical determination. Analytical Biochemistry, 1989, 179 (2), 319-325.
+   `Link. <http://dx.doi.org/10.1016/0003-2697(89)90136-X>`_
+
+.. [#Dawson] Dawson, R. M. C.; Elliot, D. C.; Elliot, W. H.; Jones, K. M.
+   Data for biochemical research. Oxford University Press, 1989; p. 592.
+
+.. [#Rodwell] Rodwell, J. Heterogeneity of component bands in isoelectric
+   focusing patterns. Analytical Biochemistry, 1982, 119 (2), 440-449.
+   `Link. <http://dx.doi.org/10.1016/0003-2697(82)90611-X>`_
+
+.. [#Bjellqvist] Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+    Reference points for comparisons of two-dimensional maps of proteins from
+    different human cell types defined in a pH scale where isoelectric points
+    correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+    `Link. <http://dx.doi.org/10.1002/elps.1150150171>`_
+
+.. [#Kyte] Kyte, J.; Doolittle, R. F.
+   A simple method for displaying the hydropathic character of a protein.
+   Journal of molecular biology 1982, 157 (1), 105-32.
+   `Link. <https://doi.org/10.1016/0022-2836(82)90515-0>`_
+
+-------------------------------------------------------------------------------
+
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from __future__ import division
+from . import parser
+from .auxiliary import PyteomicsError
+from collections import Counter
+try:
+    from collections.abc import Iterable
+except ImportError:
+    from collections import Iterable
+
+
+def charge(sequence, pH, **kwargs):
+    """Calculate the charge of a polypeptide in given pH or list of pHs using
+    a given list of amino acid electrochemical properties.
+
+    .. warning::
+
+        Be careful when supplying a list with a parsed sequence or a dict with
+        amino acid composition as `sequence`. Such values must be obtained
+        with the `show_unmodified_termini` option enabled.
+
+    .. warning::
+
+        If you provide `pK_nterm` or `pK_cterm` and provide `sequence` as a dict,
+        it is assumed that it was obtained with ``term_aa=True`` (see
+        :py:func:`pyteomics.parser.amino_acid_composition` for details).
+
+    Parameters
+    ----------
+    sequence : str or list or dict
+        A string with a polypeptide sequence, a list with a parsed
+        sequence or a dict of amino acid composition.
+    pH : float or iterable of floats
+        pH or iterable of pHs for which the charge is calculated.
+    pK : dict {str: [(float, int), ...]}, optional
+        A set of pK of amino acids' ionizable groups. It is a dict, where keys
+        are amino acid labels and the values are lists of tuples (pK,
+        charge_in_ionized_state), a tuple per ionizable group. The default
+        value is `pK_lehninger`.
+
+    pK_nterm : dict {str: [(float, int),]}, optional
+    pK_cterm : dict {str: [(float, int),]}, optional
+        Sets of pK of N-terminal and C-terminal (respectively) amino acids'
+        ionizable groups. Dicts with the same structure as ``pK``. These
+        values (if present) are used for N-terminal and C-terminal residues,
+        respectively. If given, `sequence` must be a :py:class:`str` or a
+        :py:class:`list`. The default value is an empty dict.
+
+    Returns
+    -------
+    out : float or list of floats
+        A single value of charge or a list of charges.
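+
+    Examples
+    --------
+    An illustrative check with the default `pK_lehninger` set (the result is
+    rounded, as exact values depend on the pK data):
+
+    >>> round(charge('PEPTIDE', 7.0), 2)
+    -3.0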
+    """
+
+    peptide_dict, pK = _prepare_charge_dict(sequence, **kwargs)
+
+    # Process the case when pH is a single float.
+    pH_list = pH if isinstance(pH, Iterable) else [pH,]
+
+    charge_list = _charge_for_dict(peptide_dict, pH_list, pK)
+    return charge_list[0] if not isinstance(pH, Iterable) else charge_list
+
+
+def _prepare_charge_dict(sequence, **kwargs):
+    nterm = cterm = n_aa = c_aa = None
+    pK = kwargs.get('pK', pK_lehninger).copy()
+    pK_nterm = kwargs.get('pK_nterm', {})
+    pK_cterm = kwargs.get('pK_cterm', {})
+
+    if isinstance(sequence, dict):
+        peptide_dict = sequence.copy()
+        for k, v in sequence.items():
+            if k[-1] == '-':
+                if v > 1 or nterm:
+                    raise PyteomicsError(
+                            'More than one N-terminal group in {}'.format(
+                                sequence))
+                nterm = k
+            if k[0] == '-':
+                if v > 1 or cterm:
+                    raise PyteomicsError(
+                            'More than one C-terminal group in {}'.format(
+                                sequence))
+                cterm = k
+            if k[:5] == 'nterm':
+                if v > 1 or n_aa:
+                    raise PyteomicsError(
+                            'More than one N-terminal residue in {}'.format(
+                                sequence))
+                n_aa = k[5:]
+                peptide_dict[n_aa] = peptide_dict.get(n_aa, 0) + 1
+            if k[:5] == 'cterm':
+                if v > 1 or c_aa:
+                    raise PyteomicsError(
+                            'More than one C-terminal residue in {}'.format(
+                                sequence))
+                c_aa = k[5:]
+                peptide_dict[c_aa] = peptide_dict.get(c_aa, 0) + 1
+
+        if nterm is None or cterm is None:
+            raise PyteomicsError('Peptide must have two explicit terminal groups')
+        if (n_aa is None or c_aa is None) and (pK_nterm or pK_cterm):
+            raise PyteomicsError('Two terminal residues must be present in '
+                    'peptide (designated as "ntermX" and "ctermX", where "X" is '
+                    'the one-letter residue label). Use '
+                    '``term_aa=True`` when calling '
+                    '`parser.amino_acid_composition`.')
+
+    elif isinstance(sequence, (str, list)):
+        if isinstance(sequence, str):
+            if sequence.isupper() and sequence.isalpha():
+                parsed_sequence = [parser.std_nterm] + list(sequence) + [parser.std_cterm]
+            else:
+                parsed_sequence = parser.parse(sequence, show_unmodified_termini=True)
+        elif isinstance(sequence, list):
+            if sequence[0][-1] != '-' or sequence[-1][0] != '-':
+                raise PyteomicsError('Parsed sequences must contain terminal '
+                                     'groups at 0-th and last positions.')
+            parsed_sequence = sequence
+
+        n_aa = parsed_sequence[1]
+        c_aa = parsed_sequence[-2]
+        nterm = parsed_sequence[0]
+        cterm = parsed_sequence[-1]
+        peptide_dict = Counter(parsed_sequence)
+
+    else:
+        raise PyteomicsError('Unsupported type of sequence: %s' % type(sequence))
+
+    if nterm in pK_nterm:
+        if n_aa in pK_nterm[nterm]:
+            pK[nterm] = pK_nterm[nterm][n_aa]
+    if cterm in pK_cterm:
+        if c_aa in pK_cterm[cterm]:
+            pK[cterm] = pK_cterm[cterm][c_aa]
+
+    return peptide_dict, pK
+
+
+def _charge_for_dict(peptide_dict, pH_list, pK):
+    # Calculate the charge for each value of pH.
+    charge_list = []
+    for pH_value in pH_list:
+        charge = 0
+        for aa in peptide_dict:
+            for ionizable_group in pK.get(aa, []):
+                charge += peptide_dict[aa] * ionizable_group[1] * (
+                    1. / (1. + 10 ** (ionizable_group[1] * (pH_value - ionizable_group[0]))))
+        charge_list.append(charge)
+
+    return charge_list
+
+
+def pI(sequence, pI_range=(0.0, 14.0), precision_pI=0.01, **kwargs):
+    """Calculate the isoelectric point of a polypeptide using a given set
+    of amino acids' electrochemical properties.
+
+    .. warning::
+
+        Be careful when supplying a list with a parsed sequence or a dict with
+        amino acid composition as `sequence`. Such values must be obtained
+        with the `show_unmodified_termini` option enabled.
+
+    Parameters
+    ----------
+    sequence : str or list or dict
+        A string with a polypeptide sequence, a list with a parsed
+        sequence or a dict of amino acid composition.
+    pI_range : tuple (float, float)
+        The range of allowable pI values. Default is (0.0, 14.0).
+    precision_pI : float
+        The precision of the calculated pI. Default is 0.01.
+    pK : dict {str: [(float, int), ...]}, optional
+        A set of pK of amino acids' ionizable groups. It is a dict, where keys
+        are amino acid labels and the values are lists of tuples (pK,
+        charge_in_ionized_state), a tuple per ionizable group. The default
+        value is `pK_lehninger`.
+    pK_nterm : dict {str: [(float, int),]}, optional
+    pK_cterm : dict {str: [(float, int),]}, optional
+        Sets of pK of N-terminal and C-terminal (respectively) amino acids'
+        ionizable groups. Dicts with the same structure as ``pK``. These
+        values (if present) are used for N-terminal and C-terminal residues,
+        respectively. If given, `sequence` must be a :py:class:`str` or a
+        :py:class:`list`. The default value is an empty dict.
+
+    Returns
+    -------
+    out : float
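+
+    Examples
+    --------
+    An illustrative check (rounded, since the result is only accurate to
+    within `precision_pI`):
+
+    >>> round(pI('PEPTIDE'), 1)
+    2.9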
+    """
+
+    pK = kwargs.get('pK', pK_lehninger.copy())
+    pK_nterm = {}
+    pK_cterm = {}
+    if isinstance(sequence, str) or isinstance(sequence, list):
+        pK_nterm = kwargs.get('pK_nterm', {})
+        pK_cterm = kwargs.get('pK_cterm', {})
+    elif isinstance(sequence, dict) and (('pK_nterm' in kwargs) or ('pK_cterm' in kwargs)):
+        raise PyteomicsError('Cannot use terminal features for %s' % type(sequence))
+
+    peptide_dict, pK = _prepare_charge_dict(sequence, pK=pK, pK_cterm=pK_cterm, pK_nterm=pK_nterm)
+    # The algorithm is based on the fact that charge(pH) is a monotonic function.
+    left_x, right_x = pI_range
+    left_y = _charge_for_dict(peptide_dict, [left_x], pK)[0]
+    right_y = _charge_for_dict(peptide_dict, [right_x], pK)[0]
+    while (right_x - left_x) > precision_pI:
+        if left_y * right_y > 0:
+            return left_x if abs(left_y) < abs(right_y) else right_x
+        middle_x = (left_x + right_x) / 2.0
+        middle_y = _charge_for_dict(peptide_dict, [middle_x], pK)[0]
+        if middle_y * left_y < 0:
+            right_x = middle_x
+            right_y = middle_y
+        else:
+            left_x = middle_x
+            left_y = middle_y
+    return (left_x + right_x) / 2.0
+
+
+pK_lehninger = {
+    'E':   [(4.25,  -1)],
+    'R':   [(12.48,  1)],
+    'Y':   [(10.07, -1)],
+    'D':   [(3.65,  -1)],
+    'H':   [(6.00,  +1)],
+    'K':   [(10.53, +1)],
+    'C':   [(8.18,  -1)],
+    'H-':  [(9.69,  +1)],
+    '-OH': [(2.34,  -1)],
+    }
+"""A set of pK from Nelson, D. L.; Cox, M. M. Lehninger Principles of
+Biochemistry, Fourth Edition; W. H. Freeman, 2004; p. 1100.
+"""
+
+pK_sillero = {
+    'E':   [(4.5,  -1)],
+    'R':   [(12.0, +1)],
+    'Y':   [(10.0, -1)],
+    'D':   [(4.0,  -1)],
+    'H':   [(6.4,  +1)],
+    'K':   [(10.4, +1)],
+    'C':   [(9.0,  -1)],
+    'H-':  [(8.2,  +1)],
+    '-OH': [(3.2,  -1)],
+    }
+"""A set of pK from Sillero, A.; Ribeiro, J. Isoelectric points of proteins:
+Theoretical determination. Analytical Biochemistry, vol. 179 (2), pp. 319-325,
+1989.
+"""
+
+pK_dawson = {
+    'E':   [(4.3,  -1)],
+    'R':   [(12.0, +1)],
+    'Y':   [(10.1, -1)],
+    'D':   [(3.9,  -1)],
+    'H':   [(6.0,  +1)],
+    'K':   [(10.5, +1)],
+    'C':   [(8.3,  -1)],
+    'H-':  [(8.2,  +1)],
+    '-OH': [(3.2,  -1)],
+    }
+"""A set of pK from Dawson, R. M. C.; Elliot, D. C.; Elliot, W. H.; Jones,
+K. M.  Data for biochemical research. Oxford University Press, 1989; p. 592.
+pKs for NH2- and -OH are taken from `pK_sillero`.
+"""
+
+pK_rodwell = {
+    'E':   [(4.25, -1)],
+    'R':   [(11.5, +1)],
+    'Y':   [(10.7, -1)],
+    'D':   [(3.86, -1)],
+    'H':   [(6.0,  +1)],
+    'K':   [(11.5, +1)],
+    'C':   [(8.33, -1)],
+    'H-':  [(8.0,  +1)],
+    '-OH': [(3.1,  -1)],
+}
+"""A set of pK from Rodwell, J. Heterogeneity of component bands in
+isoelectric focusing patterns. Analytical Biochemistry, vol. 119 (2),
+pp. 440-449, 1982.
+"""
+
+pK_bjellqvist = {
+    'E':   [(4.45, -1)],
+    'R':   [(12.0, +1)],
+    'Y':   [(10.0, -1)],
+    'D':   [(4.05, -1)],
+    'H':   [(5.98, +1)],
+    'K':   [(10.0, +1)],
+    'C':   [(9.0,  -1)],
+    'H-':  [(7.5,  +1)],
+    '-OH': [(3.55, -1)],
+}
+"""
+A set of pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points
+correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+"""
+
+pK_nterm_bjellqvist = {
+    'H-': {
+        'A': [(7.59, +1)],
+        'M': [(7.0,  +1)],
+        'S': [(6.93, +1)],
+        'P': [(8.36, +1)],
+        'T': [(6.82, +1)],
+        'V': [(7.44, +1)],
+        'E': [(7.7,  +1)]
+        }
+    }
+"""
+A set of N-terminal pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points
+correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+"""
+
+pK_cterm_bjellqvist = {
+    '-OH': {
+        'D': [(4.55, -1)],
+        'E': [(4.75, -1)]
+        }
+    }
+"""
+A set of C-terminal pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
+Reference points for comparisons of two-dimensional maps of proteins from
+different human cell types defined in a pH scale where isoelectric points
+correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
+"""
+
+hydropathicity_KD = {
+    "A": 1.800,
+    "R": -4.500,
+    "N": -3.500,
+    "D": -3.500,
+    "C": 2.500,
+    "Q": -3.500,
+    "E": -3.500,
+    "G": -0.400,
+    "H": -3.200,
+    "I": 4.500,
+    "L": 3.800,
+    "K": -3.900,
+    "M": 1.900,
+    "F": 2.800,
+    "P": -1.600,
+    "S": -0.800,
+    "T": -0.700,
+    "W": -0.900,
+    "Y": -1.300,
+    "V": 4.200,
+}
+"""
+A set of hydropathicity indexes from Kyte, J.; Doolittle, R. F. J. Mol. Biol. 1982, 157 (1), 105-132.
+"""
+
+
+def gravy(sequence, hydropathicity=hydropathicity_KD):
+    """
+    Calculate the GRand AVerage of hYdropathicity (GRAVY) index for an amino acid sequence.
+
+    Parameters
+    ----------
+    sequence : str
+        Polypeptide sequence in one-letter format.
+    hydropathicity : dict, optional
+        Hydropathicity indexes of amino acids. Default is :py:data:`hydropathicity_KD`.
+
+    Returns
+    -------
+    out : float
+        GRand AVerage of hYdropathicity (GRAVY) index.
+
+    Examples
+    --------
+    >>> round(gravy('PEPTIDE'), 4)
+    -1.4143
+    """
+    try:
+        return sum(hydropathicity[aa] for aa in sequence) / len(sequence)
+    except KeyError as e:
+        raise PyteomicsError("Hydropathicity for amino acid {} not provided.".format(e.args[0]))
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
diff --git a/pyteomics/fasta.py b/pyteomics/fasta.py
new file mode 100644
index 0000000000000000000000000000000000000000..61d38c3e36da9eb2b748ae086caf006f9639bed7
--- /dev/null
+++ b/pyteomics/fasta.py
@@ -0,0 +1,1072 @@
+"""
+fasta - manipulations with FASTA databases
+==========================================
+
+FASTA is a simple file format for protein sequence databases. Please refer to
+`the NCBI website <http://www.ncbi.nlm.nih.gov/blast/fasta.shtml>`_
+for the most detailed information on the format.
+
+Data manipulation
+-----------------
+
+Classes
+.......
+
+Several classes of FASTA parsers are available. All of them have common features:
+
+ - context manager support;
+
+ - header parsing;
+
+ - direct iteration.
+
+Available classes:
+
+  :py:class:`FASTABase` - common ancestor, suitable for type checking.
+  Abstract class.
+
+  :py:class:`FASTA` - text-mode, sequential parser.
+  Good for iteration over database entries.
+
+  :py:class:`IndexedFASTA` - binary-mode, indexing parser.
+  Supports direct indexing by header string.
+
+  :py:class:`TwoLayerIndexedFASTA` - additionally supports
+  indexing by extracted header fields.
+
+  :py:class:`UniProt` and :py:class:`IndexedUniProt`,
+  :py:class:`UniParc` and :py:class:`IndexedUniParc`,
+  :py:class:`UniMes` and :py:class:`IndexedUniMes`,
+  :py:class:`UniRef` and :py:class:`IndexedUniRef`,
+  :py:class:`SPD` and :py:class:`IndexedSPD`,
+  :py:class:`NCBI` and :py:class:`IndexedNCBI`,
+  :py:class:`RefSeq` and :py:class:`IndexedRefSeq`, - format-specific parsers.
+
+Functions
+.........
+
+  :py:func:`read` - returns an instance of the appropriate reader class,
+  for sequential iteration or random access.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`write` - write entries to a FASTA database.
+
+  :py:func:`parse` - parse a FASTA header.
+
+Decoy sequence generation
+-------------------------
+
+:py:func:`decoy_sequence` - generate a decoy sequence from a given sequence, using
+one of the other functions listed in this section or any other callable.
+
+:py:func:`reverse` - generate a reversed decoy sequence.
+
+:py:func:`shuffle` - generate a shuffled decoy sequence.
+
+:py:func:`fused_decoy` - generate a "fused" decoy sequence.
+
+
+Decoy database generation
+-------------------------
+
+  :py:func:`write_decoy_db` - generate a decoy database and write it to a file.
+
+  :py:func:`decoy_db` - generate entries for a decoy database from a given FASTA
+  database.
+
+  :py:func:`decoy_entries` - generate decoy entries for an iterator.
+
+  :py:func:`decoy_chain` - a version of :py:func:`decoy_db` for multiple files.
+
+  :py:func:`decoy_chain.from_iterable` - like :py:func:`decoy_chain`, but with
+  an iterable of files.
+
+Auxiliary
+---------
+
+  :py:data:`std_parsers` - a dictionary with parsers for known FASTA header
+  formats.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import itertools
+import random
+from collections import namedtuple
+import re
+import abc
+from . import auxiliary as aux
+from .auxiliary.utils import add_metaclass
+
+
+Protein = namedtuple('Protein', ('description', 'sequence'))
+DECOY_PREFIX = 'DECOY_'
+RAW_HEADER_KEY = '__raw__'
+
+
+def _add_raw_field(parser):
+    """
+    Add the :py:const:`RAW_HEADER_KEY` field to the parsed dictionary.
+
+    Parameters
+    ----------
+    parser : func
+        parser function.
+
+    Returns
+    -------
+    func
+        The wrapped parser function, which stores the raw header string
+        under :py:const:`RAW_HEADER_KEY`.
+
+    """
+    def _new_parser(instance, descr):
+        parsed = parser(instance, descr)
+        if RAW_HEADER_KEY not in parsed:
+            parsed[RAW_HEADER_KEY] = descr
+        elif parsed[RAW_HEADER_KEY] != descr:
+            raise aux.PyteomicsError('Cannot save raw protein header, since the corresponding '
+                                     'key ({}) already exists.'.format(RAW_HEADER_KEY))
+        return parsed
+
+    return _new_parser
+
+
+class FASTABase(object):
+    """Abstract base class for FASTA file parsers.
+    Can be used for type checking.
+    """
+    parser = None
+    _ignore_comments = False
+    _comments = set('>;')
+
+    def __init__(self, source, **kwargs):
+        self._ignore_comments = kwargs.pop('ignore_comments', False)
+        parser = kwargs.pop('parser', None)
+        if parser is not None:
+            self.parser = parser
+        super(FASTABase, self).__init__(source, **kwargs)
+
+    def _is_comment(self, line):
+        return line[0] in self._comments
+
+    def get_entry(self, key):
+        raise NotImplementedError
+
+
+class FASTA(FASTABase, aux.FileReader):
+    """Text-mode, sequential FASTA parser.
+    Suitable for iteration over the file to obtain all entries in order.
+    """
+    def __init__(self, source, ignore_comments=False, parser=None, encoding=None):
+        """Create a new FASTA parser object. Supports iteration,
+        yields `(description, sequence)` tuples. Supports `with` syntax.
+
+        Parameters
+        ----------
+
+        source : str or file-like
+            File to read. If file object, it must be opened in *text* mode.
+        ignore_comments : bool, optional
+            If :py:const:`True` then ignore the second and subsequent lines of description.
+            Default is :py:const:`False`, which concatenates multi-line descriptions into
+            a single string.
+        parser : function or None, optional
+            Defines whether the FASTA descriptions should be parsed. If it is a
+            function, that function will be given the description string, and
+            the returned value will be yielded together with the sequence.
+            The :py:data:`std_parsers` dict has parsers for several formats.
+            Hint: specify :py:func:`parse` as the parser to apply automatic
+            format recognition.
+            Default is :py:const:`None`, which means return the header "as is".
+        encoding : str or None, optional
+            File encoding (if it is given by name).
+        """
+        super(FASTA, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={},
+            encoding=encoding, ignore_comments=ignore_comments, parser=parser)
+
+    def _read(self):
+        accumulated_strings = []
+
+        # Iterate through '>' after the file is over to retrieve the last entry.
+        for string in itertools.chain(self._source, '>'):
+            stripped_string = string.strip()
+
+            # Skip empty lines.
+            if not stripped_string:
+                continue
+
+            is_comment = self._is_comment(stripped_string)
+            if is_comment:
+                # If it is a continuing comment
+                if len(accumulated_strings) == 1:
+                    if not self._ignore_comments:
+                        accumulated_strings[0] += (' ' + stripped_string[1:])
+                    else:
+                        continue
+
+                elif accumulated_strings:
+                    description = accumulated_strings[0]
+                    sequence = ''.join(accumulated_strings[1:])
+
+                    # Drop the translation stop sign.
+                    if sequence and sequence[-1] == '*':
+                        sequence = sequence[:-1]
+                    if self.parser is not None:
+                        description = self.parser(description)
+                    yield Protein(description, sequence)
+                    accumulated_strings = [stripped_string[1:]]
+                else:
+                    # accumulated_strings is empty; we're probably reading
+                    # the very first line of the file
+                    accumulated_strings.append(stripped_string[1:])
+            else:
+                accumulated_strings.append(stripped_string)
+
+    def get_entry(self, key):
+        raise aux.PyteomicsError('Direct indexing is not supported. '
+            'Use IndexedFASTA and its subclasses')
+
+
+def _reconstruct(cls, args, kwargs):
+    kwargs['_skip_index'] = True
+    return cls(*args, **kwargs)
+
+
+class IndexedFASTA(FASTABase, aux.TaskMappingMixin, aux.IndexedTextReader):
+    """Indexed FASTA parser. Supports direct indexing by matched labels."""
+    delimiter = '\n>'
+    label = r'^[\n]?>(.*)\s*'
+
+    def __init__(self, source, ignore_comments=False, parser=None, **kwargs):
+        """Create an indexed FASTA parser object.
+
+        Parameters
+        ----------
+        source : str or file-like
+            File to read. If file object, it must be opened in *binary* mode.
+        ignore_comments : bool, optional
+            If :py:const:`True` then ignore the second and subsequent lines of description.
+            Default is :py:const:`False`, which concatenates multi-line descriptions into
+            a single string.
+        parser : function or None, optional
+            Defines whether the FASTA descriptions should be parsed. If it is a
+            function, that function will be given the description string, and
+            the returned value will be yielded together with the sequence.
+            The :py:data:`std_parsers` dict has parsers for several formats.
+            Hint: specify :py:func:`parse` as the parser to apply automatic
+            format recognition.
+            Default is :py:const:`None`, which means return the header "as is".
+        encoding : str or None, optional, keyword only
+            File encoding. Default is UTF-8.
+        block_size : int or None, optional, keyword only
+            Number of bytes to consume at once.
+        delimiter : str or None, optional, keyword only
+            Overrides the FASTA record delimiter (default is ``'\\n>'``).
+        label : str or None, optional, keyword only
+            Overrides the FASTA record label pattern. Default is ``'^[\\n]?>(.*)'``.
+        label_group : int or str, optional, keyword only
+            Overrides the matched group used as key in the byte offset index.
+            This in combination with `label` can be used to extract fields from headers.
+            However, consider using :py:class:`TwoLayerIndexedFASTA` for this purpose.
+        """
+        super(IndexedFASTA, self).__init__(source, ignore_comments=ignore_comments, parser=parser,
+            parser_func=self._read, pass_file=False, args=(), kwargs={}, **kwargs)
+        self._init_args = (source, ignore_comments, parser)
+        self._init_kwargs = kwargs
+
+    def __reduce_ex__(self, protocol):
+        return (_reconstruct,
+            (self.__class__, self._init_args, self._init_kwargs),
+            self.__getstate__())
+
+    def _read_protein_lines(self, lines):
+        description = []
+        sequence = []
+
+        for string in lines:
+            stripped_string = string.strip()
+            if not stripped_string:
+                continue
+
+            is_comment = self._is_comment(stripped_string)
+            if is_comment:
+                if not description or not self._ignore_comments:
+                    description.append(stripped_string[1:])
+            else:
+                sequence.append(stripped_string)
+
+        description = ' '.join(description)
+        sequence = ''.join(sequence)
+        # Drop the translation stop sign.
+        if sequence and sequence[-1] == '*':
+            sequence = sequence[:-1]
+        if self.parser is not None:
+            description = self.parser(description)
+        return Protein(description, sequence)
+
+    def _item_from_offsets(self, offsets):
+        start, end = offsets
+        lines = self._read_lines_from_offsets(start, end)
+        return self._read_protein_lines(lines)
+
+    def _read(self, **kwargs):
+        for key, offsets in self._offset_index.items():
+            yield self._item_from_offsets(offsets)
+
+    def get_entry(self, key):
+        return self.get_by_id(key)
+
+
+class TwoLayerIndexedFASTA(IndexedFASTA):
+    """Parser with two-layer index. Extracted groups are mapped to full headers (where possible),
+    full headers are mapped to byte offsets.
+
+    When indexed, the key is looked up in both indexes, allowing access by meaningful IDs
+    (like UniProt accession) and by full header string.
+    """
+    header_group = 1
+    header_pattern = None
+    def __init__(self, source, header_pattern=None, header_group=None,
+        ignore_comments=False, parser=None, **kwargs):
+        """Open `source` and create a two-layer index for convenient random access
+        both by full header strings and extracted fields.
+
+        Parameters
+        ----------
+        source : str or file-like
+            File to read. If file object, it must be opened in *binary* mode.
+        header_pattern : str or RE or None, optional
+            Pattern to match the header string. Must capture the group used
+            for the second index. If :py:const:`None` (default), second-level index is not created.
+        header_group : int or str or None, optional
+            Defines which group is used as key in the second-level index.
+            Default is 1.
+        ignore_comments : bool, optional
+            If :py:const:`True` then ignore the second and subsequent lines of description.
+            Default is :py:const:`False`, which concatenates multi-line descriptions into
+            a single string.
+        parser : function or None, optional
+            Defines whether the FASTA descriptions should be parsed. If it is a
+            function, that function will be given the description string, and
+            the returned value will be yielded together with the sequence.
+            The :py:data:`std_parsers` dict has parsers for several formats.
+            Hint: specify :py:func:`parse` as the parser to apply automatic
+            format recognition.
+            Default is :py:const:`None`, which means return the header "as is".
+
+        Other arguments : the same as for :py:class:`IndexedFASTA`.
+        """
+        super(TwoLayerIndexedFASTA, self).__init__(source, ignore_comments, parser, **kwargs)
+        if header_group is not None:
+            self.header_group = header_group
+        if header_pattern is not None:
+            self.header_pattern = header_pattern
+        if not kwargs.get('_skip_index', False):
+            self.build_second_index()
+        self._init_args = (source, header_pattern, header_group, ignore_comments, parser)
+        self._init_kwargs = kwargs
+
+    def build_second_index(self):
+        """Create the mapping from extracted field to whole header string."""
+        if self.header_pattern is None:
+            self._id2header = None
+        else:
+            index = {}
+            for key in self._offset_index:
+                match = re.match(self.header_pattern, key)
+                if match:
+                    index[match.group(self.header_group)] = key
+            self._id2header = index
+
+    def __getstate__(self):
+        state = super(TwoLayerIndexedFASTA, self).__getstate__()
+        state['id2header'] = self._id2header
+        return state
+
+    def __setstate__(self, state):
+        super(TwoLayerIndexedFASTA, self).__setstate__(state)
+        self._id2header = state['id2header']
+
+    def get_by_id(self, key):
+        """Get the entry by value of header string or extracted field."""
+        try:
+            return super(TwoLayerIndexedFASTA, self).get_by_id(key)
+        except KeyError:
+            if self._id2header:
+                header = self._id2header.get(key)
+                if header is not None:
+                    return super(TwoLayerIndexedFASTA, self).get_entry(header)
+        raise KeyError(key)
+
+    def get_header(self, key):
+        if self._id2header and key in self._id2header:
+            return self._id2header[key]
+        raise KeyError(key)
+
+    def __contains__(self, key):
+        if super(TwoLayerIndexedFASTA, self).__contains__(key):
+            return True
+        return bool(self._id2header) and key in self._id2header
+
+
+class _FastaParserFlavorMeta(abc.ABCMeta):
+    def __new__(mcs, name, bases, namespace):
+        if "parser" in namespace:
+            namespace["parser"] = _add_raw_field(namespace["parser"])
+        if name != 'FlavoredMixin':
+            reader_type = None
+            for t in (FASTA, IndexedFASTA, TwoLayerIndexedFASTA):
+                if t in bases:
+                    reader_type = t
+
+            if reader_type is not None:
+                # this is a "concrete" reader class
+                # add a unified __init__ method for it
+                for c in bases:
+                    if issubclass(c, FlavoredMixin):
+                        flavor = c
+                        break
+                else:
+                    raise aux.PyteomicsError('Could not detect flavor of {}, '
+                        'not a subclass of `FlavoredMixin`.'.format(name))
+
+                def __init__(self, source, parse=True, **kwargs):
+                    reader_type.__init__(self, source, **kwargs)
+                    flavor.__init__(self, parse)
+                    self._init_args = (source, parse)
+                    self._init_kwargs = kwargs
+
+                flavor_name = flavor.__name__[:-5]
+                type_name = "Text-mode" if reader_type is FASTA else "Indexed"
+                __init__.__doc__ = """Creates a :py:class:`{}` object.
+
+                Parameters
+                ----------
+                source : str or file
+                    The file to read. If a file object, it needs to be in *{}* mode.
+                parse : bool, optional
+                    Defines whether the descriptions should be parsed in the produced tuples.
+                    Default is :py:const:`True`.
+                kwargs : passed to the :py:class:`{}` constructor.
+                """.format(name, 'text' if reader_type is FASTA else 'binary', reader_type.__name__)
+                namespace['__init__'] = __init__
+                namespace['__doc__'] = """{} parser for {} FASTA files.""".format(type_name, flavor_name)
+
+        return super(_FastaParserFlavorMeta, mcs).__new__(mcs, name, bases, namespace)
+
+
+@add_metaclass(_FastaParserFlavorMeta)
+class FlavoredMixin():
+    """Parser aimed at a specific FASTA flavor.
+    Subclasses should define `parser` and `header_pattern`.
+    The `parse` argument in :py:meth:`__init__` defines whether description is
+    parsed in output.
+    """
+    def __init__(self, parse=True):
+        if not parse:
+            self.parser = None
+
+
+class UniProtMixin(FlavoredMixin):
+    header_pattern = r'^(?P<db>\w+)\|(?P<id>[-\w]+)\|(?P<entry>\w+)\s+(?P<name>.*?)(?:(\s+OS=(?P<OS>[^=]+))|(\s+OX=(?P<OX>\d+))|(\s+GN=(?P<GN>\S+))|(\s+PE=(?P<PE>\d))|(\s+SV=(?P<SV>\d+)))*\s*$'
+    header_group = 'id'
+
+    def parser(self, header):
+        info = re.match(self.header_pattern, header).groupdict()
+        for key in ['OS', 'OX', 'GN', 'PE', 'SV']:
+            if info[key] is None:
+                del info[key]
+        info['gene_id'], info['taxon'] = info['entry'].split('_')
+        _intify(info, ('PE', 'SV', 'OX'))
+        return info
+
+
+class UniProt(UniProtMixin, FASTA):
+    pass
+
+
+class IndexedUniProt(UniProtMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class UniRefMixin(FlavoredMixin):
+    header_pattern = r'^(?P<id>\S+)\s+(?P<cluster>.*?)(?:(\s+n=(?P<n>\d+))|(\s+Tax=(?P<Tax>.+?))|(\s+TaxID=(?P<TaxID>\S+))|(\s+RepID=(?P<RepID>\S+)))*\s*$'
+    header_group = 'id'
+
+    def parser(self, header):
+        assert 'Tax' in header
+        info = re.match(self.header_pattern, header).groupdict()
+        for key in ['TaxID', 'Tax', 'RepID', 'n']:
+            if info[key] is None:
+                del info[key]
+        _intify(info, ('n',))
+        return info
+
+
+class UniRef(UniRefMixin, FASTA):
+    pass
+
+
+class IndexedUniRef(UniRefMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class UniParcMixin(FlavoredMixin):
+    header_pattern = r'(\S+)\s+status=(\w+)\s*$'
+
+    def parser(self, header):
+        ID, status = re.match(self.header_pattern, header).groups()
+        return {'id': ID, 'status': status}
+
+
+class UniParc(UniParcMixin, FASTA):
+    pass
+
+
+class IndexedUniParc(UniParcMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class UniMesMixin(FlavoredMixin):
+    header_pattern = r'^(\S+)\s+([^=]*\S)((\s+\w+=[^=]+(?!\w*=))+)\s*$'
+
+    def parser(self, header):
+        assert 'OS=' in header and 'SV=' in header and 'PE=' not in header
+        ID, name, pairs, _ = re.match(self.header_pattern, header).groups()
+        info = {'id': ID, 'name': name}
+        info.update(_split_pairs(pairs))
+        _intify(info, ('SV',))
+        return info
+
+
+class UniMes(UniMesMixin, FASTA):
+    pass
+
+
+class IndexedUniMes(UniMesMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class SPDMixin(FlavoredMixin):
+    header_pattern = r'^([^|]+?)\s*\|\s*(([^|]+?)_([^|]+?))\s*\|\s*([^|]+?)\s*$'
+
+    def parser(self, header):
+        assert '=' not in header
+        ID, gene, gid, taxon, d = re.match(self.header_pattern, header).groups()
+        return {'id': ID, 'gene': gene, 'description': d,
+                'taxon': taxon, 'gene_id': gid}
+
+
+class SPD(SPDMixin, FASTA):
+    pass
+
+
+class IndexedSPD(SPDMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class NCBIMixin(FlavoredMixin):
+    header_pattern = r'^(\S+)\s+(.*\S)\s+\[(.*)\]'
+
+    def parser(self, header):
+        ID, description, organism = re.match(self.header_pattern, header).groups()
+        return {'id': ID, 'description': description, 'taxon': organism}
+
+
+class NCBI(NCBIMixin, FASTA):
+    pass
+
+
+class IndexedNCBI(NCBIMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+class RefSeqMixin(FlavoredMixin):
+    header_pattern = r'^ref\|([^|]+)\|\s*([^\[]*\S)\s*\[(.*)\]'
+
+    def parser(self, header):
+        ID, description, organism = re.match(self.header_pattern, header).groups()
+        return {'id': ID, 'description': description, 'taxon': organism}
+
+
+class RefSeq(RefSeqMixin, FASTA):
+    pass
+
+
+class IndexedRefSeq(RefSeqMixin, TwoLayerIndexedFASTA):
+    pass
+
+
+def read(source=None, use_index=None, flavor=None, **kwargs):
+    """Parse a FASTA file. This function serves as a dispatcher between
+    different parsers available in this module.
+
+    Parameters
+    ----------
+    source : str or file or None, optional
+        A file object (or file name) with a FASTA database. Default is
+        :py:const:`None`, which means read standard input.
+    use_index : bool, optional
+        If :py:const:`True`, the created parser object will be an instance of
+        :py:class:`IndexedFASTA`. If :py:const:`False` (default), it will be
+        an instance of :py:class:`FASTA`.
+    flavor : str or None, optional
+        A supported FASTA header format. If specified, a format-specific
+        parser instance is returned.
+
+        .. note:: See :py:data:`std_parsers` for supported flavors.
+
+    Returns
+    -------
+    out : FASTABase subclass instance
+        A reader object. Iteration yields named 2-tuples with the FASTA
+        header (str or dict) and the sequence (str), also accessible as the
+        attributes `description` and `sequence`.
+    """
+    try:
+        parser = std_parsers[flavor and flavor.lower()]
+    except KeyError:
+        raise aux.PyteomicsError('No parser for flavor: {}. Supported flavors: {}'.format(
+            flavor, ', '.join(map(str, std_parsers))))
+    use_index = aux._check_use_index(source, use_index, False)
+    return parser[use_index](source, **kwargs)
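+
+
+# A minimal usage sketch (illustrative): `read` also accepts open file objects,
+# e.g. an in-memory text buffer; with no `flavor`, headers are returned "as is".
+#
+#     import io
+#     buf = io.StringIO('>sp|P00000|TEST_HUMAN A hypothetical protein\nPEPTIDE\n')
+#     with read(buf) as reader:
+#         for description, sequence in reader:
+#             pass  # each entry is a Protein(description, sequence) named tuple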
+
+
+@aux._file_writer()
+def write(entries, output=None):
+    """
+    Create a FASTA file with `entries`.
+
+    Parameters
+    ----------
+    entries : iterable of (str/dict, str) tuples
+        An iterable of 2-tuples in the form (description, sequence).
+        If description is a dictionary, it must have a special key, whose value
+        will be written as protein description. The special key is defined by the variable
+        :py:const:`RAW_HEADER_KEY`.
+    output : file-like or str, optional
+        A file open for writing or a path to write to. If the file exists,
+        it will be opened for writing. Default is :py:const:`None`, which
+        means write to standard output.
+
+        .. note::
+            The default mode for output files specified by name has been changed
+            from `a` to `w` in *pyteomics 4.6*. See `file_mode` to override the mode.
+
+    file_mode : str, keyword only, optional
+        If `output` is a file name, defines the mode the file will be opened in.
+        Otherwise will be ignored. Default is `'w'`.
+
+        .. note ::
+            The default changed from `'a'` in *pyteomics 4.6*.
+
+    Returns
+    -------
+    output_file : file object
+        The file where the FASTA is written.
+    """
+    for descr, seq in entries:
+        if isinstance(descr, str):
+            output.write('>' + descr.replace('\n', '\n;') + '\n')
+        elif isinstance(descr, dict) and RAW_HEADER_KEY in descr:
+            output.write('>' + descr[RAW_HEADER_KEY].replace('\n', '\n;') + '\n')
+        else:
+            raise aux.PyteomicsError('Cannot use provided description: ' + repr(descr))
+        output.write(''.join([('%s\n' % seq[i:i+70])
+            for i in range(0, len(seq), 70)]) + '\n')
+
+    return output.file
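+
+
+# A minimal usage sketch (illustrative): writing two entries to an in-memory
+# buffer; a file name can be passed as `output` instead.
+#
+#     import io
+#     buf = io.StringIO()
+#     write([('protA hypothetical', 'PEPTIDE'), ('protB hypothetical', 'EDITPEP')], buf)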
+
+
+def reverse(sequence, keep_nterm=False, keep_cterm=False):
+    """
+    Create a decoy sequence by reversing the original one.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    keep_nterm : bool, optional
+        If :py:const:`True`, then the N-terminal residue will be kept.
+        Default is :py:const:`False`.
+    keep_cterm : bool, optional
+        If :py:const:`True`, then the C-terminal residue will be kept.
+        Default is :py:const:`False`.
+
+    Returns
+    -------
+    decoy_sequence : str
+        The decoy sequence.
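+
+    Examples
+    --------
+    For example:
+
+    >>> reverse('PEPTIDE')
+    'EDITPEP'
+    >>> reverse('PEPTIDE', keep_nterm=True)
+    'PEDITPE'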
+    """
+    start = 1 if keep_nterm else 0
+    end = len(sequence)-1 if keep_cterm else len(sequence)
+    if start >= end:
+        # nothing to reverse; also covers single-residue sequences with both termini kept
+        return sequence
+    return sequence[:start] + sequence[start:end][::-1] + sequence[end:]
+
+
+def shuffle(sequence, keep_nterm=False, keep_cterm=False, keep_nterm_M=False, fix_aa=''):
+    """
+    Create a decoy sequence by shuffling the original one.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    keep_nterm : bool, optional
+        If :py:const:`True`, then the N-terminal residue will be kept.
+        Default is :py:const:`False`.
+    keep_cterm : bool, optional
+        If :py:const:`True`, then the C-terminal residue will be kept.
+        Default is :py:const:`False`.
+    keep_nterm_M : bool, optional
+        If :py:const:`True`, then the N-terminal methionine will be kept.
+        Default is :py:const:`False`.
+    fix_aa : iterable, optional
+        Single letter codes for amino acids that should preserve their position
+        during shuffling.
+        Default is ''.
+
+    Returns
+    -------
+    decoy_sequence : str
+        The decoy sequence.
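+
+    Examples
+    --------
+    The output is randomized, but fixed residues keep their positions:
+
+    >>> s = shuffle('PEPTIDE', fix_aa='P')
+    >>> sorted(s) == sorted('PEPTIDE') and s[0] == s[2] == 'P'
+    True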
+    """
+
+    # empty sequence
+    if len(sequence) == 0:
+        return ''
+
+    # preserve the first position
+    if (keep_nterm_M and sequence[0] == 'M') or keep_nterm:
+        return sequence[0] + shuffle(sequence[1:], keep_cterm=keep_cterm,
+                                     fix_aa=fix_aa)
+
+    # preserve the last position
+    if keep_cterm:
+        return shuffle(sequence[:-1], fix_aa=fix_aa) + sequence[-1]
+
+    if not isinstance(fix_aa, str):
+        fix_aa = ''.join(fix_aa)
+
+    fixed = []
+    position = 0
+    if len(fix_aa) > 0:  # non-empty fixed list
+        shuffled = []
+        for match in re.finditer(r'[{}]'.format(fix_aa), sequence):
+            fixed.append((match.start(), sequence[match.start()]))
+            shuffled.extend(sequence[position:match.start()])
+            position = match.end()
+        shuffled.extend(sequence[position:])
+
+    else:  # shuffle everything
+        shuffled = list(sequence)
+
+    random.shuffle(shuffled)
+
+    for fix in fixed:
+        shuffled.insert(fix[0], fix[1])
+
+    return ''.join(shuffled)
+
+
+def fused_decoy(sequence, decoy_mode='reverse', sep='R', **kwargs):
+    """
+    Create a "fused" decoy sequence by concatenating a decoy sequence with the original one.
+    The method and its use cases are described in:
+
+    Ivanov, M. V., Levitsky, L. I., & Gorshkov, M. V. (2016).
+    `Adaptation of Decoy Fusion Strategy for Existing Multi-Stage Search Workflows.
+    <http://doi.org/10.1007/s13361-016-1436-7>`_
+    Journal of The American Society for Mass Spectrometry, 27(9), 1579-1582.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    decoy_mode : str or callable, optional
+        Type of decoy sequence to use. Should be one of the standard modes or any callable.
+        Standard modes are:
+
+        - 'reverse' for :py:func:`reverse`;
+        - 'shuffle' for :py:func:`shuffle`;
+        - 'fused' for :py:func:`fused_decoy` (if you love recursion).
+
+        Default is 'reverse'.
+    sep : str, optional
+        Amino acid motif that separates the decoy sequence from the target one.
+        This setting should reflect the enzyme specificity used in the search against the
+        database being generated. Default is 'R', which is suitable for trypsin searches.
+    **kwargs : given to the decoy generation function.
+
+    Examples
+    --------
+    >>> fused_decoy('PEPT')
+    'TPEPRPEPT'
+
+    The 'shuffle' mode is randomized, so this example's output varies between runs:
+
+    >>> fused_decoy('MPEPT', 'shuffle', 'K', keep_nterm=True)  # doctest: +SKIP
+    'MPPTEKMPEPT'
+    """
+    decoy = decoy_sequence(sequence, decoy_mode, **kwargs)
+    return decoy + sep + sequence
+
+
+_decoy_functions = {'reverse': reverse, 'shuffle': shuffle, 'fused': fused_decoy}
+
+
+def decoy_sequence(sequence, mode='reverse', **kwargs):
+    """
+    Create a decoy sequence out of a given sequence string.
+
+    Parameters
+    ----------
+    sequence : str
+        The initial sequence string.
+    mode : str or callable, optional
+        Type of decoy sequence. Should be one of the standard modes or any callable.
+        Standard modes are:
+
+        - 'reverse' for :py:func:`reverse`;
+        - 'shuffle' for :py:func:`shuffle`;
+        - 'fused' for :py:func:`fused_decoy`.
+
+        Default is 'reverse'.
+    **kwargs : given to the decoy function.
+
+    Returns
+    -------
+    decoy_sequence : str
+        The decoy sequence.
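+
+    Examples
+    --------
+    The 'reverse' and 'fused' modes are deterministic:
+
+    >>> decoy_sequence('PEPTIDE')
+    'EDITPEP'
+    >>> decoy_sequence('PEPTIDE', 'fused')
+    'EDITPEPRPEPTIDE'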
+    """
+    fmode = mode
+    if isinstance(mode, str):
+        fmode = _decoy_functions.get(mode)
+        if fmode is None:
+            raise aux.PyteomicsError('Unsupported decoy mode: {}'.format(mode))
+    return fmode(sequence, **kwargs)
+
+
+def decoy_entries(entries, mode='reverse', prefix=DECOY_PREFIX, decoy_only=True, **kwargs):
+    """Iterate over protein `entries` (tuples) and produce decoy entries.
+    The `entries` are only iterated once.
+
+    Parameters
+    ----------
+    entries : iterable of tuples
+        Any iterable of (description, sequence) pairs.
+    mode : str or callable, optional
+        Algorithm of decoy sequence generation. 'reverse' by default.
+        See :py:func:`decoy_sequence` for more information.
+    prefix : str, optional
+        A prefix to the protein descriptions of decoy entries. The default
+        value is `'DECOY_'`.
+    decoy_only : bool, optional
+        If set to :py:const:`True`, only the decoy entries will be written to
+        `output`. If :py:const:`False`, each consumed entry is yielded unchanged,
+        followed by its decoy counterpart.
+        :py:const:`True` by default.
+    **kwargs : given to :py:func:`decoy_sequence`.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over new entries.
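+
+    Examples
+    --------
+    With the defaults, each input entry yields one reversed decoy entry:
+
+    >>> list(decoy_entries([('protA', 'PEPTIDE')]))
+    [Protein(description='DECOY_protA', sequence='EDITPEP')]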
+    """
+    for item in entries:
+        if not decoy_only:
+            yield item
+        yield Protein(prefix + item[0], decoy_sequence(item[1], mode, **kwargs))
+
+
+@aux._file_reader()
+def decoy_db(source=None, mode='reverse', prefix=DECOY_PREFIX, decoy_only=False,
+             ignore_comments=False, parser=None, **kwargs):
+    """Iterate over sequences for a decoy database out of a given ``source``.
+
+    Parameters
+    ----------
+    source : file-like object or str or None, optional
+        A path to a FASTA database or a file object itself. Default is
+        :py:const:`None`, which means read standard input.
+    mode : str or callable, optional
+        Algorithm of decoy sequence generation. 'reverse' by default.
+        See :py:func:`decoy_sequence` for more information.
+    prefix : str, optional
+        A prefix to the protein descriptions of decoy entries. The default
+        value is `'DECOY_'`.
+    decoy_only : bool, optional
+        If set to :py:const:`True`, only the decoy entries will be written to
+        `output`. If :py:const:`False`, the entries from `source` will be
+        written first.
+        :py:const:`False` by default.
+    ignore_comments : bool, optional
+        If :py:const:`True`, ignore the second and subsequent lines of description.
+        Default is :py:const:`False`.
+    parser : function or None, optional
+        Defines whether the FASTA descriptions should be parsed. If it is a
+        function, that function will be given the description string, and
+        the returned value will be yielded together with the sequence.
+        The :py:data:`std_parsers` dict has parsers for several formats.
+        Hint: specify :py:func:`parse` as the parser to apply automatic
+        format guessing.
+        Default is :py:const:`None`, which means return the header "as is".
+    **kwargs : given to :py:func:`decoy_sequence`.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over entries of the new database.
+    """
+
+    # store the initial position
+    pos = source.tell()
+    if not decoy_only:
+        with read(source, ignore_comments=ignore_comments, parser=parser) as f:
+            for x in f:
+                yield x
+        # return to the initial position in the source file to read again
+        source.seek(pos)
+
+    parser = parser or (lambda x: x)
+    with read(source, ignore_comments=ignore_comments) as f:
+        for descr, seq in f:
+            yield Protein(parser(prefix + descr), decoy_sequence(seq, mode, **kwargs))
+
+
+@aux._file_writer()
+def write_decoy_db(source=None, output=None, mode='reverse', prefix=DECOY_PREFIX,
+        decoy_only=False, **kwargs):
+    """Generate a decoy database out of a given ``source`` and write to file.
+
+    If `output` is a path, the file will be opened in the mode given by
+    `file_mode` (`'w'` by default, which overwrites existing files). The user
+    should be careful when providing open file streams as `source` and
+    `output`: reading and writing will start from the current position in the
+    files, which is where the last I/O operation finished. One can use the
+    :py:meth:`file.seek` method to change it.
+
+    Parameters
+    ----------
+    source : file-like object or str or None, optional
+        A path to a FASTA database or a file object itself. Default is
+        :py:const:`None`, which means read standard input.
+    output : file-like object or str, optional
+        A path to the output database or a file open for writing.
+        Defaults to :py:const:`None`, the results go to the standard output.
+    mode : str or callable, optional
+        Algorithm of decoy sequence generation. 'reverse' by default.
+        See :py:func:`decoy_sequence` for more details.
+    prefix : str, optional
+        A prefix to the protein descriptions of decoy entries. The default
+        value is `'DECOY_'`.
+    decoy_only : bool, optional
+        If set to :py:const:`True`, only the decoy entries will be written to
+        `output`. If :py:const:`False`, the entries from `source` will be
+        written as well.
+        :py:const:`False` by default.
+    file_mode : str, keyword only, optional
+        If `output` is a file name, defines the mode the file will be opened in.
+        Otherwise will be ignored. Default is `'w'`.
+
+        .. note ::
+            The default changed from `'a'` in *pyteomics 4.6*.
+    **kwargs : given to :py:func:`decoy_sequence`.
+
+    Returns
+    -------
+    output : file
+        A (closed) file object for the created file.
+    """
+    with decoy_db(source, mode, prefix, decoy_only, **kwargs) as entries:
+        write(entries, output)
+        return output.file
+
+
+# auxiliary functions for parsing of FASTA headers
+def _split_pairs(s):
+    return dict(map(lambda x: x.strip(), x.split('='))
+            for x in re.split(r' (?=\w+=)', s.strip()))
+
+
+def _intify(d, keys):
+    for k in keys:
+        if k in d:
+            d[k] = int(d[k])
+
+
+std_parsers = {'uniprot': (UniProt, IndexedUniProt), 'uniref': (UniRef, IndexedUniRef),
+        'uniparc': (UniParc, IndexedUniParc), 'unimes': (UniMes, IndexedUniMes),
+        'spd': (SPD, IndexedSPD), 'ncbi': (NCBI, IndexedNCBI),
+        'refseq': (RefSeq, IndexedRefSeq),
+        None: (FASTA, IndexedFASTA)}
+"""A dictionary with parsers for known FASTA header formats. For now, supported
+formats are those described at
+`UniProt help page <http://www.uniprot.org/help/fasta-headers>`_."""
+
+
+_std_mixins = {'uniprot': UniProtMixin, 'uniref': UniRefMixin,
+        'uniparc': UniParcMixin, 'unimes': UniMesMixin, 'spd': SPDMixin,
+        'ncbi': NCBIMixin, 'refseq': RefSeqMixin}
+
+
+def parse(header, flavor='auto', parsers=None):
+    """Parse the FASTA header and return a nice dictionary.
+
+    Parameters
+    ----------
+
+    header : str
+        FASTA header to parse
+    flavor : str, optional
+        Short name of the header format (case-insensitive). Valid values are
+        :py:const:`'auto'` and keys of the `parsers` dict. Default is
+        :py:const:`'auto'`, which means try all formats in turn and return the
+        first result that can be obtained without an exception.
+    parsers : dict, optional
+        A dict where keys are format names (lowercased) and values are functions
+        that take a header string and return the parsed header.
+
+    Returns
+    -------
+
+    out : dict
+        A dictionary with the info from the header. The format depends on the
+        flavor.
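+
+    Examples
+    --------
+    For instance, a UniProtKB-style header (P08100 is human rhodopsin):
+
+    >>> info = parse('>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens OX=9606 GN=RHO PE=1 SV=1')
+    >>> info['db'], info['id'], info['gene_id'], info['taxon']
+    ('sp', 'P08100', 'OPSD', 'HUMAN')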
+    """
+    parser_function = lambda cls: cls().parser
+    flavor = flavor.lower()
+    # accept strings with and without leading '>'
+    if header and header[0] == '>':
+        header = header[1:]
+
+    # choose the format
+    known = parsers or _std_mixins
+
+    if flavor == 'auto':
+        for parser in known.values():
+            try:
+                return parser_function(parser)(header)
+            except Exception:
+                pass
+        raise aux.PyteomicsError('Unknown FASTA header format: ' + header)
+    elif flavor in known:
+        try:
+            return parser_function(known[flavor])(header)
+        except Exception as e:
+            raise aux.PyteomicsError('Could not parse header as "{}". '
+                    'The error message was: {}: {}. Header: "{}"'.format(
+                        flavor, type(e).__name__, e.args[0], header))
+    raise aux.PyteomicsError('Unknown flavor: {}'.format(flavor))
+
+
+chain = aux._make_chain(read, 'read')
+decoy_chain = aux._make_chain(decoy_db, 'decoy_db')
diff --git a/pyteomics/mass/__init__.py b/pyteomics/mass/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5b9981ebfaaff792ed64ea1e10581d062d76f07
--- /dev/null
+++ b/pyteomics/mass/__init__.py
@@ -0,0 +1,6 @@
+from .mass import *
+try:
+    from . import unimod
+except ImportError:
+    # SQLAlchemy is not available
+    pass
\ No newline at end of file
diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py
new file mode 100644
index 0000000000000000000000000000000000000000..03afee248adc371397477d41e753619e2f3db8ac
--- /dev/null
+++ b/pyteomics/mass/mass.py
@@ -0,0 +1,1231 @@
+"""
+mass - molecular masses and isotope distributions
+=================================================
+
+Summary
+-------
+
+This module defines general functions for mass and isotope abundance
+calculations. For most of the functions, the user can define a given
+substance in various formats, but all of them will be reduced to the
+:py:func:`Composition <Composition.__init__>` object describing its
+chemical composition.
+
+
+Classes
+-------
+
+  :py:func:`Composition <Composition.__init__>` - a class storing chemical
+  composition of a substance.
+
+  :py:class:`Unimod` - a class representing a Python interface to the
+  `Unimod database <http://unimod.org/>`_
+  (see :py:mod:`pyteomics.mass.unimod` for a much more powerful alternative).
+
+Mass calculations
+-----------------
+
+  :py:func:`calculate_mass` - a general routine for mass / m/z
+  calculation. Can calculate mass for a polypeptide sequence, chemical
+  formula or elemental composition. Supplied with an ion type and
+  charge, the function would calculate m/z.
+
+  :py:func:`fast_mass` - a less powerful but much faster function for
+  polypeptide mass calculation.
+
+  :py:func:`fast_mass2` - a version of `fast_mass` that supports *modX* notation.
+
+Isotopic abundances
+-------------------
+
+  :py:func:`isotopic_composition_abundance` - calculate the relative
+  abundance of a given isotopic composition.
+
+  :py:func:`most_probable_isotopic_composition` - finds the most
+  abundant isotopic composition for a molecule defined by a
+  polypeptide sequence, chemical formula or elemental composition.
+
+  :py:func:`isotopologues` - iterate over possible isotopic compositions of a molecule,
+  possibly filtered by abundance.
+
+Data
+----
+
+  :py:data:`nist_mass` - a dict with exact masses of the most abundant
+  isotopes.
+
+  :py:data:`std_aa_comp` - a dict with the elemental compositions
+  of the standard twenty amino acid residues, selenocysteine and pyrrolysine.
+
+  :py:data:`std_ion_comp` - a dict with the relative elemental
+  compositions of the standard peptide fragment ions.
+
+  :py:data:`std_aa_mass` - a dict with the monoisotopic masses
+  of the standard twenty amino acid residues, selenocysteine and pyrrolysine.
+
+-----------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from __future__ import division
+import math
+from .. import parser
+from ..auxiliary import PyteomicsError, _nist_mass, BasicComposition
+from itertools import chain, product, combinations_with_replacement
+from collections import defaultdict
+try:
+    from urllib import urlopen
+except ImportError:
+    from urllib.request import urlopen
+from datetime import datetime
+import re
+import operator
+import warnings
+
+nist_mass = _nist_mass
+"""
+A dict with the exact element masses downloaded from the NIST website:
+http://www.nist.gov/pml/data/comp.cfm . There are entries for each
+element containing the masses and relative abundances of several
+abundant isotopes and a separate entry for undefined isotope with zero
+key, mass of the most abundant isotope and 1.0 abundance.
+"""
+
+PROTON = 'H+'
+
+def _make_isotope_string(element_name, isotope_num):
+    """Form a string label for an isotope."""
+    if isotope_num == 0:
+        return element_name
+    else:
+        return '{}[{}]'.format(element_name, isotope_num)
+
+
+def _parse_isotope_string(label):
+    """Parse an string with an isotope label and return the element name and
+    the isotope number.
+
+    >>> _parse_isotope_string('C')
+    ('C', 0)
+    >>> _parse_isotope_string('C[12]')
+    ('C', 12)
+    """
+    element_name, num = re.match(_isotope_string, label).groups()
+    isotope_num = int(num) if num else 0
+    return element_name, isotope_num
+
+
+# Initialize std_aa_comp and std_ion_comp before the Composition class
+# definition; they are filled later.
+std_aa_comp = {}
+"""A dictionary with elemental compositions of the twenty standard
+amino acid residues, selenocysteine, pyrrolysine,
+and standard H- and -OH terminal groups.
+"""
+
+std_ion_comp = {}
+"""A dict with relative elemental compositions of the standard peptide
+fragment ions. An elemental composition of a fragment ion is calculated as a
+difference between the total elemental composition of an ion
+and the sum of elemental compositions of its constituting amino acid residues.
+"""
+
+_isotope_string = r'^([A-Z][a-z+]*)(?:\[(\d+)\])?$'
+_atom = r'([A-Z][a-z+]*)(?:\[(\d+)\])?([+-]?\d+)?'
+_formula = r'^({})*$'.format(_atom)
+
+
+class Composition(BasicComposition):
+    """
+    A Composition object stores a chemical composition of a
+    substance. Basically, it is a dict object, with the names
+    of chemical elements as keys and values equal to an integer number of
+    atoms of the corresponding element in a substance.
+
+    The main improvement over dict is that Composition objects allow
+    addition and subtraction.
+    """
+    _kw_sources = {'formula', 'sequence', 'parsed_sequence', 'split_sequence', 'composition'}
+    _carrier_spec = r"^(?P<formula>\S+?)(?:(?P<sign>[+-])(?P<charge>\d+)?)?$"
+
+    def _from_parsed_sequence(self, parsed_sequence, aa_comp):
+        self.clear()
+        comp = defaultdict(int)
+        for label in parsed_sequence:
+            if label in aa_comp:
+                for elem, cnt in aa_comp[label].items():
+                    comp[elem] += cnt
+            else:
+                try:
+                    mod, aa = parser._split_label(label)
+                    for elem, cnt in chain(
+                            aa_comp[mod].items(), aa_comp[aa].items()):
+                        comp[elem] += cnt
+
+                except (PyteomicsError, KeyError):
+                    raise PyteomicsError('No information for %s in `aa_comp`' % label)
+        self._from_composition(comp)
+
+    def _from_split_sequence(self, split_sequence, aa_comp):
+        self.clear()
+        comp = defaultdict(int)
+        for group in split_sequence:
+            i = 0
+            while i < len(group):
+                for j in range(len(group) + 1, -1, -1):
+                    try:
+                        label = ''.join(group[i:j])
+                        for elem, cnt in aa_comp[label].items():
+                            comp[elem] += cnt
+                    except KeyError:
+                        continue
+                    else:
+                        i = j
+                        break
+                if j == 0:
+                    raise PyteomicsError("Invalid group starting from position %d: %s" % (i + 1, group))
+        self._from_composition(comp)
+
+    def _from_sequence(self, sequence, aa_comp):
+        parsed_sequence = parser.parse(
+            sequence,
+            labels=aa_comp,
+            show_unmodified_termini=True)
+        self._from_parsed_sequence(parsed_sequence, aa_comp)
+
+    def _from_formula(self, formula):
+        if not re.match(_formula, formula):
+            raise PyteomicsError('Invalid formula: ' + formula)
+        for elem, isotope, number in re.findall(_atom, formula):
+            self[_make_isotope_string(elem, int(isotope) if isotope else 0)] += int(number) if number else 1
+
+    def _from_composition(self, comp):
+        for isotope_string, num_atoms in comp.items():
+            element_name, isotope_num = _parse_isotope_string(
+                isotope_string)
+
+            # Normalize explicitly undefined isotopes (e.g. X[0] becomes X).
+            self[_make_isotope_string(element_name, isotope_num)] = num_atoms
+
+    def __init__(self, *args, **kwargs):
+        """
+        A Composition object stores a chemical composition of a
+        substance. Basically it is a dict object, in which keys are the names
+        of chemical elements and values contain integer numbers of
+        corresponding atoms in a substance.
+
+        The main improvement over dict is that Composition objects allow
+        addition and subtraction.
+
+        A Composition object can be initialized with one of the
+        following arguments: formula, sequence, parsed_sequence or
+        split_sequence.
+
+        If none of these are specified, the constructor will look at the first
+        positional argument and try to build the object from it. Without
+        positional arguments, a Composition will be constructed directly from
+        keyword arguments.
+
+        If there's an ambiguity, i.e. the argument is both a valid sequence
+        and a formula (such as 'HCN'), it will be treated as a sequence. You
+        need to provide the 'formula' keyword to override this.
+
+        .. warning::
+
+            Be careful when supplying a list with a parsed sequence or a split
+            sequence as a keyword argument. It must be
+            obtained with enabled `show_unmodified_termini` option.
+            When supplying it as a positional argument, the option doesn't
+            matter, because the positional argument is always converted to
+            a sequence prior to any processing.
+
+        Parameters
+        ----------
+        formula : str, optional
+            A string with a chemical formula. All elements must be present in
+            `mass_data`.
+        sequence : str, optional
+            A polypeptide sequence string in modX notation.
+        parsed_sequence : list of str, optional
+            A polypeptide sequence parsed into a list of amino acids.
+        split_sequence : list of tuples of str, optional
+            A polypeptide sequence parsed into a list of tuples
+            (as returned by :py:func:`pyteomics.parser.parse` with
+            ``split=True``).
+        aa_comp : dict, optional
+            A dict with the elemental composition of the amino acids (the
+            default value is :py:data:`std_aa_comp`).
+        ion_comp : dict, optional
+            A dict with the relative elemental compositions of peptide ion
+            fragments (default is :py:data:`std_ion_comp`).
+        ion_type : str, optional
+            If specified, then the polypeptide is considered to be in the form
+            of the corresponding ion.
+        """
+        defaultdict.__init__(self, int)
+
+        aa_comp = kwargs.get('aa_comp', std_aa_comp)
+
+        kw_given = self._kw_sources.intersection(kwargs)
+        if len(kw_given) > 1:
+            raise PyteomicsError('Only one of {} can be specified!\n'
+                    'Given: {}'.format(', '.join(self._kw_sources),
+                        ', '.join(kw_given)))
+        elif kw_given:
+            kwa = kw_given.pop()
+            if kwa == 'formula':
+                self._from_formula(kwargs['formula'])
+            else:
+                getattr(self, '_from_' + kwa)(kwargs[kwa], aa_comp)
+
+        # can't build from kwargs
+        elif args:
+            if isinstance(args[0], dict):
+                self._from_composition(args[0])
+            elif isinstance(args[0], str):
+                try:
+                    self._from_sequence(args[0], aa_comp)
+                except PyteomicsError:
+                    try:
+                        self._from_formula(args[0])
+                    except PyteomicsError:
+                        raise PyteomicsError(
+                                'Could not create a Composition object from '
+                                'string: "{}": not a valid sequence or '
+                                'formula'.format(args[0]))
+            else:
+                try:
+                    self._from_sequence(parser.tostring(args[0], True), aa_comp)
+                except Exception:
+                    raise PyteomicsError('Could not create a Composition object'
+                            ' from `{}`. A Composition object must be '
+                            'specified by sequence, parsed or split sequence,'
+                            ' formula or dict.'.format(args[0]))
+        else:
+            self._from_composition(kwargs)
+
+        ion_comp = kwargs.get('ion_comp', std_ion_comp)
+        if 'ion_type' in kwargs:
+            self += ion_comp[kwargs['ion_type']]
+
+        # The charge is stored as the number of protons ('H+');
+        # passing `charge` in kwargs is deprecated.
+        charge = self['H+']
+        if 'charge' in kwargs:
+            if charge:
+                raise PyteomicsError('Charge is specified both by the number of protons and `charge` in kwargs')
+            else:
+                warnings.warn('charge and charge carrier should be specified when calling mass(). '
+                    'Support for charge in Composition.__init__ will be removed in a future version.',
+                    FutureWarning)
+                self['H+'] = kwargs['charge']
+
+    @classmethod
+    def _parse_carrier(cls, spec):
+        """Parse a charge carrier spec.
+        The spec syntax is: <formula>[+-][N]
+        <formula> is a chemical formula as supported by :py:meth:`_from_formula`.
+        [+-] is one of "+" or "-", N is a natural number (1 is assumed if omitted).
+        If both the sign and the charge are missing, the charge of this group can be
+        specified as the number of protons in `<formula>`. Otherwise, having protons
+        in `<formula>` is an error.
+
+        Returns
+        -------
+        out : tuple
+            Parsed :py:class:`Composition` and charge of the charge carrier.
+        """
+        if spec is None:
+            return cls({PROTON: 1}), 1
+        try:
+            formula, sign, charge = re.match(cls._carrier_spec, spec).groups()
+        except AttributeError:
+            raise PyteomicsError('Invalid charge carrier specification: ' + spec)
+        comp = cls(formula=formula)
+        if sign is not None and PROTON in comp:
+            raise PyteomicsError('Carrier contains protons and also has a charge specified.')
+        if sign is None:
+            # only formula is given
+            if PROTON not in comp:
+                charge = None
+            else:
+                charge = comp[PROTON]
+        elif charge is None:
+            charge = (-1, 1)[sign == '+']
+        else:
+            charge = int(charge) * (-1, 1)[sign == '+']
+        return comp, charge
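+
+    # Illustrative results of the spec syntax described above (traced through
+    # the parsing logic; not part of the original docstring):
+    #   _parse_carrier(None)   -> (Composition({'H+': 1}), 1)
+    #   _parse_carrier('NH3+') -> (Composition({'N': 1, 'H': 3}), 1)
+    #   _parse_carrier('Fe+2') -> (Composition({'Fe': 1}), 2)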
+
+    @staticmethod
+    def _mass_to_mz(mass, composition=None, **kwargs):
+        mass_data = kwargs.get('mass_data', nist_mass)
+        absolute = kwargs.get('absolute', True)
+        average = kwargs.get('average', False)
+
+        # Calculate m/z if required
+        charge = kwargs.get('charge')
+        if charge:
+            # get charge carrier mass and charge
+            charge_carrier = kwargs.get('charge_carrier')
+            ccharge = kwargs.get('carrier_charge')
+            if isinstance(charge_carrier, dict):
+                carrier_comp = Composition(charge_carrier)
+                if ccharge and PROTON in carrier_comp:
+                    raise PyteomicsError('`carrier_charge` specified but the charge carrier contains protons.')
+                carrier_charge = ccharge or carrier_comp[PROTON]
+                if not carrier_charge:
+                    raise PyteomicsError('Charge carrier charge not specified.')
+            else:
+                carrier_comp, carrier_charge = (composition or Composition)._parse_carrier(charge_carrier)
+                if carrier_charge and ccharge:
+                    raise PyteomicsError('Both `carrier_charge` and charge in carrier spec are given.')
+                carrier_charge = ccharge or carrier_charge
+                if not carrier_charge:
+                    raise PyteomicsError('Charge of the charge carrier group not specified.')
+            if charge % carrier_charge:
+                raise PyteomicsError('The `charge` must be a multiple of the carrier charge. Given: {} and {}'.format(
+                    charge, carrier_charge))
+            num = charge // carrier_charge
+            carrier_mass = carrier_comp.mass(mass_data=mass_data, average=average, charge=0)
+
+        if charge and (composition is None or not composition['H+']):
+            mass += carrier_mass * num
+        if charge and composition and composition['H+']:
+            raise PyteomicsError('Composition contains protons and charge is explicitly specified.')
+        if charge is None and composition and composition['H+']:
+            warnings.warn('Charge is not specified, but the Composition contains protons. Assuming m/z calculation.')
+            charge = composition['H+']
+        if charge:
+            mass /= charge
+        if charge and charge < 0 and absolute:
+            mass = abs(mass)
+        return mass
+
+    def mass(self, **kwargs):
+        """Calculate the mass or *m/z* of a :py:class:`Composition`.
+
+        Parameters
+        ----------
+        average : bool, optional
+            If :py:const:`True` then the average mass is calculated.
+            Note that mass is not averaged for elements with specified isotopes.
+            Default is :py:const:`False`.
+        charge : int, optional
+            If not 0 then m/z is calculated. See also: `charge_carrier`.
+        charge_carrier : str or dict, optional
+            Chemical group carrying the charge. Defaults to a proton, "H+".
+            If string, must be a chemical formula, as supported by the
+            :class:`Composition` `formula` argument,
+            except it must end with a charge formatted as "[+-][N]".
+            If N is omitted, single charge is assumed.
+            Examples of `charge_carrier`: "H+", "NH3+"
+            (here, 3 is part of the composition, and + is a single charge),
+            "Fe+2" ("Fe" is the formula and "+2" is the charge).
+            .. note :: `charge` must be a multiple of `charge_carrier` charge.
+
+            If dict, it is the atomic composition of the group.
+            In this case, the charge can be passed separately as `carrier_charge`
+            or it will be deduced from the number of protons in `charge_carrier`.
+        carrier_charge : int, optional
+            Charge of the charge carrier group (if `charge_carrier` is specified
+            as a composition dict).
+
+            .. note :: `charge` must be a multiple of `carrier_charge`.
+
+        mass_data : dict, optional
+            A dict with the masses of the chemical elements (the default
+            value is :py:data:`nist_mass`).
+        ion_comp : dict, optional
+            A dict with the relative elemental compositions of peptide ion
+            fragments (default is :py:data:`std_ion_comp`).
+        ion_type : str, optional
+            If specified, then the polypeptide is considered to be in the form
+            of the corresponding ion. Do not forget to specify the charge state!
+        absolute : bool, optional
+            If :py:const:`True` (default), the m/z value returned will always be positive,
+            even for negatively charged ions.
+
+            .. note ::
+                `absolute` only applies when `charge` is negative.
+                The mass can still be negative for negative compositions.
+
+        Returns
+        -------
+        mass : float
+        """
+        composition = self
+        mass_data = kwargs.get('mass_data', nist_mass)
+
+        # Calculate mass
+        mass = 0.0
+        average = kwargs.get('average', False)
+
+        for isotope_string, amount in composition.items():
+            element_name, isotope_num = _parse_isotope_string(isotope_string)
+            # Calculate average mass if required and the isotope number is
+            # not specified.
+            if (not isotope_num) and average:
+                for isotope, data in mass_data[element_name].items():
+                    if isotope:
+                        mass += (amount * data[0] * data[1])
+            else:
+                mass += (amount * mass_data[element_name][isotope_num][0])
+
+        return self._mass_to_mz(mass, self, **kwargs)
+
+
+std_aa_comp.update({
+    'A':   Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1}),
+    'C':   Composition({'H': 5, 'C': 3, 'S': 1, 'O': 1, 'N': 1}),
+    'D':   Composition({'H': 5, 'C': 4, 'O': 3, 'N': 1}),
+    'E':   Composition({'H': 7, 'C': 5, 'O': 3, 'N': 1}),
+    'F':   Composition({'H': 9, 'C': 9, 'O': 1, 'N': 1}),
+    'G':   Composition({'H': 3, 'C': 2, 'O': 1, 'N': 1}),
+    'H':   Composition({'H': 7, 'C': 6, 'N': 3, 'O': 1}),
+    'I':   Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
+    'J':   Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
+    'K':   Composition({'H': 12, 'C': 6, 'N': 2, 'O': 1}),
+    'L':   Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
+    'M':   Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1}),
+    'N':   Composition({'H': 6, 'C': 4, 'O': 2, 'N': 2}),
+    'P':   Composition({'H': 7, 'C': 5, 'O': 1, 'N': 1}),
+    'Q':   Composition({'H': 8, 'C': 5, 'O': 2, 'N': 2}),
+    'R':   Composition({'H': 12, 'C': 6, 'N': 4, 'O': 1}),
+    'S':   Composition({'H': 5, 'C': 3, 'O': 2, 'N': 1}),
+    'T':   Composition({'H': 7, 'C': 4, 'O': 2, 'N': 1}),
+    'V':   Composition({'H': 9, 'C': 5, 'O': 1, 'N': 1}),
+    'W':   Composition({'C': 11, 'H': 10, 'N': 2, 'O': 1}),
+    'Y':   Composition({'H': 9, 'C': 9, 'O': 2, 'N': 1}),
+    'U':   Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1, 'Se' : 1}),
+    'O':   Composition({'H': 19, 'C': 12, 'O': 2, 'N': 3}),
+    'H-':  Composition({'H': 1}),
+    '-OH': Composition({'O': 1, 'H': 1}),
+    })
+
+
+std_ion_comp.update({
+    'M':        Composition(formula=''),
+    'M-H2O':    Composition(formula='H-2O-1'),
+    'M-NH3':    Composition(formula='N-1H-3'),
+    'a':        Composition(formula='H-2O-1' + 'C-1O-1'),
+    'a-H2O':    Composition(formula='H-2O-1' + 'C-1O-1' + 'H-2O-1'),
+    'a-NH3':    Composition(formula='H-2O-1' + 'C-1O-1' + 'N-1H-3'),
+    'b':        Composition(formula='H-2O-1'),
+    'b-H2O':    Composition(formula='H-2O-1' + 'H-2O-1'),
+    'b-NH3':    Composition(formula='H-2O-1' + 'N-1H-3'),
+    'c':        Composition(formula='H-2O-1' + 'NH3'),
+    'c-1':      Composition(formula='H-2O-1' + 'NH3' + 'H-1'),
+    'c-dot':    Composition(formula='H-2O-1' + 'NH3' + 'H1'),
+    'c+1':      Composition(formula='H-2O-1' + 'NH3' + 'H1'),
+    'c+2':      Composition(formula='H-2O-1' + 'NH3' + 'H2'),
+    'c-H2O':    Composition(formula='H-2O-1' + 'NH3' + 'H-2O-1'),
+    'c-NH3':    Composition(formula='H-2O-1'),
+    'x':        Composition(formula='H-2O-1' + 'CO2'),
+    'x-H2O':    Composition(formula='H-2O-1' + 'CO2' + 'H-2O-1'),
+    'x-NH3':    Composition(formula='H-2O-1' + 'CO2' + 'N-1H-3'),
+    'y':        Composition(formula=''),
+    'y-H2O':    Composition(formula='H-2O-1'),
+    'y-NH3':    Composition(formula='N-1H-3'),
+    'z':        Composition(formula='H-2O-1' + 'ON-1H-1'),
+    'z-dot':    Composition(formula='H-2O-1' + 'ON-1'),
+    'z+1':      Composition(formula='H-2O-1' + 'ON-1H1'),
+    'z+2':      Composition(formula='H-2O-1' + 'ON-1H2'),
+    'z+3':      Composition(formula='H-2O-1' + 'ON-1H3'),
+    'z-H2O':    Composition(formula='H-2O-1' + 'ON-1H-1' + 'H-2O-1'),
+    'z-NH3':    Composition(formula='H-2O-1' + 'ON-1H-1' + 'N-1H-3'),
+    })
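+
+
+# A short usage sketch for Composition (illustrative, not part of the original
+# module; numeric values rounded):
+#
+#     Composition('PEPTIDE')          # {'H': 53, 'C': 34, 'O': 15, 'N': 7}
+#     Composition(formula='H2O').mass()                  # ~18.011 Da
+#     Composition(sequence='PEPTIDE').mass(charge=2)     # m/z of [M+2H]2+, ~400.687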
+
+
+def calculate_mass(*args, **kwargs):
+    """Calculates the monoisotopic mass of a polypeptide defined by a
+    sequence string, parsed sequence, chemical formula or
+    Composition object.
+
+    At most one of the following keyword arguments may be given:
+    **formula**, **sequence**, **parsed_sequence**, **split_sequence**
+    or **composition**.
+    All arguments given are used to create a :py:class:`Composition` object,
+    unless an existing one is passed as a keyword argument.
+
+    Note that if a sequence string is supplied and terminal groups are not
+    explicitly shown, then the mass is calculated for a polypeptide with
+    standard terminal groups (H- and -OH).
+
+    .. warning::
+
+        Be careful when supplying a list with a parsed sequence. It must be
+        obtained with enabled `show_unmodified_termini` option.
+
+    Parameters
+    ----------
+    formula : str, optional
+        A string with a chemical formula.
+    sequence : str, optional
+        A polypeptide sequence string in modX notation.
+    proforma : str, optional
+        A polypeptide sequence string in `ProForma notation <https://www.psidev.info/proforma>`_,
+        or a :py:class:`pyteomics.proforma.ProForma` object.
+    parsed_sequence : list of str, optional
+        A polypeptide sequence parsed into a list of amino acids.
+    composition : Composition, optional
+        A Composition object with the elemental composition of a substance.
+    aa_comp : dict, optional
+        A dict with the elemental composition of the amino acids (the
+        default value is :py:data:`std_aa_comp`).
+    average : bool, optional
+        If :py:const:`True` then the average mass is calculated. Note that mass
+        is not averaged for elements with specified isotopes. Default is
+        :py:const:`False`.
+    charge : int, optional
+        If not 0 then m/z is calculated: the mass is increased
+        by the corresponding number of proton masses and divided
+        by `charge`.
+    charge_carrier : str or dict, optional
+        Chemical group carrying the charge. Defaults to a proton, "H+".
+        If string, must be a chemical formula, as supported by the
+        :class:`Composition` `formula` argument,
+        except it must end with a charge formatted as "[+-][N]".
+        If N is omitted, single charge is assumed.
+        Examples of `charge_carrier`: "H+", "NH3+"
+        (here, 3 is part of the composition, and + is a single charge),
+        "Fe+2" ("Fe" is the formula and "+2" is the charge).
+
+        .. note ::
+            `charge` must be a multiple of `charge_carrier` charge.
+
+        If dict, it is the atomic composition of the group.
+        In this case, the charge can be passed separately as `carrier_charge`
+        or it will be deduced from the number of protons in `charge_carrier`.
+    carrier_charge : int, optional
+        Charge of the charge carrier group (if `charge_carrier` is specified
+        as a composition dict).
+
+        .. note ::
+            `charge` must be a multiple of `carrier_charge`.
+
+    mass_data : dict, optional
+        A dict with the masses of the chemical elements (the default
+        value is :py:data:`nist_mass`).
+    ion_comp : dict, optional
+        A dict with the relative elemental compositions of peptide ion
+        fragments (default is :py:data:`std_ion_comp`).
+    ion_type : str, optional
+        If specified, then the polypeptide is considered to be in the form
+        of the corresponding ion. Do not forget to specify the charge state!
+    absolute : bool, optional
+        If :py:const:`True` (default), the m/z value returned will always be positive,
+        even for negatively charged ions.
+
+        .. note ::
+            `absolute` only applies when `charge` is negative.
+            The mass can still be negative for negative compositions.
+
+    Returns
+    -------
+    mass : float
+    """
+    if 'proforma' in kwargs:
+        # do not try to create a composition
+        from .. import proforma
+        proteoform = kwargs.pop('proforma')
+        if isinstance(proteoform, str):
+            proteoform = proforma.ProForma.parse(proteoform)
+        return Composition._mass_to_mz(proteoform.mass, **kwargs)
+
+    # These parameters must be passed to mass(), not __init__
+    mass_kw = {}
+    for k in ['charge', 'charge_carrier', 'carrier_charge', 'absolute']:
+        if k in kwargs:
+            mass_kw[k] = kwargs.pop(k)
+    # Make a copy of `composition` keyword argument.
+    composition = (Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs))
+    kwargs.update(mass_kw)
+    return composition.mass(**kwargs)
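+
+
+# Illustrative calls (values rounded; 'PEPTIDE' is an arbitrary example
+# sequence):
+#
+#     calculate_mass(sequence='PEPTIDE')                # ~799.360 (monoisotopic)
+#     calculate_mass(sequence='PEPTIDE', charge=2)      # ~400.687 (m/z)
+#     calculate_mass(formula='C6H12O6', average=True)   # ~180.156 (average mass)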
+
+
+def most_probable_isotopic_composition(*args, **kwargs):
+    """Calculate the most probable isotopic composition of a peptide
+    molecule/ion defined by a sequence string, parsed sequence,
+    chemical formula or :py:class:`Composition` object.
+
+    Note that if a sequence string without terminal groups is supplied then the
+    isotopic composition is calculated for a polypeptide with standard
+    terminal groups (H- and -OH).
+
+    For each element, only the two most abundant isotopes are considered.
+
+    Parameters
+    ----------
+    formula : str, optional
+        A string with a chemical formula.
+    sequence : str, optional
+        A polypeptide sequence string in modX notation.
+    parsed_sequence : list of str, optional
+        A polypeptide sequence parsed into a list of amino acids.
+    composition : :py:class:`Composition`, optional
+        A :py:class:`Composition` object with the elemental composition of a
+        substance.
+    elements_with_isotopes : list of str, optional
+        A list of elements to be considered in isotopic distribution
+        (by default, every element has an isotopic distribution).
+    aa_comp : dict, optional
+        A dict with the elemental composition of the amino acids (the
+        default value is :py:data:`std_aa_comp`).
+    mass_data : dict, optional
+        A dict with the masses of chemical elements (the default
+        value is :py:data:`nist_mass`).
+    ion_comp : dict, optional
+        A dict with the relative elemental compositions of peptide ion
+        fragments (default is :py:data:`std_ion_comp`).
+
+    Returns
+    -------
+    out: tuple (Composition, float)
+        A tuple with the most probable isotopic composition and its
+        relative abundance.
+    """
+
+    composition = (dict(kwargs['composition']) if 'composition' in kwargs
+                   else Composition(*args, **kwargs))
+
+    # Removing isotopes from the composition.
+    for isotope_string in list(composition):
+        element_name, isotope_num = _parse_isotope_string(isotope_string)
+        if isotope_num:
+            composition[element_name] += composition.pop(isotope_string)
+
+    mass_data = kwargs.get('mass_data', nist_mass)
+    elements_with_isotopes = kwargs.get('elements_with_isotopes')
+    isotopic_composition = Composition()
+
+    for element_name in composition:
+        if not elements_with_isotopes or (element_name in elements_with_isotopes):
+            # Take the two most abundant isotopes.
+            first_iso, second_iso = sorted([(i[0], i[1][1]) for i in mass_data[element_name].items() if i[0]],
+                key=lambda x: -x[1])[:2]
+
+            # Write the number of isotopes of the most abundant type.
+            first_iso_str = _make_isotope_string(element_name, first_iso[0])
+            isotopic_composition[first_iso_str] = int(math.ceil(
+                composition[element_name])) * first_iso[1]
+
+            # Write the number of the second isotopes.
+            second_iso_str = _make_isotope_string(element_name, second_iso[0])
+            isotopic_composition[second_iso_str] = composition[element_name] - isotopic_composition[first_iso_str]
+        else:
+            isotopic_composition[element_name] = composition[element_name]
+
+    return (isotopic_composition,
+            isotopic_composition_abundance(composition=isotopic_composition, mass_data=mass_data))
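+
+# A usage sketch (illustrative): the first element of the returned tuple is a
+# Composition keyed by isotope strings such as 'C[12]'; the second is its
+# relative abundance.
+#
+#     comp, abundance = most_probable_isotopic_composition(formula='H2SO4')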
+
+
+def isotopic_composition_abundance(*args, **kwargs):
+    """Calculate the relative abundance of a given isotopic composition
+    of a molecule.
+
+    Parameters
+    ----------
+    formula : str, optional
+        A string with a chemical formula.
+    composition : Composition, optional
+        A Composition object with the isotopic composition of a substance.
+    mass_data : dict, optional
+        A dict with the masses of chemical elements (the default
+        value is :py:data:`nist_mass`).
+
+    Returns
+    -------
+    relative_abundance : float
+        The relative abundance of a given isotopic composition.
+    """
+
+    composition = (Composition(kwargs['composition'])
+                   if 'composition' in kwargs
+                   else Composition(*args, **kwargs))
+
+    isotopic_composition = defaultdict(dict)
+
+    # Check if there are default and non-default isotopes of the same
+    # element and rearrange the elements.
+    for element in composition:
+        element_name, isotope_num = _parse_isotope_string(element)
+
+        # If there is already an entry for this element and either it
+        # contains a default isotope or newly added isotope is default
+        # then raise an exception.
+        if (element_name in isotopic_composition) and (isotope_num == 0 or 0 in isotopic_composition[element_name]):
+            raise PyteomicsError(
+                'Please specify the isotopic states of all atoms of %s or do not specify them at all.' % element_name)
+        else:
+            isotopic_composition[element_name][isotope_num] = composition[element]
+
+    # Calculate relative abundance.
+    mass_data = kwargs.get('mass_data', nist_mass)
+    num1, num2, denom = 1, 1, 1
+    for element_name, isotope_dict in isotopic_composition.items():
+        num1 *= math.factorial(sum(isotope_dict.values()))
+        for isotope_num, isotope_content in isotope_dict.items():
+            denom *= math.factorial(isotope_content)
+            if isotope_num:
+                num2 *= mass_data[element_name][isotope_num][1] ** isotope_content
+
+    return num2 * (num1 / denom)
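+
+# Illustrative call: the relative abundance of doubly deuterated water
+# (value rounded; deuterium's natural abundance is ~1.15e-4):
+#
+#     isotopic_composition_abundance(formula='H[2]2O')   # ~1.3e-8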
+
+
+def isotopologues(*args, **kwargs):
+    """Iterate over possible isotopic states of a molecule.
+    The molecule can be defined by formula, sequence, parsed sequence, or composition.
+    The space of possible isotopic compositions is constrained by the
+    parameters ``elements_with_isotopes``, ``isotope_threshold``, ``overall_threshold``.
+
+    Parameters
+    ----------
+    formula : str, optional
+        A string with a chemical formula.
+    sequence : str, optional
+        A polypeptide sequence string in modX notation.
+    parsed_sequence : list of str, optional
+        A polypeptide sequence parsed into a list of amino acids.
+    composition : :py:class:`Composition`, optional
+        A :py:class:`Composition` object with the elemental composition of a
+        substance.
+    report_abundance : bool, optional
+        If :py:const:`True`, the output will contain 2-tuples: `(composition, abundance)`.
+        Otherwise, only compositions are yielded. Default is :py:const:`False`.
+    elements_with_isotopes : container of str, optional
+        A set of elements to be considered in isotopic distribution
+        (by default, every element has an isotopic distribution).
+    isotope_threshold : float, optional
+        The threshold abundance of a specific isotope to be considered.
+        Default is :py:const:`5e-4`.
+    overall_threshold : float, optional
+        The threshold abundance of the calculated isotopic composition.
+        Default is :py:const:`0`.
+    aa_comp : dict, optional
+        A dict with the elemental composition of the amino acids (the
+        default value is :py:data:`std_aa_comp`).
+    mass_data : dict, optional
+        A dict with the masses of chemical elements (the default
+        value is :py:data:`nist_mass`).
+
+    Returns
+    -------
+    out : iterator
+        Iterator over possible isotopic compositions.
+    """
+    iso_threshold = kwargs.pop('isotope_threshold', 5e-4)
+    overall_threshold = kwargs.pop('overall_threshold', 0.0)
+    mass_data = kwargs.get('mass_data', nist_mass)
+    elements_with_isotopes = kwargs.get('elements_with_isotopes')
+    report_abundance = kwargs.get('report_abundance', False)
+    composition = Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs)
+    other_kw = kwargs.copy()
+    for k in Composition._kw_sources:
+        other_kw.pop(k, None)
+
+    dict_elem_isotopes = {}
+    for element in composition:
+        if elements_with_isotopes is None or element in elements_with_isotopes:
+            element_name, isotope_num = _parse_isotope_string(element)
+            isotopes = {k: v for k, v in mass_data[element_name].items() if k != 0 and v[1] >= iso_threshold}
+            list_isotopes = [_make_isotope_string(element_name, k) for k in isotopes]
+            dict_elem_isotopes[element] = list_isotopes
+        else:
+            dict_elem_isotopes[element] = [element]
+    all_isotopologues = []
+    for element, list_isotopes in dict_elem_isotopes.items():
+        n = composition[element]
+        # All multisets of size n drawn from the allowed isotopes of this element.
+        all_isotopologues.append(list(combinations_with_replacement(list_isotopes, n)))
+
+    for isotopologue in product(*all_isotopologues):
+        ic = Composition(formula=''.join(atom for el in isotopologue for atom in el), **other_kw)
+        if report_abundance or overall_threshold > 0.0:
+            abundance = isotopic_composition_abundance(composition=ic, **other_kw)
+            if abundance > overall_threshold:
+                if report_abundance:
+                    yield (ic, abundance)
+                else:
+                    yield ic
+        else:
+            yield ic
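+
+# A usage sketch (illustrative, abundances rounded): enumerate methane
+# isotopologues above a threshold. With the default `isotope_threshold`,
+# deuterium (~1.15e-4) is excluded from consideration.
+#
+#     for ic, ab in isotopologues(formula='CH4', report_abundance=True,
+#                                 overall_threshold=1e-4):
+#         print(ic, ab)   # the all-light species C[12]H[1]4 dominates (~0.989)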
+
+
+std_aa_mass = {
+    'G': 57.02146372057,
+    'A': 71.03711378471,
+    'S': 87.03202840427001,
+    'P': 97.05276384885,
+    'V': 99.06841391299,
+    'T': 101.04767846841,
+    'C': 103.00918478471,
+    'L': 113.08406397713001,
+    'I': 113.08406397713001,
+    'J': 113.08406397713001,
+    'N': 114.04292744114001,
+    'D': 115.02694302383001,
+    'Q': 128.05857750527997,
+    'K': 128.09496301399997,
+    'E': 129.04259308796998,
+    'M': 131.04048491299,
+    'H': 137.05891185845002,
+    'F': 147.06841391298997,
+    'U': 150.95363508471,
+    'R': 156.10111102359997,
+    'Y': 163.06332853254997,
+    'W': 186.07931294985997,
+    'O': 237.14772686284996}
+"""A dictionary with monoisotopic masses of the twenty standard
+amino acid residues, selenocysteine and pyrrolysine.
+"""
+
+
+def fast_mass(sequence, ion_type=None, charge=None, **kwargs):
+    """Calculate monoisotopic mass of an ion using the fast
+    algorithm. May only be used if amino acid residues are given in
+    one-letter code.
+
+    Parameters
+    ----------
+    sequence : str
+        A polypeptide sequence string.
+    ion_type : str, optional
+        If specified, then the polypeptide is considered to be
+        in a form of corresponding ion. Do not forget to
+        specify the charge state!
+    charge : int, optional
+        If not 0 then m/z is calculated: the mass is increased
+        by the corresponding number of proton masses and divided
+        by z.
+    mass_data : dict, optional
+        A dict with the masses of chemical elements (the default
+        value is :py:data:`nist_mass`).
+    aa_mass : dict, optional
+        A dict with the monoisotopic masses of amino acid residues
+        (default is :py:data:`std_aa_mass`).
+    ion_comp : dict, optional
+        A dict with the relative elemental compositions of peptide ion
+        fragments (default is :py:data:`std_ion_comp`).
+
+    Returns
+    -------
+    mass : float
+        Monoisotopic mass or m/z of a peptide molecule/ion.
+    """
+    aa_mass = kwargs.get('aa_mass', std_aa_mass)
+    try:
+        mass = sum(aa_mass[i] for i in sequence)
+    except KeyError as e:
+        raise PyteomicsError('No mass data for residue: ' + e.args[0])
+
+    mass_data = kwargs.get('mass_data', nist_mass)
+    mass += mass_data['H'][0][0] * 2 + mass_data['O'][0][0]
+
+    if ion_type:
+        try:
+            icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type]
+        except KeyError:
+            raise PyteomicsError('Unknown ion type: {}'.format(ion_type))
+
+        mass += sum(mass_data[element][0][0] * num for element, num in icomp.items())
+
+    if charge:
+        mass = (mass + mass_data['H+'][0][0] * charge) / charge
+
+    return mass
+
+
+def fast_mass2(sequence, ion_type=None, charge=None, **kwargs):
+    """Calculate monoisotopic mass of an ion using the fast
+    algorithm. *modX* notation is fully supported.
+
+    Parameters
+    ----------
+    sequence : str
+        A polypeptide sequence string.
+    ion_type : str, optional
+        If specified, then the polypeptide is considered to be
+        in the form of the corresponding ion. Do not forget to
+        specify the charge state!
+    charge : int, optional
+        If not 0 then m/z is calculated: the mass is increased
+        by the corresponding number of proton masses and divided
+        by z.
+    mass_data : dict, optional
+        A dict with the masses of chemical elements (the default
+        value is :py:data:`nist_mass`).
+    aa_mass : dict, optional
+        A dict with the monoisotopic masses of amino acid residues
+        (default is :py:data:`std_aa_mass`).
+    ion_comp : dict, optional
+        A dict with the relative elemental compositions of peptide ion
+        fragments (default is :py:data:`std_ion_comp`).
+
+    Returns
+    -------
+    mass : float
+        Monoisotopic mass or m/z of a peptide molecule/ion.
+    """
+    aa_mass = kwargs.get('aa_mass', std_aa_mass)
+    mass_data = kwargs.get('mass_data', nist_mass)
+    try:
+        comp = parser.amino_acid_composition(sequence,
+                show_unmodified_termini=True,
+                allow_unknown_modifications=True,
+                labels=aa_mass)
+    except PyteomicsError:
+        raise PyteomicsError('Mass not specified for label(s): {}'.format(
+            ', '.join(set(parser.parse(sequence)).difference(aa_mass))))
+
+    try:
+        mass = 0
+        for aa, num in comp.items():
+            if aa in aa_mass:
+                mass += aa_mass[aa] * num
+            elif parser.is_term_mod(aa):
+                assert num == 1
+                mass += calculate_mass(formula=aa.strip('-'), mass_data=mass_data)
+            else:
+                mod, X = parser._split_label(aa)
+                mass += (aa_mass[mod] + aa_mass[X]) * num
+    except KeyError as e:
+        raise PyteomicsError('Unspecified mass for modification: "{}"'.format(e.args[0]))
+
+    if ion_type:
+        try:
+            icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type]
+        except KeyError:
+            raise PyteomicsError('Unknown ion type: {}'.format(ion_type))
+
+        mass += sum(mass_data[element][0][0] * num
+             for element, num in icomp.items())
+
+    if charge:
+        mass = (mass + mass_data['H+'][0][0] * charge) / charge
+
+    return mass
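+
+# Quick comparison of the two fast routines (illustrative, values rounded):
+#
+#     fast_mass('PEPTIDE')              # ~799.360, one-letter codes only
+#     fast_mass2('PEPTIDE', charge=1)   # ~800.367; modX labels (e.g. 'pT')
+#                                       # also work if 'p' is in `aa_mass`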
+
+
+class Unimod():
+    """A class for Unimod database of modifications.
+    The list of all modifications can be retrieved via `mods` attribute.
+    Methods for convenient searching are `by_title` and `by_name`.
+    For more elaborate filtering, iterate manually over the list.
+
+    .. note::
+        See :py:mod:`pyteomics.mass.unimod` for a new alternative class with
+        more features.
+    """
+
+    def __init__(self, source='http://www.unimod.org/xml/unimod.xml'):
+        """Create a database and fill it from XML file retrieved from `source`.
+
+        Parameters
+        ----------
+
+        source : str or file, optional
+            A file-like object or a URL to read from. Don't forget the ``'file://'``
+            prefix when pointing to local files.
+        """
+        from lxml import etree
+        from ..xml import _local_name
+
+        def process_mod(mod):
+            d = mod.attrib
+            new_d = {}
+            for key in ('date_time_modified', 'date_time_posted'):
+                new_d[key] = datetime.strptime(d.pop(key), '%Y-%m-%d %H:%M:%S')
+            comp = Composition()
+            for delta in self._xpath('delta', mod):  # there is a single 'delta' per mod
+                for key in ('avge_mass', 'mono_mass'):
+                    new_d[key] = float(delta.attrib.pop(key))
+                for elem in self._xpath('element', delta):
+                    e_d = elem.attrib
+                    amount = int(e_d.pop('number'))
+                    label = e_d.pop('symbol')
+                    isotope, symbol = re.match(r'^(\d*)(\D+)$', label).groups()
+                    if not isotope:
+                        isotope = 0
+                    else:
+                        isotope = int(isotope)
+                    comp += Composition(formula=_make_isotope_string(symbol, isotope), mass_data=self._massdata) * amount
+            new_d['composition'] = comp
+            new_d['record_id'] = int(d.pop('record_id'))
+            new_d['approved'] = d.pop('approved') == '1'
+            new_d.update(d)
+            spec = []
+            for sp in self._xpath('specificity', mod):
+                sp_d = sp.attrib
+                sp_new_d = {}
+                sp_new_d['hidden'] = (sp_d.pop('hidden') == '1')
+                sp_new_d['spec_group'] = int(sp_d.pop('spec_group'))
+                sp_new_d.update(sp_d)
+                notes = []
+                for note in self._xpath('*', sp):
+                    if note.text and note.text.strip():
+                        notes.append(note.text.strip())
+                if notes:
+                    sp_new_d['note'] = '\n'.join(notes)
+                spec.append(sp_new_d)
+            new_d['specificity'] = spec
+
+            alt_names = []
+            for alt_name in self._xpath('alt_name', mod):
+                alt_names.append(alt_name.text)
+            if alt_names:
+                new_d['alt_names'] = alt_names
+
+            refs = []
+            for ref in self._xpath('xref', mod):
+                ref_d = {}
+                for sub in ref.iterchildren():
+                    ref_d[_local_name(sub)] = sub.text
+                for key in ('text', 'source', 'url'):
+                    if key not in ref_d:
+                        ref_d[key] = None
+                refs.append(ref_d)
+            new_d['refs'] = refs
+            return new_d
+
+        if isinstance(source, str):
+            self._tree = etree.parse(urlopen(source))
+        else:
+            self._tree = etree.parse(source)
+        self._massdata = self._mass_data()
+        self._mods = []
+        self._id = {}
+        for i, mod in enumerate(self._xpath('/unimod/modifications/mod')):
+            mod_dict = process_mod(mod)
+            self._mods.append(mod_dict)
+            self._id[mod_dict['record_id']] = i
+
+    def _xpath(self, path, element=None):
+        from ..xml import xpath
+        if element is None:
+            return xpath(self._tree, path, 'umod')
+        return xpath(element, path, 'umod')
+
+    def _mass_data(self):
+        massdata = defaultdict(dict)
+        elements = [x.attrib for x in self._xpath('/unimod/elements/elem')]
+        avg = {}
+        for elem in elements:
+            i, label = re.match(r'^(\d*)(\D+)$', elem['title']).groups()
+            if not i:
+                iso = 0
+            else:
+                iso = int(i)
+            massdata[label][iso] = (float(elem['mono_mass']), float(iso == 0))
+            if not iso:
+                avg[label] = float(elem['avge_mass'])
+        for elem, isotopes in massdata.items():
+            isotopes[int(round(isotopes[0][0]))] = isotopes[0]
+            if len(isotopes) == 3:
+                m1, m2 = (x[1][0] for x in sorted(isotopes.items())[1:])
+                m_avg = avg[elem]
+                a = (m2 - m_avg) / (m2 - m1)
+                b = (m_avg - m1) / (m2 - m1)
+                for state, abundance in zip(sorted(isotopes)[1:], (a, b)):
+                    isotopes[state] = (isotopes[state][0], abundance)
+        return massdata
+
+    @property
+    def mods(self):
+        """Get the list of Unimod modifications"""
+        return self._mods
+
+    @property
+    def mass_data(self):
+        """Get element mass data extracted from the database"""
+        return self._massdata
+
+    def by_title(self, title, strict=True):
+        """Search modifications by title. If a single modification is found,
+        it is returned. Otherwise, a list will be returned.
+
+        Parameters
+        ----------
+        title : str
+            The modification title.
+        strict : bool, optional
+            If :py:const:`False`, the search will return all modifications
+            whose title **contains** `title`, otherwise equality is required.
+            :py:const:`True` by default.
+
+        Returns
+        -------
+        out : dict or list
+            A single modification or a list of modifications.
+        """
+        f = {True: operator.eq, False: operator.contains}
+        func = f[strict]
+        result = [m for m in self._mods if func(m['title'], title)]
+        if len(result) == 1:
+            return result[0]
+        return result
+
+    def by_name(self, name, strict=True):
+        """Search modifications by name. If a single modification is found,
+        it is returned. Otherwise, a list will be returned.
+
+        Parameters
+        ----------
+        name : str
+            The full name of the modification(s).
+        strict : bool, optional
+            If :py:const:`False`, the search will return all modifications
+            whose full name **contains** `name`, otherwise equality is
+            required. :py:const:`True` by default.
+
+        Returns
+        -------
+        out : dict or list
+            A single modification or a list of modifications.
+        """
+        f = {True: operator.eq, False: operator.contains}
+        func = f[strict]
+        result = [m for m in self._mods if func(m['full_name'], name)]
+        if len(result) == 1:
+            return result[0]
+        return result
+
+    def by_id(self, i):
+        """Search modifications by record ID. If a modification is found,
+        it is returned. Otherwise, :py:exc:`KeyError` is raised.
+
+        Parameters
+        ----------
+        i : int or str
+            The Unimod record ID.
+
+        Returns
+        -------
+        out : dict
+            A single modification dict.
+        """
+        if isinstance(i, str):
+            i = int(i)
+        return self._mods[self._id[i]]
+
+    __getitem__ = by_id
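+
+
+# A usage sketch for the legacy Unimod class (illustrative; constructing it
+# downloads unimod.xml unless a local source is given; mass value rounded):
+#
+#     db = Unimod()   # or Unimod('file:///path/to/unimod.xml')
+#     db.by_title('Phospho')['mono_mass']   # ~79.966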
+
+
+def neutral_mass(mz, z, charge_carrier=_nist_mass[PROTON][0][0]):
+    """Calculate the neutral mass of an ion from its m/z, its charge `z` and
+    the mass of the charge carrier (a proton by default)."""
+    return (mz * abs(z)) - (z * charge_carrier)
+
+
+def mass_charge_ratio(neutral_mass, z, charge_carrier=_nist_mass[PROTON][0][0]):
+    """Calculate m/z of an ion from its neutral mass, its charge `z` and
+    the mass of the charge carrier (a proton by default)."""
+    return (neutral_mass + (z * charge_carrier)) / abs(z)
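+
+
+# Round-trip sketch for the two helpers above (values rounded):
+#
+#     mz = mass_charge_ratio(799.360, 2)   # ~400.687
+#     neutral_mass(mz, 2)                  # ~799.360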
diff --git a/pyteomics/mass/unimod.py b/pyteomics/mass/unimod.py
new file mode 100644
index 0000000000000000000000000000000000000000..471d00ee9c349d6c194ae2db95aa9752d2293dd6
--- /dev/null
+++ b/pyteomics/mass/unimod.py
@@ -0,0 +1,798 @@
+"""
+unimod - interface to the Unimod database
+=========================================
+
+This module provides an interface to the relational Unimod database.
+The main class is :py:class:`Unimod`.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml` and :py:mod:`sqlalchemy`.
+"""
+
+#   Copyright 2015 Joshua Klein, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import re
+
+from lxml import etree
+from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
+from sqlalchemy.orm import relationship, backref, object_session
+from sqlalchemy.ext.associationproxy import association_proxy
+from sqlalchemy import (Numeric, Unicode,
+                        Column, Integer, ForeignKey,
+                        UnicodeText, Boolean, event)
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from . import mass
+
+model_registry = set()
+
+
+class SubclassRegisteringDeclarativeMeta(DeclarativeMeta):
+    def __new__(cls, name, parents, attrs):
+        new_type = super(SubclassRegisteringDeclarativeMeta,
+                         cls).__new__(cls, name, parents, attrs)
+        model_registry.add(new_type)
+        return new_type
+
+
+Base = declarative_base(metaclass=SubclassRegisteringDeclarativeMeta)
+
+_unimod_xml_download_url = 'http://www.unimod.org/xml/unimod_tables.xml'
+
+try:
+    basestring
+except NameError:
+    # Python 3: provide a Python 2-compatible base string type.
+    basestring = (str, bytes)
+
+
+CompositionType = mass.Composition
+
+
+def simple_repr(self):  # pragma: no cover
+    template = '{self.__class__.__name__}({d})'
+    d = {'%s=%r' % (k, v) for k, v in self.__dict__.items() if not k.startswith('_')}
+    return template.format(self=self, d=', '.join(d))
+
+Base.__repr__ = simple_repr
+
+
+def remove_namespace(doc, namespace):
+    """Remove namespace in the passed document in place."""
+    ns = u'{%s}' % namespace
+    nsl = len(ns)
+    for elem in doc.getiterator():
+        if elem.tag.startswith(ns):
+            elem.tag = elem.tag[nsl:]
+
+
+def preprocess_xml(doc_path):
+    """
+    Parse and drop namespaces from an XML document.
+
+    Parameters
+    ----------
+    doc_path : str
+
+    Returns
+    -------
+    out : etree.ElementTree
+    """
+    tree = etree.parse(doc_path)
+    root = tree.getroot()
+    for ns in root.nsmap.values():
+        remove_namespace(tree, ns)
+    return tree
+
+
+def _formula_parser(formula, session):
+    """
+    Parse a unimod formula composed of elements,
+    isotopes, and other bricks.
+
+    In order to look up a Brick's composition, this
+    function must have access to a session.
+
+    Parameters
+    ----------
+    formula : str
+        A Unimod formula of the form `A(n) B(m)...`,
+        where A, B, ... are element or brick names and
+        (n), (m), ... are parenthesized, possibly signed integers;
+        an omitted count is interpreted as 1
+    session : Session
+        An active SQLAlchemy session for looking up bricks in the database
+
+    Returns
+    -------
+    out : CompositionType
+    """
+    composition = CompositionType()
+    for token in formula.split(' '):
+        match = re.search(r'(?P<isotope>\d+)?(?P<element>[^\(]+)(?:\((?P<count>-?\d+)\))?', token)
+        if match:
+            isotope, element, count = match.groups()
+            if count is not None:
+                count = int(count)
+            else:
+                count = 1
+            if isotope is not None:
+                name = mass._make_isotope_string(element, isotope)
+            else:
+                name = element
+            is_brick = session.query(Brick).filter(Brick.brick == name).first()
+            if is_brick is None:
+                composition[name] += count
+            else:
+                composition += is_brick.composition * count
+    return composition
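+
+# Illustrative inputs for the formula grammar above (assumes `session` is an
+# open SQLAlchemy session over a populated Unimod database, and that plain
+# element tokens fall through the brick lookup):
+#
+#     _formula_parser('H(4) C(2) O', session)    # -> Composition({'H': 4, 'C': 2, 'O': 1})
+#     _formula_parser('13C(2) C(-2)', session)   # isotopes and signed counts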
+
+
+def _composition_listener(attr):
+    """
+    Attach event listeners to an InstrumentedAttribute
+    to trigger formula parsing on load and on change.
+    """
+    @event.listens_for(attr, 'set')
+    def _update_composition_from_formula(target, value, oldvalue, initiator):
+        session = object_session(target)
+        if value == '' or value is None:
+            return
+        # If the object hasn't been associated with a session,
+        # we can't look up bricks.
+        if session is None:
+            return
+        target.composition = _formula_parser(value, session)
+
+    @event.listens_for(attr.class_, 'load')
+    def _update_composition_on_load(target, context):
+        value = getattr(target, attr.prop.key)
+        if value == '' or value is None:
+            return
+        session = object_session(target)
+        target.composition = _formula_parser(value, session)
+
+
+def has_composition(attr_name):
+    """
+    A decorator to simplify flagging a Model with a column
+    to be treated as a formula for parsing. Calls :func:`_composition_listener`
+    internally.
+    """
+    def decorator(model):
+        _composition_listener(getattr(model, attr_name))
+        return model
+    return decorator
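+
+# Usage sketch (hypothetical model, for illustration only): flag a formula
+# column so that assigning or loading it triggers composition parsing.
+#
+#     @has_composition('formula')
+#     class HypotheticalBrickLike(Base):
+#         __tablename__ = 'HypotheticalBrickLike'
+#         id = Column(Integer, primary_key=True)
+#         formula = Column(UnicodeText)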
+
+
+class HasFullNameMixin(object):
+    """
+    A simple mixin to standardize equality operators
+    for models with a :attr:`full_name` attribute.
+    """
+    def __eq__(self, other):
+        try:
+            return self.full_name == other.full_name
+        except AttributeError:
+            return False
+
+    def __ne__(self, other):
+        return not self == other
+
+
+class AlternativeName(Base):
+    __tablename__ = 'AlternativeName'
+
+    _tag_name = 'alt_names_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            alt_name=attrib['alt_name'],
+            modification_id=int(attrib['mod_key'])
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    alt_name = Column(Unicode(256), index=True)
+    modification_id = Column(Integer, ForeignKey('Modification.id'), index=True)
+
+
+class AminoAcid(Base, HasFullNameMixin):
+    __tablename__ = 'AminoAcid'
+
+    _tag_name = 'amino_acids_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            full_name=attrib['full_name'],
+            one_letter=attrib['one_letter'],
+            three_letter=attrib['three_letter'],
+            num_H=int(attrib['num_H']),
+            num_O=int(attrib['num_O']),
+            num_C=int(attrib['num_C']),
+            num_N=int(attrib['num_N']),
+            num_S=int(attrib['num_S']),
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    num_H = Column(Integer)
+    num_O = Column(Integer)
+    num_C = Column(Integer)
+    num_N = Column(Integer)
+    num_S = Column(Integer)
+    full_name = Column(Unicode(25), index=True)
+    one_letter = Column(Unicode(10), index=True)
+    three_letter = Column(Unicode(10), index=True)
+
+
+class Classification(Base):
+    __tablename__ = 'Classification'
+
+    _tag_name = 'classifications_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            classification=attrib['classification']
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    classification = Column(Unicode(30), index=True)
+
+
+class Position(Base):
+    __tablename__ = 'Position'
+
+    _tag_name = 'positions_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            position=attrib['position']
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    position = Column(Unicode(20), index=True)
+
+
+class Brick(Base, HasFullNameMixin):
+    __tablename__ = 'Brick'
+
+    _tag_name = 'bricks_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            brick=attrib['brick'],
+            full_name=attrib['full_name']
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    brick = Column(Unicode(64), index=True)
+    full_name = Column(Unicode(128), index=True)
+
+    elements = relationship('BrickToElement')
+
+    @property
+    def composition(self):
+        composition = CompositionType()
+        for element_relation in self.elements:
+            symbol = element_relation.element
+            isotope, element = re.search(r'(?P<isotope>\d+)?(?P<element>\S+)', symbol).groups()
+            if isotope:
+                isotope = int(isotope)
+                iso_str = mass._make_isotope_string(element, isotope)
+            else:
+                iso_str = element
+            count = element_relation.count
+            composition[iso_str] = count
+        return composition
+
+
+class Fragment(Base):
+    __tablename__ = 'Fragment'
+
+    _tag_name = 'fragments_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            modification_id=int(attrib['mod_key'])
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    modification_id = Column(Integer, ForeignKey('Modification.id'), index=True)
+
+    _fragment_composition = relationship('FragmentComposition')
+
+    @property
+    def composition(self):
+        composition = CompositionType()
+        session = object_session(self)
+        for fragment_composition_relation in self._fragment_composition:
+            symbol = fragment_composition_relation.brick_string
+            isotope, element = re.search(r'(?P<isotope>\d+)?(?P<element>\S+)', symbol).groups()
+            count = fragment_composition_relation.count
+            if count is not None:
+                count = int(count)
+            else:
+                count = 1
+            if isotope:
+                name = mass._make_isotope_string(element, isotope)
+            else:
+                name = element
+            is_brick = session.query(Brick).filter(Brick.brick == name).first()
+            if is_brick is None:
+                composition[name] += count
+            else:
+                composition += is_brick.composition * count
+        return composition
+
+
+class FragmentComposition(Base):
+    __tablename__ = 'FragmentComposition'
+
+    _tag_name = 'fragment_comp_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            brick_string=attrib['brick'],
+            fragment_id=int(attrib['fragments_key']),
+            count=int(attrib['num_brick'])
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    brick_string = Column(Unicode(64), ForeignKey(Brick.brick), index=True)
+    fragment_id = Column(Integer, ForeignKey('Fragment.id'), index=True)
+    count = Column(Integer)
+
+
+class ModificationToBrick(Base):
+    __tablename__ = 'ModificationToBrick'
+
+    _tag_name = 'mod2brick_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            brick_string=(attrib['brick']),
+            modification_id=int(attrib['mod_key']),
+            count=int(attrib['num_brick'])
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    brick_string = Column(Unicode(64), ForeignKey(Brick.brick), index=True)
+    modification_id = Column(Integer, ForeignKey('Modification.id'), index=True)
+    count = Column(Integer)
+
+
+class BrickToElement(Base):
+    __tablename__ = 'BrickToElement'
+
+    _tag_name = 'brick2element_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            brick_id=int(attrib['brick_key']),
+            count=int(attrib['num_element']),
+            element=attrib['element']
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    brick_id = Column(Integer, ForeignKey(Brick.id), index=True)
+    element = Column(Unicode(16), ForeignKey('Element.element'), index=True)
+    element_obj = relationship('Element', uselist=False)
+    count = Column(Integer)
+
+
+class Element(Base, HasFullNameMixin):
+    __tablename__ = 'Element'
+
+    _tag_name = 'elements_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            average_mass=float(attrib['avge_mass']),
+            monoisotopic_mass=float(attrib['mono_mass']),
+            full_name=attrib['full_name'],
+            element=attrib['element']
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    average_mass = Column(Numeric(12, 6, asdecimal=False))
+    monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False))
+    full_name = Column(Unicode(64), index=True)
+    element = Column(Unicode(16), index=True)
+
+
+@has_composition('_composition')
+class Modification(Base, HasFullNameMixin):
+    __tablename__ = 'Modification'
+
+    _tag_name = 'modifications_row'
+
+    id = Column(Integer, primary_key=True)
+    username_of_poster = Column(Unicode(128))
+    average_mass = Column(Numeric(12, 6, asdecimal=False), index=True)
+    ex_code_name = Column(Unicode(64), index=True)
+    monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False), index=True)
+    full_name = Column(Unicode(128), index=True)
+    code_name = Column(Unicode(128), index=True)
+    _composition = Column(Unicode(128), index=True)
+    approved = Column(Boolean, index=True)
+
+    notes = relationship('MiscNotesModifications')
+    specificities = relationship('Specificity')
+    bricks = relationship(ModificationToBrick)
+    _fragments = relationship(Fragment)
+
+    _alt_names = relationship(AlternativeName, backref=backref('modification'))
+    # Maps the list of AlternativeName instances loaded dynamically from _alt_names
+    # into a list of plain strings, since the AlternativeName type contains no
+    # additional information.
+    alternative_names = association_proxy('_alt_names', 'alt_name')
+    fragments = association_proxy('_fragments', 'composition')
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            username_of_poster=attrib['username_of_poster'],
+            average_mass=float(attrib['avge_mass']),
+            monoisotopic_mass=float(attrib['mono_mass']),
+            ex_code_name=attrib['ex_code_name'],
+            code_name=attrib['code_name'],
+            full_name=attrib['full_name'],
+            approved=bool(int(attrib['approved'])),
+            _composition=attrib['composition']
+            )
+        for note in tag:
+            if note.tag == MiscNotesModifications._tag_name:
+                model_note = MiscNotesModifications._from_tag(note, inst.id)
+                if model_note is not None:
+                    inst.notes.append(model_note)
+        return inst
+
+
+class MiscNotesModifications(Base):
+    __tablename__ = 'MiscNotesModifications'
+    _tag_name = 'misc_notes'
+
+    id = Column(Integer, primary_key=True)
+    modification_id = Column(Integer, ForeignKey(Modification.id), index=True)
+    text = Column(UnicodeText)
+
+    @classmethod
+    def _from_tag(cls, tag, modification_id):
+        if tag.text is None:
+            return
+        return cls(text=tag.text, modification_id=modification_id)
+
+
+class Specificity(Base):
+    __tablename__ = 'Specificity'
+
+    _tag_name = 'specificity_row'
+
+    id = Column(Integer, primary_key=True)
+    position_id = Column(Integer, ForeignKey(Position.id), index=True)
+    classification_id = Column(Integer, ForeignKey(Classification.id), index=True)
+    classification = relationship('Classification', uselist=False)
+    # Map through one_letter
+    amino_acid = Column(Unicode(10), ForeignKey(AminoAcid.one_letter), index=True)
+    modification_id = Column(Integer, ForeignKey(Modification.id), index=True)
+    hidden = Column(Boolean, index=True)
+    group = Column(Integer, index=True)
+    neutral_losses = relationship('SpecificityToNeutralLoss')
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            position_id=int(attrib['position_key']),
+            classification_id=int(attrib['classifications_key']),
+            hidden=bool(int(attrib['hidden'])),
+            amino_acid=attrib['one_letter'],
+            modification_id=int(attrib['mod_key']),
+            )
+        return inst
+
+
+class NeutralLoss(Base):
+    __tablename__ = 'NeutralLoss'
+
+    _tag_name = 'neutral_losses_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            brick_string=(attrib['brick']),
+            count=int(attrib['num_brick']),
+            specificity_id=int(attrib['spec_key'])
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    brick_string = Column(Unicode(64), index=True)
+    specificity_id = Column(Integer, ForeignKey(Specificity.id), index=True)
+    count = Column(Integer)
+
+
+@has_composition('_composition')
+class SpecificityToNeutralLoss(Base):
+    __tablename__ = 'SpecificityToNeutralLoss'
+
+    _tag_name = 'spec2nl_row'
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls(
+            id=int(attrib['record_id']),
+            specificity_id=int(attrib['spec_key']),
+            monoisotopic_mass=float(attrib['nl_mono_mass']),
+            average_mass=float(attrib['nl_avge_mass']),
+            is_required_peptide_neutral_loss=bool(int(attrib['is_req_pep_nl'])),
+            is_peptide_neutral_loss=bool(int(attrib['is_pep_nl'])),
+            is_slave=bool(int(attrib['is_slave_nl'])),
+            _composition=attrib['nl_composition']
+            )
+        return inst
+
+    id = Column(Integer, primary_key=True)
+    specificity_id = Column(Integer, ForeignKey(Specificity.id), index=True)
+    specificity = relationship(Specificity, uselist=False)
+    monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False), index=True)
+    average_mass = Column(Numeric(12, 6, asdecimal=False), index=True)
+    _composition = Column(Unicode(128))
+    is_slave = Column(Boolean, index=True)
+    is_peptide_neutral_loss = Column(Boolean, index=True)
+    is_required_peptide_neutral_loss = Column(Boolean, index=True)
+
+
+class CrossreferenceSource(Base):
+    __tablename__ = 'CrossreferenceSource'
+    _tag_name = 'xref_sources_row'
+
+    id = Column(Integer, primary_key=True)
+    source = Column(Unicode(64), index=True)
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls()
+        inst.id = int(attrib['record_id'])
+        inst.source = attrib['xref_source']
+        return inst
+
+
+class Crossreference(Base):
+    __tablename__ = 'Crossreference'
+
+    _tag_name = 'xrefs_row'
+
+    id = Column(Integer, primary_key=True)
+    source_id = Column(Integer, ForeignKey(CrossreferenceSource.id), index=True)
+    source = relationship(CrossreferenceSource, uselist=False)
+    url = Column(Unicode(128))
+    modification_id = Column(Integer, ForeignKey(Modification.id), index=True)
+    text = Column(UnicodeText)
+
+    @classmethod
+    def from_tag(cls, tag):
+        attrib = tag.attrib
+        inst = cls()
+        inst.id = int(attrib['record_id'])
+        inst.url = attrib['xref_url']
+        inst.source_id = int(attrib['xref_source_key'])
+        inst.modification_id = int(attrib['mod_key'])
+        text = []
+        for node in tag:
+            if node.tag == 'xref_text':
+                if node.text is not None:
+                    text.append(node.text)
+        inst.text = '\n'.join(text)
+        return inst
+
+
+def load(doc_path, output_path='sqlite://'):
+    """
+    Parse the relational table-like XML file provided by http://www.unimod.org/downloads.html
+    and convert each <tag>_row into an equivalent database entry.
+
+    By default the table will be held in memory.
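+
+    For example, to persist the parsed tables in a file-backed SQLite
+    database (both paths here are hypothetical):
+
+    >>> session = load('unimod.xml', 'sqlite:///unimod.db')  # doctest: +SKIP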
+    """
+    tree = preprocess_xml(doc_path)
+    engine = create_engine(output_path)
+    Base.metadata.create_all(engine)
+    session = sessionmaker(bind=engine, autoflush=False)()
+    for model in model_registry:
+        if hasattr(model, '_tag_name') and hasattr(model, 'from_tag'):
+            for tag in tree.iterfind('.//' + model._tag_name):
+                session.add(model.from_tag(tag))
+        session.commit()
+    return session
+
+
+def session(path='sqlite:///unimod.db'):
+    engine = create_engine(path)
+    Base.metadata.create_all(engine)
+    session = sessionmaker(bind=engine, autoflush=False)()
+    return session
+
+
+class Unimod(object):
+    """
+    Main class representing the relational Unimod database.
+
+    Examples
+    --------
+
+    If you just wish to get a new copy of the data and store it in a temporary
+    in-memory database, invoking the type without parameters works without issue.
+
+    >>> new_db = Unimod()
+
+    If you want to persist a snapshot of the Unimod database to disk and query it
+    from there, or to re-use a previously downloaded database copy, pass a database
+    driver prefixed path:
+
+    >>> reused_db = Unimod("sqlite:///path/to/unimod.db")
+
+    If the path did not previously exist, a new copy of Unimod will be downloaded
+    and stored there on first use, and will be immediately available on subsequent
+    uses.
+    """
+    def __init__(self, path=None):
+        """
+        Initialize the object from a database file.
+
+        Parameters
+        ----------
+        path : str or None, optional
+            If :py:class:`str`, should point to a database.
+            Use a dialect-specific prefix, like ``'sqlite://'``.
+            If :py:const:`None` (default), the relational
+            XML file will be downloaded from the default location.
+        """
+        if path is None:
+            self.path = None
+            self.session = load(_unimod_xml_download_url)
+        else:
+            self.path = path
+            try:
+                self.session = session(path)
+                if self.session.query(Modification).first() is None:
+                    raise ValueError('the database contains no modifications')
+            except Exception:
+                # Database may not yet exist at that location
+                self.session = load(_unimod_xml_download_url, path)
+                self.session.query(Modification).first()
+
+    def get(self, identifier, strict=True):
+        """
+        Get a modification matching `identifier`.
+        Replaces both the :py:meth:`by_name` and :py:meth:`by_title` methods
+        of the old class.
+
+        Parameters
+        ----------
+        identifier : str or int
+            A modification name or record ID.
+
+        strict : bool, optional
+            If :py:const:`True` (default), the name must match exactly;
+            otherwise, substring (LIKE) matching is used.
+
+        Returns
+        -------
+        out : Modification
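+
+        Examples
+        --------
+        A sketch of both lookup modes (record IDs and names depend on the
+        Unimod snapshot in use):
+
+        >>> db = Unimod()                               # doctest: +SKIP
+        >>> db.get(4).code_name                         # doctest: +SKIP
+        'Carbamidomethyl'
+        >>> db.get('carbamido', strict=False) == db[4]  # doctest: +SKIP
+        True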
+        """
+        if isinstance(identifier, int):
+            mod = self.session.query(Modification).get(identifier)
+            if mod is None:
+                raise KeyError(identifier)
+            return mod
+        elif isinstance(identifier, basestring):
+            if strict:
+                mod = self.session.query(Modification).filter(
+                    (Modification.full_name == identifier) |
+                    (Modification.code_name == identifier) |
+                    (Modification.ex_code_name == identifier)).first()
+                if mod is None:
+                    alt_name = self.session.query(AlternativeName).filter(
+                        AlternativeName.alt_name == identifier).first()
+                    if alt_name is None:
+                        raise KeyError(identifier)
+                    mod = alt_name.modification
+                return mod
+            else:
+                qname = '%%%s%%' % identifier
+                mod = self.session.query(Modification).filter(
+                    (Modification.full_name.like(qname)) |
+                    (Modification.code_name.like(qname)) |
+                    (Modification.ex_code_name.like(qname))).first()
+                if mod is None:
+                    alt_name = self.session.query(AlternativeName).filter(
+                        AlternativeName.alt_name.like(qname)).first()
+                    if alt_name is None:
+                        raise KeyError(identifier)
+                    mod = alt_name.modification
+                return mod
+
+    by_title = by_name = get
+
+    __getitem__ = get
+
+    @property
+    def mods(self):
+        return self.session.query(Modification).all()
+
+    def __iter__(self):
+        return iter(self.session.query(Modification).yield_per(1000))
+
+    def query(self, *args):
+        '''Compose an SQL query using SQLAlchemy's ORM interface.
+
+        See :mod:`sqlalchemy`'s Session documentation for more details.
+        '''
+        return self.session.query(*args)
+
+    def execute(self, *args, **kwargs):
+        '''Execute an SQLAlchemy statement or a SQL string against the database,
+        returning the resulting database cursor.
+
+        See :mod:`sqlalchemy`'s Session documentation for more details.
+        '''
+        return self.session.execute(*args, **kwargs)
diff --git a/pyteomics/mgf.py b/pyteomics/mgf.py
new file mode 100644
index 0000000000000000000000000000000000000000..811e43ebd65f90465707cb221048e6ab611092fb
--- /dev/null
+++ b/pyteomics/mgf.py
@@ -0,0 +1,830 @@
+"""
+mgf - read and write MS/MS data in Mascot Generic Format
+========================================================
+
+Summary
+-------
+
+`MGF <http://www.matrixscience.com/help/data_file_help.html>`_ is a simple
+human-readable format for MS/MS data. It allows storing MS/MS peak lists and
+experimental parameters.
+
+This module provides classes and functions for access to data stored in
+MGF files.
+Parsing is done using :py:class:`MGF` and :py:class:`IndexedMGF` classes.
+The :py:func:`read` function can be used as an entry point.
+MGF spectra are converted to dictionaries. MS/MS data points are
+(optionally) represented as :py:mod:`numpy` arrays.
+Also, common parameters can be read from MGF file header with
+:py:func:`read_header` function.
+:py:func:`write` allows creation of MGF files.
+
+Classes
+-------
+
+  :py:class:`MGF` - a text-mode MGF parser. Suitable for reading spectra from a file consecutively.
+  Needs a file opened in text mode (or will open it if given a file name).
+
+  :py:class:`IndexedMGF` - a binary-mode MGF parser. When created, builds a byte offset index
+  for fast random access by spectrum titles. Sequential iteration is also supported.
+  Needs a seekable file opened in binary mode (if created from an existing file object).
+
+  :py:class:`MGFBase` - abstract class, the common ancestor of the two classes above.
+  Can be used for type checking.
+
+Functions
+---------
+
+  :py:func:`read` - an alias for :py:class:`MGF` or :py:class:`IndexedMGF`.
+
+  :py:func:`get_spectrum` - read a single spectrum with given title from a file.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`read_header` - get a dict with common parameters for all spectra
+  from the beginning of MGF file.
+
+  :py:func:`write` - write an MGF file.
+
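+Examples
+--------
+
+A minimal sketch of typical usage (the file name is hypothetical):
+
+  >>> from pyteomics import mgf
+  >>> with mgf.read('spectra.mgf') as reader:  # doctest: +SKIP
+  ...     for spectrum in reader:
+  ...         print(spectrum['params']['title'], len(spectrum['m/z array']))
+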
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import itertools as it
+import sys
+import warnings
+from . import auxiliary as aux
+
+
+class MGFBase(aux.MaskedArrayConversionMixin):
+    """Abstract mixin class representing an MGF file. Subclasses implement different approaches to parsing."""
+    _comments = set('#;!/')
+    _array_keys = ['m/z array', 'intensity array', 'charge array', 'ion array']
+    _array_keys_unicode = [u'm/z array', u'intensity array', u'charge array', u'ion array']
+    encoding = None
+
+    def __init__(self, source=None, **kwargs):
+        """Create an MGF file object, set MGF-specific parameters.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MGF format. Default is
+            :py:const:`None`, which means read standard input.
+
+        use_header : bool, optional, keyword only
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`True`.
+
+        convert_arrays : one of {0, 1, 2}, optional, keyword only
+            If `0`, m/z, intensities and (possibly) charges or ions will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional, keyword only
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+
+        read_ions : bool, optional, keyword only
+            If `True` (default: False), fragment ions are reported. Disabling it improves performance.
+            Note that right now, only one of (read_charges, read_ions) may be True.
+
+        dtype : type or str or dict, optional, keyword only
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
+
+        encoding : str, optional, keyword only
+            File encoding.
+        """
+
+        super(MGFBase, self).__init__(source, **kwargs)
+        self._use_header = kwargs.pop('use_header', True)
+        self._read_charges = kwargs.pop('read_charges', True)
+        self._read_ions = kwargs.pop('read_ions', False)
+        # Make sure no charges are read if ions are read
+        if self._read_ions:
+            self._read_charges = False
+        if self._use_header:
+            self._read_header()
+        else:
+            self._header = None
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__, (self._source_init,), self.__getstate__())
+
+    def __getstate__(self):
+        state = super(MGFBase, self).__getstate__()
+        state['use_header'] = self._use_header
+        state['header'] = self._header
+        return state
+
+    def __setstate__(self, state):
+        super(MGFBase, self).__setstate__(state)
+        self._header = state['header']
+        self._use_header = state['use_header']
+
+    @staticmethod
+    def parse_precursor_charge(charge_text, list_only=False):
+        return aux._parse_charge(charge_text, list_only=list_only)
+
+    @staticmethod
+    def parse_peak_charge(charge_text, list_only=False):
+        return aux._parse_charge(charge_text, list_only=list_only)
+
+    @staticmethod
+    def parse_peak_ion(ion_text):
+        return aux._parse_ion(ion_text)
+
+    @property
+    def header(self):
+        if self._header is None:
+            self._read_header()
+        return self._header
+
+    def _read_header_lines(self, header_lines):
+        header = {}
+        for line in header_lines:
+            if line.strip() == 'BEGIN IONS':
+                break
+            l = line.split('=')
+            if len(l) == 2:
+                key = l[0].lower()
+                val = l[1].strip()
+                header[key] = val
+        if 'charge' in header:
+            header['charge'] = self.parse_precursor_charge(header['charge'], True)
+        self._header = header
+
+    def _read_spectrum_lines(self, lines):
+        """Read a single spectrum from ``self._source``.
+
+        Returns
+        -------
+        out : dict
+        """
+
+        masses = []
+        intensities = []
+        charges = []
+        ions = []
+
+        params = self.header.copy() if self._use_header else {}
+
+        for i, line in enumerate(lines):
+            sline = line.strip()
+            if sline == 'BEGIN IONS':
+                if i == 0:
+                    continue
+                else:
+                    raise aux.PyteomicsError('Error when parsing MGF: unexpected start of spectrum.')
+            if not sline or sline[0] in self._comments:
+                pass
+            elif sline == 'END IONS':
+                if 'pepmass' in params:
+                    try:
+                        pepmass = tuple(map(float, params['pepmass'].split()))
+                    except ValueError:
+                        raise aux.PyteomicsError('MGF format error: cannot parse '
+                                'PEPMASS = {}'.format(params['pepmass']))
+                    else:
+                        params['pepmass'] = pepmass + (None,) * (2-len(pepmass))
+                if isinstance(params.get('charge'), aux.basestring):
+                    params['charge'] = self.parse_precursor_charge(params['charge'], True)
+                if 'rtinseconds' in params:
+                    params['rtinseconds'] = aux.unitfloat(params['rtinseconds'], 'second')
+                out = {'params': params, 'm/z array': masses, 'intensity array': intensities}
+                if self._read_charges:
+                    out['charge array'] = charges
+                if self._read_ions:
+                    out['ion array'] = ions
+                self._build_all_arrays(out)
+                if self.encoding and sys.version_info.major == 2:
+                    for key, ukey in zip(self._array_keys + ['params'], self._array_keys_unicode + [u'params']):
+                        if key in out:
+                            out[ukey] = out.pop(key)
+                return out
+
+            else:
+                if '=' in sline:  # spectrum-specific parameters!
+                    l = sline.split('=', 1)
+                    params[l[0].lower()] = l[1].strip()
+                else:  # this must be a peak list
+                    l = sline.split()
+                    try:
+                        masses.append(float(l[0]))
+                        intensities.append(float(l[1]))
+                        if self._read_charges:
+                            charges.append(self.parse_peak_charge(l[2]) if len(l) > 2 else 0)
+                        if self._read_ions:
+                            ions.append(self.parse_peak_ion(l[2]) if len(l) > 2 else "")
+                    except ValueError:
+                        raise aux.PyteomicsError(
+                             'Error when parsing %s. Line:\n%s' % (getattr(self._source, 'name', 'MGF file'), line))
+                    except IndexError:
+                        pass
+
+    def get_spectrum(self, title):
+        raise NotImplementedError()
+
+    @staticmethod
+    def _get_time(spectrum):
+        try:
+            return spectrum['params']['rtinseconds']
+        except KeyError:
+            raise aux.PyteomicsError('RT information not found.')
+
+
+class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixin, aux.IndexSavingTextReader):
+    """
+    A class representing an MGF file. Supports the `with` syntax and direct iteration for sequential
+    parsing. Specific spectra can be accessed by title using the indexing syntax in constant time.
+    If created using a file object, it needs to be opened in binary mode.
+
+    When iterated, :py:class:`IndexedMGF` object yields spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
+    'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    'ion array' is an array of ion annotations (str)
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MGF, lowercased).
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+    time : RTLocator
+        A property used for accessing spectra by retention time.
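+
+    Examples
+    --------
+    A sketch of random access (the file name, title and time are hypothetical):
+
+    >>> with IndexedMGF('spectra.mgf') as reader:  # doctest: +SKIP
+    ...     spectrum = reader['Spectrum 1']
+    ...     closest_to_420s = reader.time[420.0]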
+    """
+    delimiter = 'BEGIN IONS'
+
+    def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
+                 dtype=None, encoding='utf-8', index_by_scans=False, read_ions=False, _skip_index=False, **kwargs):
+        """
+        Create an :py:class:`IndexedMGF` (binary-mode) reader for a given MGF file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MGF format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in binary mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`True`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+
+        read_ions : bool, optional
+            If `True` (default: False), fragment ion types are reported. Disabling it improves performance.
+            Note that right now, only one of (read_charges, read_ions) may be True.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
+
+        encoding : str, optional
+            File encoding.
+
+        block_size : int, optional
+            Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+
+        Returns
+        -------
+
+        out : IndexedMGF
+            The reader object.
+        """
+        self._index_by_scans = index_by_scans
+        self._read_ions = read_ions
+        self.label = r'SCANS=(\d+)\s*' if index_by_scans else r'TITLE=([^\n]*\S)\s*'
+        super(IndexedMGF, self).__init__(source, parser_func=self._read, pass_file=False, args=(), kwargs={},
+                                         use_header=use_header, convert_arrays=convert_arrays,
+                                         read_charges=read_charges,
+                                         dtype=dtype, encoding=encoding, read_ions=read_ions, _skip_index=_skip_index,
+                                         **kwargs)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+                (self._source_init, False, self._convert_arrays, self._read_charges,
+                 None, self.encoding, self._index_by_scans, self._read_ions, True),
+                self.__getstate__())
+
+    @aux._keepstate_method
+    def _read_header(self):
+        try:
+            first = next(v for v in self._offset_index.values())[0]
+        except StopIteration:  # the index is empty, no spectra in file
+            first = -1
+        header_lines = self.read(first).decode(self.encoding).split('\n')
+        return self._read_header_lines(header_lines)
+
+    def _item_from_offsets(self, offsets):
+        start, end = offsets
+        lines = self._read_lines_from_offsets(start, end)
+        return self._read_spectrum_lines(lines)
+
+    def _read(self, **kwargs):
+        for _, offsets in self._offset_index.items():
+            spectrum = self._item_from_offsets(offsets)
+            yield spectrum
+
+    def get_spectrum(self, key):
+        return self.get_by_id(key)
+
+
+class MGF(MGFBase, aux.FileReader):
+    """
+    A class representing an MGF file. Supports the `with` syntax and direct iteration for sequential
+    parsing. Specific spectra can be accessed by title using the indexing syntax (if the file is seekable),
+    but it takes linear time to search through the file. Consider using :py:class:`IndexedMGF` for
+    constant-time access to spectra.
+
+    :py:class:`MGF` object behaves as an iterator, **yielding** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array',
+    'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    'ion array' is an array of ion annotations (str)
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MGF, lowercased).
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+
+    """
+
+    def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True,
+            read_ions=False, dtype=None, encoding=None):
+        """
+        Create an :py:class:`MGF` (text-mode) reader for a given MGF file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MGF format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in text mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`True`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+
+        read_ions : bool, optional
+            If `True` (default: False), fragment ion types are reported. Disabling it improves performance.
+            Note that right now, only one of (read_charges, read_ions) may be True.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
+
+        encoding : str, optional
+            File encoding.
+
+        Returns
+        -------
+
+        out : MGF
+            The reader object.
+        """
+        super(MGF, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={},
+            encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges,
+            read_ions=read_ions, dtype=dtype)
+
+    @aux._keepstate_method
+    def _read_header(self):
+        return self._read_header_lines(self._source)
+
+    def _read_spectrum(self):
+        return self._read_spectrum_lines(self._source)
+
+    def _read(self):
+        for line in self._source:
+            if line.strip() == 'BEGIN IONS':
+                yield self._read_spectrum()
+
+    @aux._keepstate_method
+    def get_spectrum(self, title):
+        for line in self._source:
+            sline = line.strip()
+            if sline[:5] == 'TITLE' and sline.split('=', 1)[1].strip() == title:
+                spectrum = self._read_spectrum()
+                spectrum['params']['title'] = title
+                return spectrum
+
+    def __getitem__(self, key):
+        return self.get_spectrum(key)
+
+
+def read(*args, **kwargs):
+    """Returns a reader for a given MGF file. Most of the parameters repeat the
+    instantiation signature of :py:class:`MGF` and :py:class:`IndexedMGF`.
+    Additional parameter `use_index` helps decide which class to instantiate
+    for a given `source`.
+
+    Parameters
+    ----------
+
+    source : str or file or None, optional
+        A file object (or file name) with data in MGF format. Default is
+        :py:const:`None`, which means read standard input.
+
+    use_header : bool, optional
+        Add the info from file header to each dict. Spectrum-specific parameters
+        override those from the header in case of conflict.
+        Default is :py:const:`True`.
+
+    convert_arrays : one of {0, 1, 2}, optional
+        If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+        If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+        If `2`, charges will be reported as a masked array (default).
+        The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+    read_charges : bool, optional
+        If `True` (default), fragment charges are reported. Disabling it improves performance.
+
+    read_ions : bool, optional
+        If `True` (default: False), fragment ion types are reported. Disabling it improves performance.
+        Note that right now, only one of (read_charges, read_ions) may be True.
+
+    dtype : type or str or dict, optional
+        dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+        Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'.
+
+    encoding : str, optional
+        File encoding.
+
+    use_index : bool, optional
+        Determines which parsing method to use. If :py:const:`True` (default), an instance of
+        :py:class:`IndexedMGF` is created. This facilitates random access by spectrum titles.
+        If an open file is passed as `source`, it needs to be open in binary mode.
+
+        If :py:const:`False`, an instance of :py:class:`MGF` is created. It reads
+        `source` in text mode and is suitable for iterative parsing. Access by spectrum title
+        requires linear search and thus takes linear time.
+
+    block_size : int, optional
+        Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+        (Accepted only for :py:class:`IndexedMGF`.)
+
+    Returns
+    -------
+
+    out : MGFBase
+        Instance of :py:class:`MGF` or :py:class:`IndexedMGF`.
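+
+    Examples
+    --------
+    A sketch of the dispatch logic (the file name is hypothetical):
+
+    >>> isinstance(read('spectra.mgf'), IndexedMGF)            # doctest: +SKIP
+    True
+    >>> isinstance(read('spectra.mgf', use_index=False), MGF)  # doctest: +SKIP
+    True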
+    """
+    if args:
+        source = args[0]
+    else:
+        source = kwargs.get('source')
+    use_index = kwargs.pop('use_index', None)
+    use_index = aux._check_use_index(source, use_index, True)
+    tp = IndexedMGF if use_index else MGF
+    return tp(*args, **kwargs)
+
+
+def get_spectrum(source, title, *args, **kwargs):
+    """Read one spectrum (with given `title`) from `source`.
+
+    See :py:func:`read` for explanation of parameters affecting the output.
+
+    .. note :: Only the key-value pairs after the "TITLE =" line will be included in the output.
+
+    Parameters
+    ----------
+
+    source : str or file or None
+        File to read from.
+    title : str
+        Spectrum title.
+    *args
+        Given to :py:func:`read`.
+    **kwargs
+        Given to :py:func:`read`.
+
+    Returns
+    -------
+    out : dict or None
+        A dict with the spectrum, if it is found, and None otherwise.
+
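+    Examples
+    --------
+    A sketch (the file name and title are hypothetical):
+
+    >>> spectrum = get_spectrum('spectra.mgf', 'Spectrum 1')  # doctest: +SKIP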
+    """
+    with read(source, *args, **kwargs) as f:
+        return f[title]
+
+
+@aux._keepstate
+def read_header(source):
+    """
+    Read the specified MGF file, get search parameters specified in the header
+    as a :py:class:`dict`, the keys corresponding to MGF format (lowercased).
+
+    Parameters
+    ----------
+
+    source : str or file
+        File name or file object representing a file in MGF format.
+
+    Returns
+    -------
+
+    header : dict
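+
+    Examples
+    --------
+    A sketch (the file name is hypothetical):
+
+    >>> header = read_header('spectra.mgf')  # doctest: +SKIP
+    >>> 'charge' in header                   # doctest: +SKIP
+    True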
+    """
+    with aux._file_obj(source, 'r') as source:
+        header = {}
+        for line in source:
+            if line.strip() == 'BEGIN IONS':
+                break
+            l = line.split('=')
+            if len(l) == 2:
+                key = l[0].lower()
+                val = l[1].strip()
+                header[key] = val
+        if 'charge' in header:
+            header['charge'] = aux._parse_charge(header['charge'], True)
+        return header
+
+
+_default_key_order = ['title', 'pepmass', 'rtinseconds', 'charge']
+
+
+def _pepmass_repr(k, pepmass):
+    outstr = k.upper() + '='
+    if not isinstance(pepmass, (str, int, float)):  # assume iterable
+        try:
+            outstr += ' '.join(str(x) for x in pepmass if x is not None)
+        except TypeError:
+            raise aux.PyteomicsError('Cannot handle parameter: PEPMASS = {}'.format(pepmass))
+    else:
+        outstr += str(pepmass)
+    return outstr
+
+
+def _charge_repr(k, charge):
+    try:
+        val = aux.Charge(charge)
+    except (TypeError, aux.PyteomicsError):
+        val = aux.ChargeList(charge)
+    return '{}={}'.format(k.upper(), val)
+
+
+def _default_repr(key, val):
+    return '{}={}'.format(key.upper(), val)
+
+
+_default_value_formatters = {'pepmass': _pepmass_repr, 'charge': _charge_repr}
+
+
+@aux._file_writer()
+def write(spectra, output=None, header='', key_order=_default_key_order, fragment_format=None,
+    write_charges=True, write_ions=False, use_numpy=None, param_formatters=_default_value_formatters):
+    """
+    Create a file in MGF format.
+
+    Parameters
+    ----------
+
+    spectra : iterable
+        A **sequence** of dictionaries with keys 'm/z array', 'intensity array',
+        and 'params'. 'm/z array' and 'intensity array' should be sequences of
+        :py:class:`int`, :py:class:`float`, or :py:class:`str`. Strings will
+        be written 'as is'. The sequences should be of equal length, otherwise
+        excess values will be ignored.
+
+        'params' should be a :py:class:`dict` with keys corresponding to MGF
+        format. Keys must be strings, they will be uppercased and used as is,
+        without any format consistency tests. Values can be of any type allowing
+        string representation.
+
+        'charge array' or 'ion array' can also be specified.
+
+        .. note ::
+            Passing a single spectrum will work, but will trigger a warning. This usage pattern is discouraged.
+            To ensure correct output when writing multiple spectra,
+            it is recommended to construct a sequence of spectra first and then call :py:func:`write` once.
+
+        .. seealso ::
+            This discussion of usage patterns of :py:func:`write`: https://github.com/levitsky/pyteomics/discussions/109
+
+    output : str or file or None, optional
+        Path or a file-like object open for writing. If an existing file is
+        specified by file name, it will be opened for writing.
+        Default value is :py:const:`None`, which means using standard output.
+
+        .. note::
+            The default mode for output files specified by name has been changed
+            from `a` to `w` in *pyteomics 4.6*. See `file_mode` to override the mode.
+
+    header : dict or (multiline) str or list of str, optional
+        In case of a single string or a list of strings, the header will be
+        written 'as is'. In case of dict, the keys (must be strings) will be
+        uppercased.
+
+    write_charges : bool, optional
+        If :py:const:`False`, fragment charges from 'charge array' will not be written.
+        Default is :py:const:`True`.
+
+    write_ions : bool, optional
+        If :py:const:`False`, fragment ions from 'ion array' will not be written.
+        If :py:const:`True`, then `write_charges` is set to :py:const:`False`.
+        Default is :py:const:`False`.
+
+    fragment_format : str, optional
+        Format string for m/z, intensity and charge (or ion annotation) of a fragment. Useful to set
+        the number of decimal places, e.g.:
+        ``fragment_format='%.4f %.0f'``. Default is ``'{} {} {}'``.
+
+        .. note::
+            The supported format syntax differs depending on other parameters.
+            If `use_numpy` is :py:const:`True` and :py:mod:`numpy` is available,
+            fragment peaks will be written using :py:func:`numpy.savetxt`. Then,
+            `fragment_format` must be recognized by that function.
+
+            Otherwise, plain Python string formatting is done.
+            See `the docs
+            <https://docs.python.org/library/string.html#format-specification-mini-language>`_
+            for details on writing the format string.
+            If some or all charges are missing, an empty string is substituted
+            instead, so formatting as :py:class:`!float` or :py:class:`!int` will raise an exception.
+            Hence it is safer to just use ``{}`` for charges.
+
+    key_order : list, optional
+        A list of strings specifying the order in which params will be written in
+        the spectrum header. Unlisted keys will be in arbitrary order.
+        Default is :py:data:`_default_key_order`.
+
+        .. note:: This does not affect the order of lines in the global header.
+
+    param_formatters : dict, optional
+        A dict mapping parameter names to functions. Each function must accept
+        two arguments (key and value) and return a string.
+        Default is :py:data:`_default_value_formatters`.
+
+    use_numpy : bool, optional
+        Controls whether fragment peak arrays are written using :py:func:`numpy.savetxt`.
+        Using :py:func:`numpy.savetxt` is faster, but cannot handle sparse arrays of fragment charges.
+        You may want to disable this if you need to save spectra with 'charge arrays' with missing values.
+
+        If not specified, will be set to the opposite of `write_charges`.
+        If :py:mod:`numpy` is not available, this parameter has no effect.
+
+    file_mode : str, keyword only, optional
+        If `output` is a file name, defines the mode the file will be opened in.
+        Otherwise will be ignored. Default is `'w'`.
+
+        .. note ::
+            The default changed from `'a'` in *pyteomics 4.6*.
+
+    encoding : str, keyword only, optional
+        Output file encoding (if `output` is specified by name).
+
+    Returns
+    -------
+
+    output : file
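+
+    Examples
+    --------
+    A minimal sketch (the file names are hypothetical):
+
+    >>> spectra = list(read('in.mgf'))                        # doctest: +SKIP
+    >>> out = write(spectra, 'out.mgf', header={'com': 'example'})  # doctest: +SKIP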
+    """
+    def key_value_line(key, val):
+        return param_formatters.get(key, _default_repr)(key, val) + '\n'
+
+    nones = (None, np.nan, np.ma.masked) if np is not None else (None,)
+
+    if fragment_format is None:
+        fragment_format = '{} {} {}'
+        np_format_2 = '%.5f %.1f'
+        np_format_3 = '%.5f %.1f %d'
+        np_format_i = '%.5f %.1f %s'
+    else:
+        np_format_2 = np_format_3 = np_format_i = fragment_format
+    format_str = fragment_format + '\n'
+
+    if write_ions:
+        write_charges = False
+    if use_numpy is None:
+        use_numpy = not write_charges
+
+    if isinstance(header, dict):
+        head_dict = header.copy()
+        # key_value_line() already appends a newline to each line
+        head_lines = [key_value_line(k, v) for k, v in header.items()]
+        head_str = ''.join(head_lines).rstrip('\n')
+    else:
+        if isinstance(header, str):
+            head_str = header
+            head_lines = header.split('\n')
+        else:
+            head_lines = list(header)
+            head_str = '\n'.join(header)
+        head_dict = {}
+        for line in head_lines:
+            if not line.strip() or any(line.startswith(c) for c in MGF._comments):
+                continue
+            l = line.split('=')
+            if len(l) == 2:
+                head_dict[l[0].lower()] = l[1].strip()
+    if head_str:
+        output.write(head_str + '\n\n')
+
+    if isinstance(spectra, dict) and 'm/z array' in spectra:
+        spectra = (spectra, )
+        warnings.warn("Passing a single spectrum to `write()` is discouraged. "
+            "To write a set of spectra, pass them to `write()` all at once. "
+            "For more info, see: https://github.com/levitsky/pyteomics/discussions/109.")
+
+    for spectrum in spectra:
+        output.write('BEGIN IONS\n')
+        found = set()
+        for key in it.chain(key_order, spectrum['params']):
+            if key not in found and key in spectrum['params']:
+                found.add(key)
+                val = spectrum['params'][key]
+                if val != head_dict.get(key):
+                    output.write(key_value_line(key, val))
+
+        try:
+            success = True
+            if np is not None and use_numpy:
+                if (not write_charges or 'charge array' not in spectrum) and (not write_ions or 'ion array' not in spectrum):
+                    X = np.empty((len(spectrum['m/z array']), 2))
+                    X[:, 0] = spectrum['m/z array']
+                    X[:, 1] = spectrum['intensity array']
+                    np.savetxt(output, X, fmt=np_format_2)
+                elif isinstance(spectrum.get('charge array'), np.ndarray):
+                    X = np.empty((len(spectrum['m/z array']), 3))
+                    X[:, 0] = spectrum['m/z array']
+                    X[:, 1] = spectrum['intensity array']
+                    X[:, 2] = spectrum['charge array']
+                    np.savetxt(output, X, fmt=np_format_3)
+                elif isinstance(spectrum.get('ion array'), np.ndarray):
+                    X = np.empty((len(spectrum['m/z array']), 3), dtype=object)
+                    X[:, 0] = spectrum['m/z array']
+                    X[:, 1] = spectrum['intensity array']
+                    X[:, 2] = spectrum['ion array']
+                    np.savetxt(output, X, fmt=np_format_i)
+                else:
+                    success = False
+            else:
+                success = False
+
+            if not success:
+                for m, i, c in zip(spectrum['m/z array'],
+                        spectrum['intensity array'],
+                        spectrum.get('charge array', it.cycle((None,))) if write_charges else
+                            spectrum.get('ion array', it.cycle((None,))) if write_ions else
+                            it.cycle((None,))):
+                    output.write(format_str.format(
+                        m, i,
+                        (c if c not in nones else '')))
+        except KeyError:
+            raise aux.PyteomicsError("'m/z array' and 'intensity array' must be present in all spectra.")
+        output.write('END IONS\n\n')
+    return output
+
+
+chain = aux._make_chain(read, 'read')
diff --git a/pyteomics/ms1.py b/pyteomics/ms1.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba7cc3be10da07f388c58db652f94b54b9ed2e36
--- /dev/null
+++ b/pyteomics/ms1.py
@@ -0,0 +1,492 @@
+"""
+ms1 - read MS1 data
+===================
+
+Summary
+-------
+
+`MS1 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple
+human-readable format for MS1 data. It allows storing MS1 peak lists and experimental parameters.
+
+This module provides minimalistic infrastructure for access to data stored in MS1 files.
+Two main classes are :py:class:`MS1`, which provides an iterative, text-mode parser,
+and :py:class:`IndexedMS1`, which is a binary-mode parser that supports random access using scan IDs
+and retention times.
+The function :py:func:`read` helps dispatch between the two classes.
+Also, common parameters can be read from MS1 file header with :py:func:`read_header` function.
+
+Classes
+-------
+
+  :py:class:`MS1` - a text-mode MS1 parser. Suitable for reading spectra from a file consecutively.
+  Needs a file opened in text mode (or will open it if given a file name).
+
+  :py:class:`IndexedMS1` - a binary-mode MS1 parser. When created, builds a byte offset index
+  for fast random access by spectrum ID. Sequential iteration is also supported.
+  Needs a seekable file opened in binary mode (if created from an existing file object).
+
+  :py:class:`MS1Base` - abstract class, the common ancestor of the two classes above.
+  Can be used for type checking.
+
+Functions
+---------
+
+  :py:func:`read` - an alias for :py:class:`MS1` or :py:class:`IndexedMS1`.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`read_header` - get a dict with common parameters for all spectra
+  from the beginning of MS1 file.
+
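+Examples
+--------
+
+A minimal sketch of sequential reading (the file name is hypothetical):
+
+  >>> from pyteomics import ms1
+  >>> with ms1.read('spectra.ms1') as reader:  # doctest: +SKIP
+  ...     for scan in reader:
+  ...         print(scan['params']['scan'], len(scan['m/z array']))
+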
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from . import auxiliary as aux
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+
+class MS1Base(aux.ArrayConversionMixin):
+    """Abstract class representing an MS1 file. Subclasses implement different approaches to parsing."""
+    _array_keys = ['m/z array', 'intensity array']
+    _float_keys = ['RTime', 'RetTime']
+
+    def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding=None, **kwargs):
+        """
+        Create an instance of a :py:class:`MS1Base` parser.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS1 format. Default is
+            :py:const:`None`, which means read standard input.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : bool, optional
+            If :py:const:`False`, m/z and intensities will be returned as regular lists.
+            If :py:const:`True` (default), they will be converted to :py:class:`numpy.ndarray`'s.
+            Conversion requires :py:mod:`numpy`.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array' and/or 'intensity array'.
+
+        encoding : str, optional
+            File encoding.
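+
+        Example (a sketch of the ``dtype`` mapping; the file name is
+        hypothetical)::
+
+            MS1('spectra.ms1', convert_arrays=True,
+                dtype={'m/z array': 'float32', 'intensity array': 'float32'})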
+        """
+        super(MS1Base, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding, **kwargs)
+        if convert_arrays and np is None:
+            raise aux.PyteomicsError('numpy is required for array conversion')
+        self._use_header = use_header
+        if use_header:
+            self._header = self._read_header()
+        else:
+            self._header = None
+        self._source_name = getattr(source, 'name', str(source))
+
+    def reset(self):
+        super(MS1Base, self).reset()
+        self._pending_line = None
+
+    @property
+    def header(self):
+        return self._header
+
+    def _read_header_lines(self, lines):
+        header = {}
+        for line in lines:
+            # header lines start with 'H'; stop at the first non-header line
+            if not line.startswith('H'):
+                break
+            tokens = line.split('\t', 2)
+            if len(tokens) < 3:
+                tokens = line.split(None, 2)
+            key = tokens[1]
+            val = tokens[2].strip()
+            header[key] = val
+        return header
+
+    def _make_scan(self, info):
+        for key in self._float_keys:
+            if key in info['params']:
+                info['params'][key] = float(info['params'][key])
+        self._build_all_arrays(info)
+        return info
+
+    def _handle_S(self, line, sline, params):
+        # re-split the full line: the S line may contain up to four columns
+        sline = line.strip().split(None, 3)
+        params['scan'] = tuple(sline[1:3])
+        if len(sline) == 4:  # in MS2 the S line contains the precursor m/z as a 4th column
+            params['precursor m/z'] = float(sline[3])
+
+    def _handle_I(self, line, sline, params):
+        params[sline[1]] = sline[2]
+
+    def _handle_Z(self, line, sline, params):
+        params.setdefault('charge', []).append(float(sline[1]))
+        params.setdefault('neutral mass', []).append(float(sline[2]))
+
+    def _handle_D(self, line, sline, params):
+        params.setdefault('analyzer', []).append(sline[1:])
+
+    def _handle_peak(self, line, sline, info):
+        try:
+            info['m/z array'].append(float(sline[0]))            # this may cause
+            info['intensity array'].append(float(sline[1]))      # exceptions...
+        except ValueError:
+            raise aux.PyteomicsError(
+                'Error when parsing %s. Line: %s' % (self._source_name, line))
+        except IndexError:
+            pass
+
+    def _read_spectrum_lines(self, lines):
+        params = {}
+        info = {'params': params}
+        for k in self._array_keys:
+            info[k] = []
+        if self._use_header:
+            params.update(self.header)
+        if self._pending_line:
+            reading_spectrum = True
+            self._handle_S(self._pending_line, None, params)
+        else:
+            reading_spectrum = False
+        line_count = 0
+        for i, line in enumerate(lines):
+            line_count = i
+            sline = line.strip().split(None, 2)
+            if not sline:
+                continue
+            if not reading_spectrum:
+                if sline[0] == 'S':
+                    reading_spectrum = True
+                    self._handle_S(line, sline, params)
+                # otherwise we are not interested; do nothing, just move along
+            else:
+                if sline[0] == 'S':
+                    self._pending_line = line
+                    return self._make_scan(info)
+                else:
+                    if sline[0] == 'I':  # spectrum-specific parameters!
+                        self._handle_I(line, sline, params)
+                    elif sline[0] == 'Z':  # MS2-specific charge state guess
+                        self._handle_Z(line, sline, params)
+                    elif sline[0] == 'D':  # MS2-specific analyzer annotation
+                        self._handle_D(line, sline, params)
+                    else:  # this must be a peak list
+                        self._handle_peak(line, sline, info)
+        self._pending_line = None
+        if line_count == 0:
+            return
+        return self._make_scan(info)
+
+    def __getstate__(self):
+        state = super(MS1Base, self).__getstate__()
+        state['use_header'] = self._use_header
+        state['header'] = self._header
+        return state
+
+    def __setstate__(self, state):
+        super(MS1Base, self).__setstate__(state)
+        self._use_header = state['use_header']
+        self._header = state['header']
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+            (self._source_init, False, self._convert_arrays, None, self.encoding),
+            self.__getstate__())
+
+
+class MS1(MS1Base, aux.FileReader):
+    """
+    A class representing an MS1 file. Supports the `with` syntax and direct iteration for sequential
+    parsing.
+
+    :py:class:`MS1` object behaves as an iterator, **yielding** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array',
+    'intensity array', and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters.
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+
+    """
+    def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding=None, **kwargs):
+        """
+        Create an :py:class:`MS1` (text-mode) reader for a given MS1 file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS1 format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in text mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : bool, optional
+            If :py:const:`False`, m/z and intensities will be returned as regular lists.
+            If :py:const:`True` (default), they will be converted to :py:class:`numpy.ndarray`'s.
+            Conversion requires :py:mod:`numpy`.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array' and/or 'intensity array'.
+
+        encoding : str, optional
+            File encoding.
+
+        Returns
+        -------
+
+        out : MS1
+            The reader object.
+        """
+        super(MS1, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding,
+            mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={})
+
+    @aux._keepstate_method
+    def _read_header(self):
+        return self._read_header_lines(self._source)
+
+    def _read(self):
+        def get_next_spectrum():
+            return self._read_spectrum_lines(self._source)
+
+        # iter() with a sentinel: read until _read_spectrum_lines returns None
+        for spectrum in iter(get_next_spectrum, None):
+            yield spectrum
+
+
+class IndexedMS1(MS1Base, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixin, aux.IndexedTextReader):
+    """
+    A class representing an MS1 file. Supports the `with` syntax and direct iteration for sequential
+    parsing. Specific spectra can be accessed by scan label using the indexing syntax in constant time.
+    If created using a file object, it needs to be opened in binary mode.
+
+    When iterated, :py:class:`IndexedMS1` object yields spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array', 'intensity array' and 'params'.
+    'm/z array' and 'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MS1).
+
+    .. warning ::
+        Labels for scan objects are constructed as the first number in the S line, as follows:
+        for a line ``S  0   1`` the label is `'0'`. If these labels are not unique
+        for the scans in the file, the indexed parser will not work correctly. Consider using
+        :py:class:`MS1` instead.
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+    time : RTLocator
+        A property used for accessing spectra by retention time.
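+
+    Example (a sketch; the scan label and retention time shown are hypothetical)::
+
+        with IndexedMS1('spectra.ms1') as reader:
+            spectrum = reader['1']        # random access by scan label
+            nearest = reader.time[25.0]   # spectrum closest to RT 25.0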
+    """
+
+    delimiter = '\nS'
+    label = r'^[\n]?S\s+(\S+)'
+
+    def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding='utf-8', _skip_index=False, **kwargs):
+        """
+        Create an :py:class:`IndexedMS1` (binary-mode) reader for a given MS1 file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS1 format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in binary mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : bool, optional
+            If :py:const:`False`, m/z and intensities will be returned as regular lists.
+            If :py:const:`True` (default), they will be converted to :py:class:`numpy.ndarray`'s.
+            Conversion requires :py:mod:`numpy`.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array' and/or 'intensity array'.
+
+        encoding : str, optional
+            File encoding.
+
+        block_size : int, optional
+            Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+
+        Returns
+        -------
+
+        out : IndexedMS1
+            The reader object.
+        """
+        super(IndexedMS1, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding,
+            parser_func=self._read, pass_file=False, args=(), kwargs={}, _skip_index=_skip_index, **kwargs)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+            (self._source_init, False, self._convert_arrays, None, self.encoding, True),
+            self.__getstate__())
+
+    @aux._keepstate_method
+    def _read_header(self):
+        try:
+            first = next(v for v in self._offset_index.values())[0]
+        except StopIteration: # the index is empty, no spectra in file
+            first = -1
+        header_lines = self.read(first).decode(self.encoding).split('\n')
+        return self._read_header_lines(header_lines)
+
+    def _item_from_offsets(self, offsets):
+        start, end = offsets
+        lines = self._read_lines_from_offsets(start, end)
+        return self._read_spectrum_lines(lines)
+
+    def _read(self, **kwargs):
+        for _, offsets in self._offset_index.items():
+            spectrum = self._item_from_offsets(offsets)
+            yield spectrum
+
+    def get_spectrum(self, key):
+        return self.get_by_id(key)
+
+    def _get_time(self, spectrum):
+        try:
+            return spectrum['params']['RTime']
+        except KeyError:
+            raise aux.PyteomicsError('RT information not found.')
+
+
+def read_header(source, *args, **kwargs):
+    """
+    Read the specified MS1 file, get the parameters specified in the header
+    as a :py:class:`dict`.
+
+    Parameters
+    ----------
+
+    source : str or file
+        File name or file object representing a file in MS1 format.
+
+    Returns
+    -------
+
+    header : dict
+    """
+    kwargs['use_header'] = True
+    return read(source, *args, **kwargs).header
+
+
+def read(*args, **kwargs):
+    """Read an MS1 file and return entries iteratively.
+
+    Read the specified MS1 file, **yield** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array',
+    'intensity array', and 'params'. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters.
+
+    Parameters
+    ----------
+
+    source : str or file or None, optional
+        A file object (or file name) with data in MS1 format. Default is
+        :py:const:`None`, which means read standard input.
+
+    use_header : bool, optional
+        Add the info from file header to each dict. Spectrum-specific parameters
+        override those from the header in case of conflict.
+        Default is :py:const:`False`.
+
+    convert_arrays : bool, optional
+        If :py:const:`False`, m/z and intensities will be returned as regular lists.
+        If :py:const:`True` (default), they will be converted to :py:class:`numpy.ndarray`'s.
+        Conversion requires :py:mod:`numpy`.
+
+    dtype : type or str or dict, optional
+        dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+        Keys should be 'm/z array' and/or 'intensity array'.
+
+    encoding : str, optional
+        File encoding.
+
+    use_index : bool, optional
+        Determines which parsing method to use. If :py:const:`True`, an instance of
+    :py:class:`IndexedMS1` is created. This facilitates random access by scan labels.
+        If an open file is passed as `source`, it needs to be open in binary mode.
+
+        If :py:const:`False` (default), an instance of :py:class:`MS1` is created. It reads
+        `source` in text mode and is suitable for iterative parsing.
+
+        .. warning ::
+            Labels for scan objects are constructed as the first number in the S line, as follows:
+            for a line ``S  0   1`` the label is `'0'`. If these labels are not unique
+            for the scans in the file, the indexed parser will not work correctly.
+
+    block_size : int, optional
+        Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+        (Accepted only for :py:class:`IndexedMS1`.)
+
+    Returns
+    -------
+
+    out : :py:class:`MS1Base`
+        An instance of :py:class:`MS1` or :py:class:`IndexedMS1`, depending on `use_index` and `source`.
+    """
+    if args:
+        source = args[0]
+    else:
+        source = kwargs.get('source')
+    use_index = kwargs.pop('use_index', None)
+    use_index = aux._check_use_index(source, use_index, False)
+    tp = IndexedMS1 if use_index else MS1
+
+    return tp(*args, **kwargs)
+
+
+chain = aux._make_chain(read, 'read')
diff --git a/pyteomics/ms2.py b/pyteomics/ms2.py
new file mode 100644
index 0000000000000000000000000000000000000000..16afbbf17eb624da3dd8b4f33a4be6a4fc7593e9
--- /dev/null
+++ b/pyteomics/ms2.py
@@ -0,0 +1,396 @@
+"""
+ms2 - read MS/MS data in MS2 format
+===================================
+
+Summary
+-------
+
+`MS2 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple
+human-readable format for MS2 data. It allows storing MS2 peak lists and
+experimental parameters.
+
+This module provides minimalistic infrastructure for access to data stored in
+MS2 files.
+Two main classes are :py:class:`MS2`, which provides an iterative, text-mode parser,
+and :py:class:`IndexedMS2`, which is a binary-mode parser that supports random access using scan IDs
+and retention times.
+The function :py:func:`read` helps dispatch between the two classes.
+Also, common parameters can be read from the MS2 file header with the
+:py:func:`read_header` function.
+
+Classes
+-------
+
+  :py:class:`MS2` - a text-mode MS2 parser. Suitable to read spectra from a file consecutively.
+  Needs a file opened in text mode (or will open it if given a file name).
+
+  :py:class:`IndexedMS2` - a binary-mode MS2 parser. When created, builds a byte offset index
+  for fast random access by spectrum ID. Sequential iteration is also supported.
+  Needs a seekable file opened in binary mode (if created from existing file object).
+
+  :py:class:`MS2Base` - abstract class, the common ancestor of the two classes above.
+  Can be used for type checking.
+
+Functions
+---------
+
+  :py:func:`read` - an alias for :py:class:`MS2` or :py:class:`IndexedMS2`.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`read_header` - get a dict with common parameters for all spectra
+  from the beginning of an MS2 file.
+
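+Example
+-------
+
+A minimal sketch (``spectra.ms2`` is a hypothetical file name)::
+
+    from pyteomics import ms2
+    with ms2.read('spectra.ms2') as reader:
+        for spectrum in reader:
+            print(spectrum['params'].get('RTime'),
+                  spectrum['charge array'])
+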
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from pyteomics import auxiliary as aux
+from pyteomics.ms1 import MS1, IndexedMS1, MS1Base
+
+
+class MS2Base(aux.MaskedArrayConversionMixin, MS1Base):
+    """Abstract class representing an MS2 file. Subclasses implement different approaches to parsing."""
+    _array_keys = ['m/z array', 'intensity array', 'charge array', 'resolution array']
+    _float_keys = ['RTime', 'RetTime', 'IonInjectionTime', 'PrecursorInt']
+
+    def __init__(self, source=None, use_header=False, convert_arrays=2, dtype=None, read_charges=True, read_resolutions=True, encoding=None, **kwargs):
+        """
+        Create an instance of a :py:class:`MS2Base` parser.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS1 format. Default is
+            :py:const:`None`, which means read standard input.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+            Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
+
+        read_resolutions : bool, optional
+            If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
+            Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
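+            For instance, a full four-column peak line could look like
+            ``204.1017 1500.2 2 8000`` (*m/z*, intensity, charge, resolution);
+            the numbers are illustrative only.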
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'.
+
+        encoding : str, optional
+            File encoding.
+        """
+        super(MS2Base, self).__init__(source=source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype,
+            encoding=encoding, **kwargs)
+        self._read_charges = read_charges
+        self._read_resolutions = read_resolutions
+
+    def _handle_peak(self, line, sline, info):
+        super(MS2Base, self)._handle_peak(line, sline, info)
+        # `sline` comes from a limited split; re-split the whole line to
+        # access the optional charge (3rd) and resolution (4th) columns
+        sline = line.strip().split()
+        if self._read_charges:
+            if len(sline) > 2:
+                try:
+                    info['charge array'].append(int(sline[2]))
+                except ValueError:
+                    raise aux.PyteomicsError("Error parsing fragment charge on line: " + line)
+            else:
+                info['charge array'].append(0)
+        if self._read_resolutions:
+            if len(sline) > 3:
+                try:
+                    info['resolution array'].append(int(sline[3]))
+                except ValueError:
+                    raise aux.PyteomicsError("Error parsing fragment peak resolution on line: " + line)
+            else:
+                info['resolution array'].append(0)
+
+    def _make_scan(self, info):
+        if not self._read_charges:
+            del info['charge array']
+        if not self._read_resolutions:
+            del info['resolution array']
+        return super(MS2Base, self)._make_scan(info)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+            (self._source_init, False, self._convert_arrays, None, self._read_charges, self._read_resolutions, self.encoding),
+            self.__getstate__())
+
+
+class MS2(MS2Base, MS1):
+    """
+    A class representing an MS2 file. Supports the `with` syntax and direct iteration for sequential
+    parsing.
+
+    :py:class:`MS2` object behaves as an iterator, **yielding** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with 'm/z array', 'intensity array',
+    'charge array', 'resolution array' (by default) and 'params' keys.
+    'm/z array' and 'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters.
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Create an :py:class:`MS2` (text-mode) reader for a given MS2 file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS2 format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in text mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+            Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
+
+        read_resolutions : bool, optional
+            If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
+            Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'.
+
+        encoding : str, optional
+            File encoding.
+
+        Returns
+        -------
+
+        out : MS2
+            The reader object.
+        """
+        super(MS2, self).__init__(*args, **kwargs)
+
+
+class IndexedMS2(IndexedMS1, MS2Base):
+    """
+    A class representing an MS2 file. Supports the `with` syntax and direct iteration for sequential
+    parsing. Specific spectra can be accessed by scan label using the indexing syntax in constant time.
+    If created using a file object, it needs to be opened in binary mode.
+
+    When iterated, :py:class:`IndexedMS2` object yields spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with 'm/z array', 'intensity array',
+    'charge array', 'resolution array' and 'params' keys. 'm/z array' and
+    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints,
+    and 'params' stores a :py:class:`dict` of parameters (keys and values are
+    :py:class:`str`, keys corresponding to MS2).
+
+    .. warning ::
+        Labels for scan objects are constructed as the first number in the S line, as follows:
+        for a line ``S  0   1   123.4`` the label is `'0'`. If these labels are not unique
+        for the scans in the file, the indexed parser will not work correctly. Consider using
+        :py:class:`MS2` instead.
+
+    Attributes
+    ----------
+
+    header : dict
+        The file header.
+    time : RTLocator
+        A property used for accessing spectra by retention time.
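+
+    Example (a sketch; the scan label is hypothetical)::
+
+        with IndexedMS2('spectra.ms2') as reader:
+            spectrum = reader['2']
+            charges = spectrum['charge array']  # masked array of fragment charges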
+    """
+    def __init__(self, source=None, use_header=False, convert_arrays=2, dtype=None, read_charges=True, read_resolutions=True,
+                 encoding='utf-8', _skip_index=False, **kwargs):
+        """
+        Create an :py:class:`IndexedMS2` (binary-mode) reader for a given MS2 file.
+
+        Parameters
+        ----------
+
+        source : str or file or None, optional
+            A file object (or file name) with data in MS2 format. Default is
+            :py:const:`None`, which means read standard input.
+
+            .. note :: If a file object is given, it must be opened in binary mode.
+
+        use_header : bool, optional
+            Add the info from file header to each dict. Spectrum-specific parameters
+            override those from the header in case of conflict.
+            Default is :py:const:`False`.
+
+        convert_arrays : one of {0, 1, 2}, optional
+            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+            If `2`, charges will be reported as a masked array (default).
+            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+        read_charges : bool, optional
+            If `True` (default), fragment charges are reported. Disabling it improves performance.
+            Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
+
+        read_resolutions : bool, optional
+            If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
+            Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
+
+        dtype : type or str or dict, optional
+            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+            Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'.
+
+        encoding : str, optional
+            File encoding.
+
+        block_size : int, optional
+            Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+
+        Returns
+        -------
+
+        out : IndexedMS2
+            The reader object.
+        """
+        super(IndexedMS2, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype,
+            read_charges=read_charges, read_resolutions=read_resolutions, encoding=encoding, _skip_index=_skip_index, **kwargs)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__,
+            (self._source_init, False, self._convert_arrays, None, self._read_charges, self._read_resolutions, self.encoding, True),
+            self.__getstate__())
+
+
+def read_header(source, *args, **kwargs):
+    """
+    Read the specified MS2 file, get the parameters specified in the header
+    as a :py:class:`dict`.
+
+    Parameters
+    ----------
+
+    source : str or file
+        File name or file object representing a file in MS2 format.
+
+    Returns
+    -------
+
+    header : dict
+    """
+    kwargs['use_header'] = True
+    return read(source, *args, **kwargs).header
+
+
+def read(*args, **kwargs):
+    """Read an MS2 file and return entries iteratively.
+
+    Read the specified MS2 file, **yield** spectra one by one.
+    Each 'spectrum' is a :py:class:`dict` with 'm/z array', 'intensity array',
+    'charge array', 'resolution array' (by default) and 'params' keys.
+    'm/z array' and 'intensity array' store :py:class:`numpy.ndarray`'s of floats,
+    and 'params' stores a :py:class:`dict` of parameters.
+
+    Parameters
+    ----------
+
+    source : str or file or None, optional
+        A file object (or file name) with data in MS2 format. Default is
+        :py:const:`None`, which means read standard input.
+
+    use_header : bool, optional
+        Add the info from file header to each dict. Spectrum-specific parameters
+        override those from the header in case of conflict.
+        Default is :py:const:`False`.
+
+    convert_arrays : one of {0, 1, 2}, optional
+        If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
+        If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
+        If `2`, charges will be reported as a masked array (default).
+        The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
+
+    read_charges : bool, optional
+        If `True` (default), fragment charges are reported. Disabling it improves performance.
+        Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
+
+    read_resolutions : bool, optional
+        If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
+        Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
+
+    dtype : type or str or dict, optional
+        dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
+        Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'resolution array'.
+
+    encoding : str, optional
+        File encoding.
+
+    use_index : bool, optional
+        Determines which parsing method to use. If :py:const:`True`, an instance of
+    :py:class:`IndexedMS2` is created. This facilitates random access by scan labels.
+        If an open file is passed as `source`, it needs to be open in binary mode.
+
+        .. warning ::
+            Labels for scan objects are constructed as the first number in the S line, as follows:
+            for a line ``S  0   1   123.4`` the label is `'0'`. If these labels are not unique
+            for the scans in the file, the indexed parser will not work correctly.
+
+        If :py:const:`False` (default), an instance of :py:class:`MS2` is created. It reads
+        `source` in text mode and is suitable for iterative parsing.
+
+    block_size : int, optional
+        Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
+        (Accepted only for :py:class:`IndexedMS2`.)
+
+    Returns
+    -------
+
+    out : :py:class:`MS2Base`
+        An instance of :py:class:`MS2` or :py:class:`IndexedMS2`, depending on `use_index` and `source`.
+    """
+    if args:
+        source = args[0]
+    else:
+        source = kwargs.get('source')
+    use_index = kwargs.pop('use_index', None)
+    use_index = aux._check_use_index(source, use_index, False)
+    tp = IndexedMS2 if use_index else MS2
+
+    return tp(*args, **kwargs)
+
+
+chain = aux._make_chain(read, 'read')
diff --git a/pyteomics/mzid.py b/pyteomics/mzid.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df70bff0fb5dfcbffc15edf3a97bfa374c3c0f5
--- /dev/null
+++ b/pyteomics/mzid.py
@@ -0,0 +1,453 @@
+"""
+mzid - mzIdentML file reader
+============================
+
+Summary
+-------
+
+`mzIdentML <http://www.psidev.info/mzidentml>`_  is one of the standards
+developed by the Proteomics Informatics working group of the HUPO Proteomics
+Standard Initiative.
+
+This module provides a minimalistic way to extract information from mzIdentML
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`MzIdentML`) to iterate over entries in
+``<SpectrumIdentificationResult>`` elements, i.e. groups of identifications
+for a certain spectrum. Note that each entry can contain more than one PSM
+(peptide-spectrum match). They are accessible with the
+"SpectrumIdentificationItem" key.
+:py:class:`MzIdentML` objects also support direct indexing by element ID.
+
+Data access
+-----------
+
+  :py:class:`MzIdentML` - a class representing a single MzIdentML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through peptide-spectrum matches in an mzIdentML
+  file. Data from a single PSM group are converted to a human-readable dict.
+  Basically creates an :py:class:`MzIdentML` object and reads it.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`DataFrame` - read MzIdentML files into a :py:class:`pandas.DataFrame`.
+
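+Example
+-------
+
+A minimal sketch (``search.mzid`` is a hypothetical file name; the available
+keys depend on the search engine that produced the file)::
+
+    from pyteomics import mzid
+    with mzid.read('search.mzid') as reader:
+        for result in reader:
+            for sii in result['SpectrumIdentificationItem']:
+                print(sii['rank'], sii['chargeState'])
+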
+Target-decoy approach
+---------------------
+
+  :py:func:`filter` - read a chain of mzIdentML files and filter to a certain
+  FDR using TDA.
+
+  :py:func:`filter.chain` - chain a series of filters applied independently to
+  several files.
+
+  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+  independently to an iterable of files.
+
+  :py:func:`filter_df` - filter MzIdentML files and return a :py:class:`pandas.DataFrame`.
+
+  :py:func:`is_decoy` - determine if a "SpectrumIdentificationResult" should be
+  considered decoy.
+
+  :py:func:`fdr` - estimate the false discovery rate of a set of identifications
+  using the target-decoy approach.
+
+  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
+  set using the target-decoy approach.
+
+Controlled Vocabularies
+~~~~~~~~~~~~~~~~~~~~~~~
+mzIdentML relies on controlled vocabularies to describe its contents extensibly. See
+`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_
+for more details on how they are used.
+
+Handling Time Units and Other Qualified Quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+mzIdentML contains information which may be described as using a variety of different time units.
+See `Unit Handling <../data.html#unit-handling>`_ for more information.
+
+
+Deprecated functions
+--------------------
+
+  :py:func:`version_info` - get information about mzIdentML version and schema.
+  You can just read the corresponding attribute of the :py:class:`MzIdentML`
+  object.
+
+  :py:func:`get_by_id` - get an element by its ID and extract the data from it.
+  You can just call the corresponding method of the :py:class:`MzIdentML`
+  object.
+
+  :py:func:`iterfind` - iterate over elements in an mzIdentML file.
+  You can just call the corresponding method of the :py:class:`MzIdentML`
+  object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import warnings
+from . import auxiliary as aux
+from . import xml, _schema_defaults
+
+
+class MzIdentML(xml.MultiProcessingXML, xml.IndexSavingXML):
+    """Parser class for MzIdentML files."""
+    file_format = 'mzIdentML'
+    _root_element = 'MzIdentML'
+    _default_schema = _schema_defaults._mzid_schema_defaults
+    _default_version = '1.1.0'
+    _default_iter_tag = 'SpectrumIdentificationResult'
+    _structures_to_flatten = {'Fragmentation'}
+    _indexed_tags = {'SpectrumIdentificationResult', 'SpectrumIdentificationItem',
+                     'SearchDatabase', 'SourceFile', 'SpectraData', 'Sample',
+                     'DBSequence',  'Peptide', 'PeptideEvidence',
+                     'Measure', 'TranslationTable', 'MassTable', 'Enzyme',
+                     'Organization', 'AnalysisSoftware', 'BibliographicReference', 'Person', 'Provider',
+                     'SpectrumIdentificationList', 'SpectrumIdentificationProtocol', 'SpectrumIdentification',
+                     'ProteinDetectionList', 'ProteinDetectionProtocol', 'ProteinDetection',
+                     'ProteinDetectionHypothesis', 'ProteinAmbiguityGroup',
+                    }
+
+    _element_handlers = xml.XML._element_handlers.copy()
+    _element_handlers.update({
+        "Modification": xml.XML._promote_empty_parameter_to_name,
+        "SpectrumIDFormat": xml.XML._promote_empty_parameter_to_name,
+        "FileFormat": xml.XML._promote_empty_parameter_to_name,
+        "Role": xml.XML._promote_empty_parameter_to_name
+    })
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('retrieve_refs', True)
+        super(MzIdentML, self).__init__(*args, **kwargs)
+
+    def _get_info_smart(self, element, **kwargs):
+        """Extract the info in a smart way depending on the element type"""
+        name = xml._local_name(element)
+        kwargs = dict(kwargs)
+        rec = kwargs.pop("recursive", None)
+
+        # Try not to recursively unpack the root element
+        # unless the user really wants to.
+        if name == self._root_element:
+            return self._get_info(element,
+                    recursive=(rec if rec is not None else False),
+                    **kwargs)
+        else:
+            return self._get_info(element,
+                    recursive=(rec if rec is not None else True),
+                    **kwargs)
+
+    def _retrieve_refs(self, info, **kwargs):
+        """Retrieves and embeds the data for each attribute in `info` that
+        ends in _ref. Removes the id attribute from `info`"""
+        for k, v in dict(info).items():
+            if k.endswith('_ref'):
+                try:
+                    by_id = self.get_by_id(v, retrieve_refs=True)
+                except KeyError:
+                    warnings.warn('Ignoring unresolved reference: ' + v)
+                else:
+                    info.update(by_id)
+                    del info[k]
+                    info.pop('id', None)
+
+def read(source, **kwargs):
+    """Parse `source` and iterate through peptide-spectrum matches.
+
+    .. note:: This function is provided for backward compatibility only.
+        It simply creates an :py:class:`MzIdentML` instance using
+        provided arguments and returns it.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzIdentML file or the file object itself.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    retrieve_refs : bool, optional
+        If :py:const:`True`, additional information from references will be
+        automatically added to the results. The file processing time will
+        increase. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzIdentML header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    build_id_cache : bool, optional
+        Defines whether a cache of element IDs should be built and stored on the
+        created :py:class:`MzIdentML` instance. Default value is the value of
+        `retrieve_refs`.
+
+        .. note:: This parameter is ignored when ``use_index`` is ``True`` (default).
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored.
+
+    indexed_tags : container of bytes, optional
+        Defines which elements need to be indexed. Empty set by default.
+
+    Returns
+    -------
+    out : MzIdentML
+       An iterator over the dicts with PSM properties.
+    """
+    kwargs = kwargs.copy()
+    kwargs.setdefault('retrieve_refs', True)
+    kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs'))
+    return MzIdentML(source, **kwargs)
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create an :py:class:`MzIdentML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    retrieve_refs : bool, optional
+        If :py:const:`True`, additional information from references will be
+        automatically added to the results. The file processing time will
+        increase. Default is :py:const:`False`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzIdentML header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    build_id_cache : bool, optional
+        Defines whether a cache of element IDs should be built and stored on the
+        created :py:class:`MzIdentML` instance. Default value is the value of
+        `retrieve_refs`.
+
+    Returns
+    -------
+    out : iterator
+    """
+    kwargs = kwargs.copy()
+    kwargs['build_id_cache'] = kwargs.get('build_id_cache',
+            kwargs.get('retrieve_refs'))
+    return MzIdentML(source, **kwargs).iterfind(path, **kwargs)
+
+version_info = xml._make_version_info(MzIdentML)
+
+def get_by_id(source, elem_id, **kwargs):
+    """Parse `source` and return the element with `id` attribute equal
+    to `elem_id`. Returns :py:const:`None` if no such element is found.
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`get_by_id` calls on one file, you should
+        create an :py:class:`MzIdentML` object and use its
+        :py:meth:`!get_by_id` method.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzIdentML file or the file object itself.
+
+    elem_id : str
+        The value of the `id` attribute to match.
+
+    Returns
+    -------
+    out : :py:class:`dict` or :py:const:`None`
+    """
+    return MzIdentML(source, **kwargs).get_by_id(elem_id, **kwargs)
+
+
+chain = aux.ChainBase._make_chain(MzIdentML)
+
+
+def is_decoy(psm, prefix=None):
+    """Given a PSM dict, return :py:const:`True` if all proteins in the dict
+    are marked as decoy, and :py:const:`False` otherwise.
+
+    Parameters
+    ----------
+    psm : dict
+        A dict, as yielded by :py:func:`read`.
+    prefix : ignored
+
+    Returns
+    -------
+    out : bool
+    """
+    return all(pe['isDecoy'] for sii in psm['SpectrumIdentificationItem']
+            for pe in sii['PeptideEvidenceRef'])
+
+
+def DataFrame(*args, **kwargs):
+    """Read MzIdentML files into a :py:class:`pandas.DataFrame`.
+
+    Requires :py:mod:`pandas`.
+
+    .. warning :: Only the first 'SpectrumIdentificationItem' element is considered in every
+                  'SpectrumIdentificationResult'.
+
+    Parameters
+    ----------
+    *args
+        Passed to :py:func:`chain`.
+    **kwargs
+        Passed to :py:func:`chain`.
+
+    sep : str or None, keyword only, optional
+        Some values related to PSMs (such as protein information) are variable-length
+        lists. If `sep` is a :py:class:`str`, they will be packed into single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
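+
+    Example (a sketch; ``search.mzid`` is a hypothetical file name)::
+
+        df = DataFrame('search.mzid', sep=';')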
+    """
+    import pandas as pd
+    data = []
+
+    sep = kwargs.pop('sep', None)
+    with chain(*args, **kwargs) as f:
+        for item in f:
+            info = {}
+            for k, v in item.items():
+                if isinstance(v, (str, int, float)):
+                    info[k] = v
+            sii = item.get('SpectrumIdentificationItem', [None])[0]
+            if sii is not None:
+                info.update((k, v) for k, v in sii.items() if isinstance(v, (str, int, float)))
+                evref = sii.get('PeptideEvidenceRef')
+                if evref:
+                    prot_descr, accessions, isd, starts, ends, lengths = [], [], [], [], [], []
+                    for d in evref:
+                        prot_descr.append(d.get('protein description'))
+                        accessions.append(d.get('accession'))
+                        isd.append(d.get('isDecoy'))
+                        starts.append(d.get('start'))
+                        ends.append(d.get('end'))
+                        lengths.append(d.get('length'))
+                    isd = all(isd)
+                    if sep is not None:
+                        if all(isinstance(prd, str) for prd in prot_descr):
+                            prot_descr = sep.join(prot_descr)
+
+                        if all(isinstance(acc, str) for acc in accessions):
+                            accessions = sep.join(accessions)
+
+                    if all(prd is None for prd in prot_descr):
+                        prot_descr = None
+                    if all(acc is None for acc in accessions):
+                        accessions = None
+
+                    info.update((k, v) for k, v in evref[0].items() if isinstance(v, (str, int, float, list)))
+                    info['protein description'] = prot_descr
+                    info['accession'] = accessions
+                    info['isDecoy'] = isd
+                    info['start'] = starts
+                    info['end'] = ends
+                    info['length'] = lengths
+            data.append(info)
+    df = pd.DataFrame(data)
+    return df
+
+
+def filter_df(*args, **kwargs):
+    """Read MzIdentML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
+    Positional arguments can be MzIdentML files or DataFrames.
+
+    Requires :py:mod:`pandas`.
+
+    .. warning :: Only the first 'SpectrumIdentificationItem' element is considered in every
+                  'SpectrumIdentificationResult'.
+
+    Parameters
+    ----------
+    key : str / iterable / callable, keyword only, optional
+        Default is 'mascot:expectation value'.
+    is_decoy : str / iterable / callable, keyword only, optional
+        Default is 'isDecoy'.
+    *args
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+    **kwargs
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
+    """
+    import pandas as pd
+    kwargs.setdefault('key', 'mascot:expectation value')
+    kwargs.setdefault('is_decoy', 'isDecoy')
+    if all(isinstance(arg, pd.DataFrame) for arg in args):
+        df = pd.concat(args)
+    else:
+        df = DataFrame(*args, **kwargs)
+    return aux.filter(df, **kwargs)
+
+
+fdr = aux._make_fdr(is_decoy, None)
+_key = lambda x: min(
+    sii['mascot:expectation value'] for sii in x['SpectrumIdentificationItem'])
+qvalues = aux._make_qvalues(chain, is_decoy, None, _key)
+filter = aux._make_filter(chain, is_decoy, None, _key, qvalues)
+filter.chain = aux._make_chain(filter, 'filter', True)
diff --git a/pyteomics/mzml.py b/pyteomics/mzml.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a961329293b01d6318abe52cac3ac13128eb95
--- /dev/null
+++ b/pyteomics/mzml.py
@@ -0,0 +1,546 @@
+"""
+mzml - reader for mass spectrometry data in mzML format
+=======================================================
+
+Summary
+-------
+
+mzML is a standard, rich XML format for raw mass spectrometry data storage.
+Please refer to `psidev.info <http://www.psidev.info/index.php?q=node/257>`_
+for the detailed specification of the format and structure of mzML files.
+
+This module provides a minimalistic way to extract information from mzML
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`MzML` or :py:class:`PreIndexedMzML`)
+to iterate over entries in ``<spectrum>`` elements.
+:py:class:`MzML` and :py:class:`PreIndexedMzML` also support direct indexing
+with spectrum IDs.
+
+Data access
+-----------
+
+  :py:class:`MzML` - a class representing a single mzML file.
+  Other data access functions use this class internally.
+
+  :py:class:`PreIndexedMzML` - a class representing a single mzML file.
+  Uses byte offsets listed at the end of the file for quick access to spectrum elements.
+
+  :py:func:`read` - iterate through spectra in mzML file. Data from a
+  single spectrum are converted to a human-readable dict. Spectra themselves are
+  stored under 'm/z array' and 'intensity array' keys.
+
+  :py:func:`chain` - read multiple mzML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
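+Example
+-------
+
+A minimal sketch (``data.mzML`` is a hypothetical file name)::
+
+    from pyteomics import mzml
+    with mzml.read('data.mzML') as reader:
+        for spectrum in reader:
+            print(spectrum['id'], len(spectrum['m/z array']))
+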
+Controlled Vocabularies
+~~~~~~~~~~~~~~~~~~~~~~~
+mzML relies on controlled vocabularies to describe its contents extensibly. See
+`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_
+for more details on how they are used.
+
+Handling Time Units and Other Qualified Quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+mzML contains information which may be described as using a variety of different time units.
+See `Unit Handling <../data.html#unit-handling>`_ for more information.
+
+Deprecated functions
+--------------------
+
+  :py:func:`version_info` - get version information about the mzML file.
+  You can just read the corresponding attribute of the :py:class:`MzML` object.
+
+  :py:func:`iterfind` - iterate over elements in an mzML file.
+  You can just call the corresponding method of the :py:class:`MzML` object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml` and :py:mod:`numpy`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import re
+import warnings
+import numpy as np
+from . import xml, auxiliary as aux, _schema_defaults
+from .xml import etree
+
+NON_STANDARD_DATA_ARRAY = 'non-standard data array'
+
+STANDARD_ARRAYS = set([
+    'm/z array',
+    'intensity array',
+    'charge array',
+    'signal to noise array',
+    'time array',
+    'wavelength array',
+    'flow rate array',
+    'pressure array',
+    'temperature array',
+    'mean charge array',
+    'resolution array',
+    'baseline array',
+    'noise array',
+    'sampled noise m/z array',
+    'sampled noise intensity array',
+    'sampled noise baseline array',
+    'ion mobility array',
+    'deconvoluted ion mobility drift time array',
+    'deconvoluted inverse reduced ion mobility array',
+    'deconvoluted ion mobility array',
+    'raw ion mobility drift time array',
+    'raw inverse reduced ion mobility array',
+    'raw ion mobility array',
+    'mean inverse reduced ion mobility array',
+    'mean ion mobility array',
+    'mean ion mobility drift time array',
+    'mass array',
+    'scanning quadrupole position lower bound m/z array',
+    'scanning quadrupole position upper bound m/z array',
+])
+
+
+class MzML(aux.BinaryArrayConversionMixin, aux.TimeOrderedIndexedReaderMixin, xml.MultiProcessingXML, xml.IndexSavingXML):
+    """Parser class for mzML files."""
+    file_format = 'mzML'
+    _root_element = 'mzML'
+    _default_schema = _schema_defaults._mzml_schema_defaults
+    _default_version = '1.1.0'
+    _default_iter_tag = 'spectrum'
+    _structures_to_flatten = {'binaryDataArrayList', 'referenceableParamGroupRef'}
+    _indexed_tags = {'spectrum', 'chromatogram'}
+
+    def __init__(self, *args, **kwargs):
+        self.decode_binary = kwargs.pop('decode_binary', True)
+        self._referenceable_param_groups = {}
+        super(MzML, self).__init__(*args, **kwargs)
+
+    def __getstate__(self):
+        state = super(MzML, self).__getstate__()
+        state['decode_binary'] = self.decode_binary
+        return state
+
+    def __setstate__(self, state):
+        super(MzML, self).__setstate__(state)
+        self.decode_binary = state['decode_binary']
+
+    def _handle_referenceable_param_group(self, param_group_ref, **kwargs):
+        ref_name = param_group_ref.attrib['ref']
+        if ref_name not in self._referenceable_param_groups:
+            params = self._referenceable_param_groups[ref_name] = self._retrieve_param_group(ref_name)
+            return params
+        return self._referenceable_param_groups[ref_name]
+
+    @xml._keepstate
+    def _retrieve_param_group(self, ref_name):
+        group = self.get_by_id(ref_name)
+        group.pop("id", None)
+        return [xml._XMLParam(k, v, None) for k, v in group.items()]
+
+    def _detect_array_name(self, info):
+        """Determine what the appropriate name for this
+        array is by inspecting the available param-based
+        keys.
+
+        Parameters
+        ----------
+        info : dict
+            The collapsed binary tag plus
+            associated *Param data
+
+        Returns
+        -------
+        out : str
+            The name for this array entry
+        """
+        # If this is a non-standard array, we hope the userParams
+        # will conform to the same array suffix pattern.
+        is_non_standard = False
+
+        # Accumulate possible name candidates
+        candidates = []
+        for k in info:
+            if k.endswith(' array') and not info[k]:
+                if NON_STANDARD_DATA_ARRAY == k:
+                    is_non_standard = True
+                else:
+                    candidates.append(k)
+        # A non-standard data array term key might have the name for the data array
+        # as the value.
+        nonstandard_name = info.get(NON_STANDARD_DATA_ARRAY)
+        if nonstandard_name:
+            return nonstandard_name
+        if isinstance(info.get('name'), list):
+            for val in info['name']:
+                if val.endswith(' array'):
+                    if NON_STANDARD_DATA_ARRAY == val:
+                        is_non_standard = True
+                    else:
+                        candidates.append(val)
+        # Name candidate resolution
+        n_candidates = len(candidates)
+        # Easy case, exactly one name given
+        if n_candidates == 1:
+            return candidates[0]
+        # We are missing information, but at least
+        # if we know the array is non-standard we
+        # can report it as such. Otherwise fall back
+        # to "binary". This fallback signals special
+        # behavior elsewhere.
+        if n_candidates == 0:
+            invalid = {"encodedLength", "dataProcessingRef", "arrayLength",
+                       "binary"}
+            for k in info:
+                if k in invalid:
+                    continue
+                candidates.append(k)
+            if len(candidates) == 0:
+                if is_non_standard:
+                    return NON_STANDARD_DATA_ARRAY
+                warnings.warn("No candidates found for naming binary data array; falling back to 'binary'")
+                return "binary"
+            if len(candidates) == 1:
+                return candidates[0]
+            warnings.warn(
+                "Multiple options for naming binary array after no valid name found: %r" % candidates)
+            return max(candidates, key=len)
+        # Multiple choices mean we need to make a decision which could
+        # mask data from the user. This should never happen, but stay safe:
+        # prefer the standardized arrays before falling back to guessing.
+        else:
+            candidates = set(candidates)
+            # Maybe we just have a repeated term?
+            if len(candidates) == 1:
+                return next(iter(candidates))
+            warnings.warn(
+                "Multiple options for naming binary array: %r" % candidates)
+            standard_options = candidates & STANDARD_ARRAYS
+            if standard_options:
+                return max(standard_options, key=len)
+            return max(candidates, key=len)
+
+    def _determine_array_dtype(self, info):
+        dtype = None
+        types = {'32-bit float': np.float32, '64-bit float': np.float64,
+                 '32-bit integer': np.int32, '64-bit integer': np.int64,
+                 'null-terminated ASCII string': np.uint8}
+        for t, code in types.items():
+            if t in info:
+                dtype = code
+                del info[t]
+                break
+        else:
+            # sometimes the type term is filed under 'name'
+            if 'name' in info:
+                for t, code in types.items():
+                    if t in info['name']:
+                        dtype = code
+                        info['name'].remove(t)
+                        break
+        return dtype
+
+    def _determine_compression(self, info):
+        known_compression_types = set(self.compression_type_map)
+        found_compression_types = known_compression_types & set(info)
+        if found_compression_types:
+            found_compression_types = tuple(found_compression_types)
+            if len(found_compression_types) == 1:
+                del info[found_compression_types[0]]
+                return found_compression_types[0]
+            warnings.warn("Multiple options for binary array compression: %r" % (
+                found_compression_types,))
+            return found_compression_types[0]
+        if "name" in info:
+            found_compression_types = known_compression_types & set(info['name'])
+            if found_compression_types:
+                found_compression_types = tuple(found_compression_types)
+                if len(found_compression_types) == 1:
+                    # 'name' holds a list here, so remove the term by value
+                    info['name'].remove(found_compression_types[0])
+                    return found_compression_types[0]
+                warnings.warn("Multiple options for binary array compression: %r" % (
+                    found_compression_types,))
+                return found_compression_types[0]
+        return 'no compression'
+
+    def _handle_binary(self, info, **kwargs):
+        """Special handling when processing and flattening
+        a <binary> tag and its sibling *Param tags.
+
+        Parameters
+        ----------
+        info : dict
+            Unprocessed binary array data and metadata
+
+        Returns
+        -------
+        out : dict
+            The processed and flattened data array and metadata
+        """
+        dtype = self._determine_array_dtype(info)
+        compressed = self._determine_compression(info)
+        name = self._detect_array_name(info)
+        binary = info.pop('binary')
+        if not self.decode_binary:
+            info[name] = self._make_record(binary, compressed, dtype, name)
+            return info
+
+        if binary:
+            array = self.decode_data_array(binary, compressed, dtype)
+        else:
+            array = np.array([], dtype=dtype)
+
+        if name == 'binary':
+            info[name] = self._convert_array(None, array)
+        else:
+            info = {name: self._convert_array(name, array)}
+        return info
+
+    def _get_info_smart(self, element, **kw):
+        name = xml._local_name(element)
+        kwargs = dict(kw)
+        rec = kwargs.pop('recursive', None)
+        if name in {'indexedmzML', 'mzML'}:
+            info = self._get_info(element,
+                    recursive=(rec if rec is not None else False),
+                    **kwargs)
+        else:
+            info = self._get_info(element,
+                    recursive=(rec if rec is not None else True),
+                    **kwargs)
+        if 'binary' in info and isinstance(info, dict):
+            info = self._handle_binary(info, **kwargs)
+
+        if 'binaryDataArray' in info and isinstance(info, dict):
+            for array in info.pop('binaryDataArray'):
+                info.update(array)
+        intkeys = {'ms level'}
+        for k in intkeys:
+            if k in info:
+                try:
+                    info[k] = int(info[k])
+                except (ValueError, TypeError):
+                    pass
+        return info
+
+    def _retrieve_refs(self, info, **kwargs):
+        """Retrieves and embeds the data for each attribute in `info` that
+        ends in _ref. Removes the id attribute from `info`"""
+        for k, v in dict(info).items():
+            if k == 'ref':
+                by_id = self.get_by_id(v, retrieve_refs=True)
+                if by_id is None:
+                    warnings.warn('Ignoring unresolved reference: ' + v)
+                else:
+                    info.update(by_id)
+                    del info[k]
+                    info.pop('id', None)
+
+    @staticmethod
+    def _get_time(scan):
+        return scan['scanList']['scan'][0]['scan start time']
+
+
+def read(source, read_schema=False, iterative=True, use_index=False, dtype=None, huge_tree=False, decode_binary=True):
+    """Parse `source` and iterate through spectra.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you do not want to see the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        spectrum elements. Default is :py:const:`False`.
+
+    dtype : type or dict, optional
+        dtype to convert arrays to, one for both m/z and intensity arrays or one for each key.
+        If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'.
+
+    decode_binary : bool, optional
+        Defines whether binary data should be decoded and included in the output
+        (under "m/z array", "intensity array", etc.).
+        Default is :py:const:`True`.
+
+    huge_tree : bool, optional
+        This option is passed to the `lxml` parser and defines whether
+        security checks for XML tree depth and node size should be disabled.
+        Default is :py:const:`False`.
+        Enable this option for trusted files to avoid XMLSyntaxError exceptions
+        (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`).
+
+    Returns
+    -------
+    out : iterator
+       An iterator over the dicts with spectrum properties.
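+
+    Examples
+    --------
+    A sketch of deferred decoding (the path is a placeholder): with
+    ``decode_binary=False``, array values are records that can be decoded on demand.
+
+    >>> with read('tests/test.mzML', decode_binary=False) as reader:
+    ...     spectrum = next(reader)
+    ...     mz = spectrum['m/z array'].decode()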
+    """
+
+    return MzML(source, read_schema=read_schema, iterative=iterative,
+                use_index=use_index, dtype=dtype, huge_tree=huge_tree,
+                decode_binary=decode_binary)
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create an :py:class:`MzML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition at the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzML header. Otherwise, use default
+        parameters. Not recommended without an Internet connection or
+        if you do not want to see the related warnings.
+
+    decode_binary : bool, optional
+        Defines whether binary data should be decoded and included in the output
+        (under "m/z array", "intensity array", etc.).
+        Default is :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return MzML(source, **kwargs).iterfind(path, **kwargs)
+
+version_info = xml._make_version_info(MzML)
+
+chain = aux.ChainBase._make_chain(MzML)
+
+
+class PreIndexedMzML(MzML):
+    """Parser class for mzML files, subclass of :py:class:`MzML`.
+    Uses byte offsets listed at the end of the file for quick access to spectrum elements.
+    """
+    def _build_index(self):
+        """
+        Build up a `dict` of `dict` of offsets for elements. Calls :meth:`_find_index_list`
+        and assigns the return value to :attr:`_offset_index`
+        """
+        index = self._find_index_list()
+        if index:
+            self._offset_index = index
+        else:
+            warnings.warn('Could not extract the embedded offset index. Falling back to default indexing procedure.')
+            super(PreIndexedMzML, self)._build_index()
+
+    @xml._keepstate
+    def _iterparse_index_list(self, offset):
+        index_map = xml.HierarchicalOffsetIndex()
+        index = index_map._inner_type()
+        self._source.seek(offset)
+        try:
+            for event, elem in etree.iterparse(self._source, events=('start', 'end'), remove_comments=True):
+                if event == 'start':
+                    if elem.tag == 'index':
+                        index = index_map._inner_type()
+                        index_map[elem.attrib['name']] = index
+                else:
+                    if elem.tag == 'offset':
+                        index[elem.attrib['idRef']] = int(elem.text)
+                    elem.clear()
+        except etree.XMLSyntaxError:
+            # The iteration has reached the end of the indexList tag and the parser
+            # encounters the later elements in the document.
+            pass
+        return index_map
+
+    @xml._keepstate
+    def _find_index_list_offset(self):
+        """
+        Search relative to the bottom of the file upwards to find the offsets
+        of the index lists.
+
+        Returns
+        -------
+        list of int
+            A list of byte offsets for `<indexList>` elements
+        """
+        self._source.seek(-1024, 2)
+        text = self._source.read(1024)
+        index_offsets = list(map(int, re.findall(br'<indexListOffset>(\d+)</indexListOffset>', text)))
+        return index_offsets
+
+    @xml._keepstate
+    def _find_index_list(self):
+        """
+        Extract lists of index offsets from the end of the file.
+
+        Returns
+        -------
+        dict of str -> dict of str -> int
+        """
+        offsets = self._find_index_list_offset()
+        index_list = xml.HierarchicalOffsetIndex()
+        for offset in offsets:
+            # Sometimes the offset is at the very beginning of the file,
+            # due to a bug in an older version of ProteoWizard. If this crude
+            # check fails, don't bother searching the entire file, and fall back
+            # on the base class's mechanisms.
+            #
+            # Alternative behavior here would be to start searching for the start
+            # of the index from the bottom of the file, but this version of ProteoWizard
+            # also emits invalid offsets which do not improve retrieval time.
+            if offset < 1024:
+                continue
+            index_list = self._iterparse_index_list(offset)
+        return index_list
diff --git a/pyteomics/mzmlb.py b/pyteomics/mzmlb.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cda3cacf5aaa1bfee7029981db91fd4849c923d
--- /dev/null
+++ b/pyteomics/mzmlb.py
@@ -0,0 +1,618 @@
+# -*- coding: utf8 -*-
+"""
+mzmlb - reader for mass spectrometry data in mzMLb format
+=========================================================
+
+.. warning::
+    This is a **Provisional Implementation**. The mzMLb format has been published
+    but is not yet broadly available.
+
+Summary
+-------
+mzMLb is an HDF5 container format wrapping the standard rich XML format
+for raw mass spectrometry data storage. Please refer to [1]_ for more information
+about mzMLb and its features. Please refer to
+`psidev.info <https://www.psidev.info/mzML>`_ for the detailed
+specification of the format and structure of mzML files.
+
+This module provides a minimalistic way to extract information from mzMLb
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`MzMLb`) to iterate over entries in ``<spectrum>`` elements.
+:py:class:`MzMLb` also supports direct indexing with spectrum IDs or indices.
+
+Data access
+-----------
+
+  :py:class:`MzMLb` - a class representing a single mzMLb file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through spectra in mzMLb file. Data from a
+  single spectrum are converted to a human-readable dict. Spectra themselves are
+  stored under 'm/z array' and 'intensity array' keys.
+
+  :py:func:`chain` - read multiple mzMLb files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
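+A minimal usage sketch (the file path below is a stand-in for a real mzMLb file):
+
+.. code-block:: python
+
+    from pyteomics import mzmlb
+
+    with mzmlb.MzMLb('tests/test.mzMLb') as reader:
+        spectrum = next(reader)
+        print(spectrum['id'], spectrum['m/z array'][:5])
+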
+Controlled Vocabularies
+~~~~~~~~~~~~~~~~~~~~~~~
+mzMLb relies on controlled vocabularies to describe its contents extensibly. See
+`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_
+for more details on how they are used.
+
+Handling Time Units and Other Qualified Quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+mzMLb records quantities, such as scan start times, in a variety of different time units.
+See `Unit Handling <../data.html#unit-handling>`_ for more information.
+
+References
+----------
+.. [1] Bhamber, R. S., Jankevics, A., Deutsch, E. W., Jones, A. R., & Dowsey, A. W. (2021).
+    MzMLb: A Future-Proof Raw Mass Spectrometry Data Format Based on Standards-Compliant
+    mzML and Optimized for Speed and Storage Requirements. Journal of Proteome Research,
+    20(1), 172–183. https://doi.org/10.1021/acs.jproteome.0c00192
+"""
+
+import io
+import warnings
+import logging
+from collections import namedtuple
+
+import h5py
+try:
+    logging.getLogger("hdf5plugin").addHandler(logging.NullHandler())
+    import hdf5plugin
+except ImportError:
+    hdf5plugin = None
+
+import numpy as np
+
+from pyteomics.mzml import MzML as _MzML
+from pyteomics.auxiliary.file_helpers import HierarchicalOffsetIndex, TaskMappingMixin, TimeOrderedIndexedReaderMixin, FileReader
+from pyteomics import auxiliary as aux, xml
+
+
+def delta_predict(data, copy=True):
+    '''Reverse the lossy transformation of the delta compression
+    helper.
+
+    Parameters
+    ----------
+    data : :class:`numpy.ndarray`
+        The data to transform
+    copy : bool
+        Whether to make a copy of the data array or transform it in-place.
+
+    Returns
+    -------
+    :class:`numpy.ndarray`
+        The transformed data array
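+
+    Examples
+    --------
+    A toy round-trip (values chosen arbitrarily): an encoder that stores
+    ``orig[i] - orig[i - 1] + orig[0]`` for ``i >= 2`` is undone by this function.
+
+    >>> import numpy as np
+    >>> delta_predict(np.array([1, 2, 3, 4]))
+    array([1, 2, 4, 7])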
+    '''
+    if copy:
+        out = data.copy()
+    else:
+        out = data
+    for i in range(2, len(data)):
+        out[i] = out[i] + out[i - 1] - out[0]
+    return out
+
+
+def linear_predict(data, copy=True):
+    '''Reverse the lossy transformation of the linear interpolation compression
+    helper.
+
+    Parameters
+    ----------
+    data : :class:`numpy.ndarray`
+        The data to transform
+    copy : bool
+        Whether to make a copy of the data array or transform it in-place.
+
+    Returns
+    -------
+    :class:`numpy.ndarray`
+        The transformed data array
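+
+    Examples
+    --------
+    A toy example (values chosen arbitrarily): constant stored values decode to
+    a straight line, because the stored values behave like offset second differences.
+
+    >>> import numpy as np
+    >>> linear_predict(np.array([0, 1, 1, 1]))
+    array([0, 1, 2, 3])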
+    '''
+    if copy:
+        out = data.copy()
+    else:
+        out = data
+    for i in range(2, len(data)):
+        out[i] = out[i] + 2 * out[i - 1] - out[i - 2] - out[1]
+    return out
+
+
+class HDF5ByteBuffer(io.RawIOBase):
+    '''Helper class that looks file-like so that we can pass an HDF5 byte dataset to
+    an arbitrary XML parser.
+
+    Implements :class:`~io.RawIOBase` for reading.
+    '''
+    def __init__(self, buffer, offset=None):
+        if offset is None:
+            offset = 0
+        self.buffer = buffer
+        self.offset = offset
+        self.size = self.buffer.size
+        self.mode = 'rb'
+
+    def readable(self):
+        return True
+
+    def seekable(self):
+        return True
+
+    def isatty(self):
+        return False
+
+    def seek(self, offset, whence=0):
+        if whence == io.SEEK_SET:
+            self.offset = offset
+        elif whence == io.SEEK_CUR:
+            self.offset += offset
+        elif whence == io.SEEK_END:
+            # offsets relative to the end are typically negative
+            self.offset = self.size + offset
+        else:
+            raise ValueError("Bad whence %r" % whence)
+        return self.offset
+
+    def tell(self):
+        return self.offset
+
+    def close(self):
+        return
+
+    @property
+    def closed(self):
+        return False
+
+    def readinto(self, b):
+        n = len(b)
+        temp = self._read(n)
+        m = len(temp)
+        b[:m] = temp[:]
+        return m
+
+    def readall(self):
+        return bytes(self._read(-1))
+
+    def read(self, n=-1):
+        return bytes(self._read(n))
+
+    def write(self, b):
+        raise ValueError("Read-only stream")
+
+    def _read(self, n=-1):
+        if n == -1:
+            n = self.size + 1
+        dat = bytearray(np.array(self.buffer[self.offset:self.offset + n]))
+        self.offset += n
+        return dat
+
+
+class external_array_slice(namedtuple('external_array_slice',
+                           ['array_name', 'offset', 'length', 'source', 'transform', 'key', 'dtype'])):
+    def decode(self):
+        """Decode :attr:`data` into a numerical array
+
+        Returns
+        -------
+        np.ndarray
+        """
+        return self.source._decode_record(self)
+
+
+class ExternalDataMzML(_MzML):
+    '''An MzML parser that reads data arrays from an external provider.
+
+    This is an implementation detail of :class:`MzMLb`.
+    '''
+    def __init__(self, *args, **kwargs):
+        self._external_data_registry = kwargs.pop("external_data_registry", None)
+        super(ExternalDataMzML, self).__init__(*args, **kwargs)
+
+    def _make_record(self, array_name, offset, length, transform, name, dtype):
+        return external_array_slice(array_name, offset, length, self, transform, name, dtype)
+
+    def _transform_array(self, array, transform):
+        if transform is None:
+            return array
+        elif "linear prediction" == transform:
+            return linear_predict(array, copy=False)
+        elif "delta prediction" == transform:
+            return delta_predict(array, copy=False)
+        else:
+            raise ValueError("Transformation not recognized")
+
+    def _retrieve_external_array(self, array_name, length, offset):
+        array = self._external_data_registry.get(array_name, length, offset)
+        return array
+
+    def decode_data_array(self, array_name, offset, length, transform=None, dtype=np.float64):
+        array = self._retrieve_external_array(array_name, length, offset)
+        array = self._transform_array(array, transform)
+        return array
+
+    def _decode_record(self, record):
+        array = self.decode_data_array(
+            record.array_name, record.offset, record.length, record.transform, record.dtype)
+        return self._finalize_record_conversion(array, record)
+
+    def _handle_binary(self, info, **kwargs):
+        if not self.decode_binary:
+            self.decode_binary = True
+            # Binary decoding works totally differently here, not supporting the previous signatures
+            # that the parent method will use. Pretend we are decoding because it is a no-op in the
+            # parent method.
+            result = super(ExternalDataMzML, self)._handle_binary(info, **kwargs)
+            self.decode_binary = False
+        else:
+            result = super(ExternalDataMzML, self)._handle_binary(info, **kwargs)
+        try:
+            array_name = info['external HDF5 dataset']
+        except KeyError:
+            array_name = info['external dataset']
+        offset = int(info['external offset'])
+        length = int(info['external array length'])
+
+        transform = None
+        # The zlib compression in these two terms happens automatically during HDF5 encoding and
+        # the reader needn't even know about it. Need an example of how Numpress will be signaled.
+        if "linear prediction" in info or "truncation, linear prediction and zlib compression" in info:
+            transform = 'linear prediction'
+        elif "delta prediction" in info or "truncation, delta prediction and zlib compression" in info:
+            transform = 'delta prediction'
+
+        if not self.decode_binary:
+            name = self._detect_array_name(info)
+            result[name] = self._make_record(
+                array_name, offset, length, transform, name,
+                self._external_data_registry.dtype_of(array_name))
+            return result
+
+        array = self._retrieve_external_array(array_name, length, offset)
+
+        if len(result) == 1:
+            name = next(iter(result))
+        else:
+            name = self._detect_array_name(info)
+        result[name] = self._convert_array(name, array)
+        return result
+
+    def reset(self):
+        super(ExternalDataMzML, self).reset()
+        self._external_data_registry.clear()
+
+
+class chunk_interval_cache_record(namedtuple("chunk_interval_cache_record", ("start", "end", "array"))):
+    def contains(self, start, end):
+        if self.start <= start:
+            if end < self.end:
+                return True
+        return False
+
+    def get(self, start, end):
+        return self.array[start - self.start:end - self.start]
+
+    def __eq__(self, other):
+        return self.start == other.start and self.end == other.end
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash(self.start)
+
+
+class ExternalArrayRegistry(object):
+    '''Read chunks out of a single long array
+
+    This is an implementation detail of :class:`MzMLb`
+
+    Attributes
+    ----------
+    registry : Mapping
+        A mapping from array name to the out-of-core array object.
+    chunk_size : int
+        The number of entries to chunk together and keep in memory.
+    chunk_cache : dict
+        A mapping from array name to cached array blocks.
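+
+    A toy example, with a plain mapping of arrays standing in for an HDF5 file:
+
+    >>> import numpy as np
+    >>> registry = ExternalArrayRegistry({'xs': np.arange(10)})
+    >>> registry.get('xs', 3, offset=2)
+    array([2, 3, 4])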
+    '''
+    def __init__(self, registry, chunk_size=None):
+        if chunk_size is None:
+            chunk_size = 2 ** 20
+        else:
+            chunk_size = int(chunk_size)
+        self.registry = registry
+        self.chunk_cache = {}
+        self.chunk_size = chunk_size
+
+    def clear(self):
+        self.chunk_cache.clear()
+
+    def _get_raw(self, array_name, start, end):
+        return self.registry[array_name][start:end]
+
+    def _make_cache_record(self, array_name, start, end):
+        return chunk_interval_cache_record(start, end, self._get_raw(array_name, start, end))
+
+    def get(self, array_name, length, offset=0):
+        start = offset
+        end = start + length
+        try:
+            cache_record = self.chunk_cache[array_name]
+            if cache_record.contains(start, end):
+                return cache_record.get(start, end)
+            else:
+                cache_record = self._make_cache_record(
+                    array_name, start, start + max(length, self.chunk_size))
+            self.chunk_cache[array_name] = cache_record
+            return cache_record.get(start, end)
+        except KeyError:
+            cache_record = self._make_cache_record(
+                array_name, start, start + max(length, self.chunk_size))
+            self.chunk_cache[array_name] = cache_record
+            return cache_record.get(start, end)
+
+    def dtype_of(self, array_name):
+        return self.registry[array_name].dtype
+
+    def __call__(self, array_name, length, offset=0):
+        return self.get(array_name, length, offset)
+
+
+class MzMLb(TimeOrderedIndexedReaderMixin, TaskMappingMixin):
+    '''A parser for mzMLb [1]_.
+
+    Provides an identical interface to :class:`~pyteomics.mzml.MzML`.
+
+    Attributes
+    ----------
+    path : str, Path-like, or file-like object
+        The mzMLb file path or a file-like object providing it.
+    handle : :class:`h5py.File`
+        The raw HDF5 file container.
+    mzml_parser : :class:`~.ExternalDataMzML`
+        The mzML parser for the XML stream inside the HDF5 file with
+        special behavior for retrieving the out-of-band data arrays
+        from their respective storage locations.
+    schema_version : str
+        The mzMLb HDF5 schema version, distinct from the mzML schema inside it.
+
+
+    References
+    ----------
+    .. [1] Bhamber, R. S., Jankevics, A., Deutsch, E. W., Jones, A. R., & Dowsey, A. W. (2021).
+        MzMLb: A Future-Proof Raw Mass Spectrometry Data Format Based on Standards-Compliant
+        mzML and Optimized for Speed and Storage Requirements. Journal of Proteome Research,
+        20(1), 172–183. https://doi.org/10.1021/acs.jproteome.0c00192
+    '''
+    _default_iter_tag = ExternalDataMzML._default_iter_tag
+
+    file_format = "mzMLb"
+
+    def __init__(self, path, hdfargs=None, mzmlargs=None, allow_updates=False,
+                 use_index=True, **kwargs):
+        if hdfargs is None:
+            hdfargs = {}
+        if mzmlargs is None:
+            mzmlargs = {}
+        mzmlargs.update(kwargs)
+
+        self.path = path
+        self._hdfargs = hdfargs
+        self._mzmlargs = mzmlargs
+        self._allow_updates = allow_updates
+        self.handle = h5py.File(self.path, 'r+' if self._allow_updates else 'r', **hdfargs)
+        self.schema_version = self.handle['mzML'].attrs.get('version')
+        self._check_compressor()
+
+        self._xml_buffer = io.BufferedReader(HDF5ByteBuffer(self.handle['mzML']))
+        self._array_registry = ExternalArrayRegistry(self.handle)
+        self._make_mzml_parser(mzmlargs)
+
+        super(MzMLb, self).__init__(**kwargs)
+
+    def _check_compressor(self):
+        for key in self.handle.keys():
+            if "spectrum_MS_" in key or "chromatogram_MS_":
+                data = self.handle[key]
+                try:
+                    filts = data._filters
+                except AttributeError:
+                    continue
+                if '32001' in filts:
+                    if hdf5plugin is None:
+                        warnings.warn(
+                            ("Blosc meta-compressor detected, but hdf5plugin is "
+                             "not installed, may not be able to access %r") % (key))
+
+    def _make_mzml_parser(self, kwargs):
+        self._mzml_parser = ExternalDataMzML(
+            self._xml_buffer, external_data_registry=self._array_registry,
+            use_index=False, **kwargs)
+        self._mzml_parser._offset_index = self._build_index()
+        self._mzml_parser._use_index = True
+
+    @property
+    def name(self):
+        if hasattr(self.path, 'name'):
+            return self.path.name
+        return self.path
+
+    def _build_index(self):
+        index = HierarchicalOffsetIndex()
+        for label in [u'spectrum', u'chromatogram']:
+            sub = index[label]
+            ids = bytearray(np.array(self.handle['mzML_{}Index_idRef'.format(label)])).split(b"\x00")
+            offsets = self.handle["mzML_{}Index".format(label)][:-1]
+            for i, o in enumerate(offsets):
+                sub[ids[i].decode('utf8')] = o
+        return index
+
+    def get_by_id(self, id):
+        """Parse the file and return the element with `id` attribute equal
+        to `elem_id`. Returns :py:const:`None` if no such element is found.
+
+        Parameters
+        ----------
+        elem_id : str
+            The value of the `id` attribute to match.
+
+        Returns
+        -------
+        out : :py:class:`dict` or :py:const:`None`
+        """
+        return self._mzml_parser.get_by_id(id)
+
+    def get_by_ids(self, ids):
+        return self._mzml_parser.get_by_ids(ids)
+
+    def get_by_index(self, i):
+        return self._mzml_parser.get_by_index(i)
+
+    def get_by_indexes(self, indexes):
+        return self._mzml_parser.get_by_indexes(indexes)
+
+    def get_by_index_slice(self, s):
+        return self._mzml_parser.get_by_index_slice(s)
+
+    def get_by_key_slice(self, s):
+        return self._mzml_parser.get_by_key_slice(s)
+
+    def __contains__(self, key):
+        return key in self.index
+
+    def __getitem__(self, i):
+        return self._mzml_parser[i]
+
+    def __len__(self):
+        return len(self._mzml_parser)
+
+    def __iter__(self):
+        return iter(self._mzml_parser)
+
+    def __next__(self):
+        return next(self._mzml_parser)
+
+    def next(self):
+        return self.__next__()
+
+    def __reduce__(self):
+        return self.__class__, (self.path, self._hdfargs, self._mzmlargs, self._allow_updates)
+
+    def close(self):
+        self.handle.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+
+    def iterfind(self, *args, **kwargs):
+        iterf = self._mzml_parser.iterfind(*args, **kwargs)
+        iterf.parser = self
+        return iterf
+
+    def _iterfind_impl(self, path, *args, **kwargs):
+        return self._mzml_parser._iterfind_impl(path, *args, **kwargs)
+
+    @property
+    def index(self):
+        return self._mzml_parser.index
+
+    @property
+    def _offset_index(self):
+        return self._mzml_parser._offset_index
+
+    @property
+    def default_index(self):
+        return self._mzml_parser.default_index
+
+    def _get_time(self, scan):
+        return self._mzml_parser._get_time(scan)
+
+    @property
+    def mzml_parser(self):
+        return self._mzml_parser
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iteratable` to use when dealing work items onto the input IPC
+        queue used by :meth:`map`
+
+        Returns
+        -------
+        :class:`Iteratable`
+        """
+        return iter(self.index[self._default_iter_tag])
+
+    def read(self, n=-1):
+        return self._mzml_parser.read(n)
+
+    def reset(self):
+        self._mzml_parser.reset()
+
+    def seek(self, offset, whence=0):
+        self._mzml_parser.seek(offset, whence)
+
+    def tell(self):
+        return self._mzml_parser.tell()
+
+    def get_dataset(self, name):
+        '''Get an HDF5 dataset by its name or path relative to
+        the root node.
+
+        .. warning::
+            Because this accesses HDF5 data directly, it may be possible to mutate
+            the underlying file if :attr:`allow_updates` is :const:`True`.
+
+        Parameters
+        ----------
+        name : :class:`str`
+            The dataset name or path.
+
+        Returns
+        -------
+        :class:`h5py.Dataset` or :class:`h5py.Group`
+
+        Raises
+        ------
+        KeyError :
+            The name is not found.
+        '''
+        return self.handle[name]
+
+
+def read(source, dtype=None):
+    """Parse `source` and iterate through spectra.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzMLb file or the file object itself.
+    dtype : type or dict, optional
+        dtype to convert arrays to, one for both m/z and intensity arrays or one for each key.
+        If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'.
+
+    Returns
+    -------
+    out : iterator
+       An iterator over the dicts with spectrum properties.
+    """
+    return MzMLb(source, dtype=dtype)
+
+
+# The MzMLb class is detached from the normal :class:`FileReader`-based inheritance tree;
+# this grafts it back on for :func:`isinstance` and :func:`issubclass` tests at least.
+FileReader.register(MzMLb)
+
+
+version_info = xml._make_version_info(MzMLb)
+
+chain = aux.ChainBase._make_chain(MzMLb)
diff --git a/pyteomics/mztab.py b/pyteomics/mztab.py
new file mode 100644
index 0000000000000000000000000000000000000000..148d4cfc8be1cd49b988fd6343863ee557debf26
--- /dev/null
+++ b/pyteomics/mztab.py
@@ -0,0 +1,783 @@
+"""
+mztab - mzTab file reader
+=========================
+
+Summary
+-------
+
+`mzTab <https://github.com/HUPO-PSI/mzTab>`_ is one of the standards
+developed by the Proteomics Informatics working group of the HUPO Proteomics
+Standards Initiative.
+
+This module provides a way to read mzTab files into a collection of
+:py:class:`pandas.DataFrame` instances in memory, along with a mapping
+of the file-level metadata. mzTab specification versions 1.0 and 2.0 are supported.
+
+Data access
+-----------
+
+  :py:class:`MzTab` - a class representing a single mzTab file.
+
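+A minimal usage sketch (the file path below is a stand-in for a real mzTab file):
+
+.. code-block:: python
+
+    from pyteomics import mztab
+
+    tables = mztab.MzTab('tests/test.mztab')
+    print(tables.version)
+    psms = tables.spectrum_match_table  # a pandas.DataFrame by default
+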
+Helpers
+-------
+
+    :py:class:`Group` - a collection of metadata relating to one entity.
+
+
+Internals
+---------
+
+    :py:class:`_MzTabTable` - a single table in an mzTab file.
+
+
+Property Management
+~~~~~~~~~~~~~~~~~~~
+
+:mod:`mztab` uses metaprogramming to generate its metadata accessors, built by
+these classes working in concert.
+
+    :py:class:`MetadataBackedProperty`
+
+    :py:class:`MetadataBackedCollection`
+
+    :py:class:`MetadataPropertyAnnotator`
+
+-------------------------------------------------------------------------------
+"""
+
+import re
+import warnings
+
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
+
+
+from collections import OrderedDict
+
+from pyteomics.auxiliary import _file_obj
+from pyteomics.auxiliary import cvstr
+from pyteomics.auxiliary.utils import add_metaclass
+
+
+def _require_pandas():
+    if pd is None:
+        raise ImportError(
+            "To load an mzTab file into pandas.DataFrame objects, you must install pandas!")
+
+
+class MetadataBackedProperty(object):
+    '''Our descriptor type which uses the instance's metadata attribute to carry its values'''
+
+    def __init__(self, name, variant_required=None):
+        if variant_required is None:
+            variant_required = ()
+        self.name = name
+        self.variant_required = variant_required
+        self.__doc__ = self.build_docstring()
+
+    def __repr__(self):
+        return "{self.__class__.__name__}(name={self.name!r}, variant_required={self.variant_required})".format(self=self)
+
+    def __get__(self, obj, objtype=None):
+        if obj is None and objtype is not None:
+            # So the property can be seen for what it is
+            return self
+        value = obj.metadata.get(self.name)
+        if value is None and self.variant_required and obj.variant in self.variant_required:
+            raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format(
+                self.name, obj.variant))
+        return value
+
+    def __set__(self, obj, value):
+        obj.metadata[self.name] = value
+
+    def __delete__(self, obj):
+        del obj.metadata[self.name]
+
+    def build_docstring(self):
+        doc = '''Accesses the {self.name!r} key in the :attr:`metadata` mapping attached
+to this object.
+'''
+        if self.variant_required:
+            if len(self.variant_required) > 1:
+                plural = 's'
+            else:
+                plural = ''
+            requires = ' or '.join(['-%s' % v for v in self.variant_required])
+            doc += '''
+This key must be present when the file is of {requires} variant{plural}.
+        '''.format(requires=requires, plural=plural)
+        doc += '''
+Returns
+-------
+object
+        '''
+        doc = doc.format(self=self)
+        return doc
+
+
+class MetadataBackedCollection(object):
+    def __init__(self, name, variant_required=None):
+        if variant_required is None:
+            variant_required = ()
+        self.name = name
+        self.variant_required = variant_required
+        self.__doc__ = self.build_docstring()
+
+    def __get__(self, obj, objtype=None):
+        if obj is None and objtype is not None:
+            # So the property can be seen for what it is
+            return self
+        groups = obj.gather(obj.metadata)
+        value = groups.get(self.name)
+        if value is None and self.variant_required and obj.variant in self.variant_required:
+            raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format(
+                self.name, obj.variant))
+        return value
+
+    def build_docstring(self):
+        doc = '''Accesses the {self.name!r} key group gathered in the :attr:`metadata` mapping attached
+to this object.
+
+This group is dynamically generated on each access and may be expensive for repeated use.
+'''
+        if self.variant_required:
+            if len(self.variant_required) > 1:
+                plural = 's'
+            else:
+                plural = ''
+            requires = ' or '.join(['-%s' % v for v in self.variant_required])
+            doc += '''
+This key must be present when the file is of {requires} variant{plural}.
+        '''.format(requires=requires, plural=plural)
+        doc += '''
+Returns
+-------
+:class:`~.Group`
+        '''
+        doc = doc.format(self=self)
+        return doc
+
+
+class MetadataPropertyAnnotator(type):
+    '''A simple metaclass to do some class-creation time introspection
+    and descriptor binding.
+
+    Uses a list of strings or 3-tuples from :attr:`__metadata_properties__` to
+    bind :class:`MetadataBackedProperty` or :class:`MetadataBackedCollection`
+    onto the class during its creation.
+
+    The specification for a property is a tuple of three values:
+        1. The metadata key to fetch
+        2. The property name to expose on the object
+        3. The variant(s) which require this metadata key be present
+
+    :obj:`("mzTab-version", "version", ("M", "P"))` would be interpreted as
+    Expose a property "version" on instances which serves the key "mzTab-version"
+    from the instance's :attr:`metadata`, and raise an error if it is absent in
+    the "M" or "P" variants.
+
+    Alternatively a specification may be a single string which will be interpreted
+    as the metadata key, and used to generate the property name replacing all '-'
+    with '_' and assumed to be optional in all variants.
+
+    If a metadata key ends with "[]" the property is assumed to be a collection. mzTab
+    makes heavy use of "<collection_name>[<index>]..." keys to define groups of homogeneous
+    object types, often with per-element attributes.
+
+    .. code-block::
+
+        variable_mod[1]    CHEMMOD:15.9949146221
+        variable_mod[1]-site  M
+        variable_mod[1]-position    Anywhere
+        variable_mod[2]    CHEMMOD:42.0105646863
+        variable_mod[2]-site  N-term
+        variable_mod[2]-position Protein N-term
+
+    A specification :obj:`("variable_mod[]", "variable_mods", ())` would create a property
+    that returns:
+
+    .. code-block:: python
+
+        >>> instance.variable_mods
+        Group([(1,
+                    {'name': 'CHEMMOD:15.9949146221',
+                     'position': 'Anywhere',
+                     'site': 'M'}),
+                (2,
+                    {'name': 'CHEMMOD:42.0105646863',
+                     'position': 'Protein N-term',
+                     'site': 'N-term'})])
+
+    For precise description of the property collection algorithm, see
+    :meth:`~_MzTabParserBase.collapse_properties` and
+    :meth:`~_MzTabParserBase.gather`.
+
+    If any base classes have a :attr:`__metadata_properties__` attribute, it will
+    also be included unless :attr:`__inherit_metadata_properties__` is set to
+    :const:`False`. Any names explicitly set by the current class override this
+    automatic property generation.
+    '''
+    def __new__(mcls, name, bases, attrs):
+        props = attrs.get('__metadata_properties__', [])
+        inherit_props = attrs.get("__inherit_metadata_properties__", True)
+        # Gather from parent classes so we can use inheritance for overriding this
+        # behavior too.
+        if inherit_props:
+            for base in bases:
+                props.extend(getattr(base, '__metadata_properties__', []))
+
+        keys = set(attrs)
+
+        # Iterate in reverse to ensure that classes nearer to the new classes override
+        # more basal classes, ending with the new class to make sure overrides are
+        # applied.
+        for prop in reversed(props):
+            # If the property definition is a single string, interpret the specification
+            # as the property name, and apply some simple normalization to make it a valid
+            # Python attribute name and assume the property is always optional.
+            if isinstance(prop, str):
+                prop_name = prop
+                attr_name = prop_name.replace("mzTab-", '').replace('-', '_')
+                variant_required = None
+            else:
+                # Otherwise unpack the triple
+                prop_name, attr_name, variant_required = prop
+            # Attach the new descriptor to the class definition to be created. These descriptors
+            # will then be used when instances of that class try to get/set those attribute names.
+            if attr_name in keys:
+                continue
+            if prop_name.endswith('[]'):
+                # If the property name ends with "[]", then we're dealing with a collection so
+                # use the :class:`MetadataBackedCollection` descriptor
+                attrs[attr_name] = MetadataBackedCollection(
+                    prop_name[:-2], variant_required=variant_required)
+            else:
+                # Otherwise it is a scalar-valued property, using the :class:`MetadataBackedProperty`
+                # descriptor
+                attrs[attr_name] = MetadataBackedProperty(
+                    prop_name, variant_required=variant_required)
+
+        return super(MetadataPropertyAnnotator, mcls).__new__(mcls, name, bases, attrs)
+
+
+class _MzTabParserBase(object):
+    def _parse_param(self, tuplet):
+        """Parse a controlled vocabulary or user specified parameter tuplet
+        into a Python object
+
+        Parameters
+        ----------
+        tuplet : str
+            A square brace enclosed tuplet of values describing the parameter
+
+        Returns
+        -------
+        tuple
+            The reduced representation of the parameter
+        """
+        cv, acc, name, value = re.split(r"\s*,\s*", tuplet[1:-1])
+        param_name = cvstr(name, acc)
+        if value:
+            return (param_name, value)
+        else:
+            return param_name
+
+    def collapse_properties(self, proplist):
+        '''Collapse a flat property list into a hierarchical structure.
+
+        This is intended to operate on :py:class:`Mapping` objects, including
+        :class:`dict`, :class:`pandas.Series` and :class:`pandas.DataFrame`.
+
+        .. code-block:: python
+
+            {
+              "ms_run[1]-format": "Andromeda:apl file format",
+              "ms_run[1]-location": "file://...",
+              "ms_run[1]-id_format": "scan number only nativeID format"
+            }
+
+        to
+
+        .. code-block:: python
+
+            {
+              "ms_run": [
+                {
+                  "format": "Andromeda:apl file format",
+                  "location": "file://...",
+                  "id_format": "scan number only nativeID format"
+                }
+              ]
+            }
+
+        Parameters
+        ----------
+        proplist: :class:`Mapping`
+            Key-Value pairs to collapse
+
+        Returns
+        -------
+        :class:`OrderedDict`:
+            The collapsed property list
+        '''
+        entities = OrderedDict()
+        rest = {}
+        for key, value in proplist.items():
+            try:
+                entity, prop_name = key.rsplit("-", 1)
+            except ValueError:
+                rest[key] = value
+                continue
+            try:
+                entity_dict = entities[entity]
+            except KeyError:
+                entity_dict = entities[entity] = {}
+            entity_dict[prop_name] = value
+        for key, value in proplist.items():
+            if key in entities:
+                entity = entities[key]
+                if 'name' not in entity:
+                    entity['name'] = value
+        for key, value in rest.items():
+            if key in entities:
+                entities[key]['name'] = value
+            else:
+                entities[key] = value
+        return entities
+
+    def _collapse_collections(self, entities):
+        gathered = Group()
+        for key, props in entities.items():
+            if '[' in key:
+                k, ix = key.split('[', 1)
+                if '[' in ix:
+                    # If we have multiple [ in a key, we are dealing with a path
+                    path = extract_path(key)
+                    for k, ix in path[:-1]:
+                        store = gathered[k]
+                        store = store[int(ix)]
+                    k, ix = path[-1]
+                    store[k][int(ix)] = props
+
+                else:
+                    ix = int(ix[:-1])
+                    gathered[k][ix] = props
+            else:
+                gathered[key] = props
+        return gathered
+
+    def _cast_value(self, value):
+        """Convert a cell value to the appropriate Python type
+
+        Parameters
+        ----------
+        value : str
+            The cell value as text
+
+        Returns
+        -------
+        object
+            The most specialized type recognized
+        """
+        if value == 'null':
+            return None
+        # is it a parameter?
+        if value.startswith("["):
+            try:
+                if "|" in value:
+                    return [self._cast_value(v) for v in value.split("|")]
+                else:
+                    return self._parse_param(value)
+            except ValueError:
+                return value
+        else:
+            # begin guessing dtype
+            try:
+                value = int(value)
+            except ValueError:
+                try:
+                    value = float(value)
+                except ValueError:
+                    pass
+            return value
+
+    def gather(self, mapping):
+        '''Collapse property lists using :meth:`collapse_properties`
+        and then gather collections of entities into lists.
+
+        Parameters
+        ----------
+        mapping : dict
+            The flattened hierarchy of properties to re-construct
+
+        Returns
+        -------
+        Group :
+            A :class:`Group` of all entities and collections of entities
+        '''
+        return self._collapse_collections(self.collapse_properties(mapping))
+
+
+class _MzTabTable(_MzTabParserBase):
+
+    """An internal class for accumulating information about an single table
+    represented in an mzTab file
+
+    Attributes
+    ----------
+    header : list
+        The column names for the table
+    name : str
+        The table's name, human readable
+    rows : list
+        An accumulator of table rows
+    """
+
+    def __init__(self, name, header=None, rows=None):
+        if rows is None:
+            rows = []
+        self.name = name
+        self.header = header
+        self.rows = rows
+
+    def __repr__(self):
+        n_cols = len(self.header) if self.header is not None else 0
+        n_rows = len(self.rows)
+        template = "<_MzTabTable {name} with {n_cols} columns and {n_rows} rows>"
+        return template.format(n_cols=n_cols, n_rows=n_rows, name=self.name)
+
+    def add(self, row):
+        self.rows.append([self._cast_value(v) for v in row])
+
+    def __len__(self):
+        return len(self.rows)
+
+    def __getitem__(self, i):
+        if isinstance(i, int):
+            return self.gather({h: r for h, r in zip(self.header, self.rows[i])})
+        elif isinstance(i, slice):
+            out = []
+            for j in range(i.start or 0, i.stop or len(self), i.step or 1):
+                out.append(self[j])
+            return out
+        raise TypeError("Cannot access table with object of type %r" % type(i))
+
+    def as_dict(self):
+        return {"rows": [dict(zip(self.header, row)) for row in self.rows],
+                "name": self.name}
+
+    def as_df(self, index=None):
+        """Convert the table to a DataFrame in memory.
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        _require_pandas()
+        table = pd.DataFrame(data=self.rows, columns=self.header)
+        if index is not None and len(table.index) > 0:
+            table = table.set_index(index, drop=False)
+        table.name = self.name
+        return table
+
+    def clear(self):
+        self.header = None
+        self.rows = []
+
+
+DATA_FRAME_FORMAT = 'df'
+DICT_FORMAT = 'dict'
+RAW_FORMAT = 'raw'
+
+PATH_PARSER = re.compile(r"([^\[]+)\[(\d+)\]_?")
+
+
+def extract_path(path):
+    '''Parse `key[index]_next_key[next_index]...` sequences into
+    lists of (key, index) pairs.
+
+    Parameters
+    ----------
+    path : str
+        The path key to parse
+
+    Returns
+    -------
+    list
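+
+    Examples
+    --------
+    A short example of the parsed output, mirroring the notation above:
+
+    >>> extract_path('key[1]_next_key[2]')
+    [('key', 1), ('next_key', 2)]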
+    '''
+    return [(t, int(i)) for t, i in PATH_PARSER.findall(path)]
+
+
+class Group(OrderedDict):
+    '''A type for holding collections of arbitrarily nested keys from rows
+    and metadata mappings.
+
+    Implemented as an autovivifying :class:`OrderedDict` variant. As such implements
+    the :class:`~collections.abc.Mapping` interface.
+    '''
+
+    def get_path(self, path, default=None):
+        '''As :meth:`get` but over a path key parsed with :func:`extract_path`.
+
+        Parameters
+        ----------
+        path : str
+            The path to search down
+        default : object, optional
+            The return value when the path is missing
+
+        Returns
+        -------
+        object
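+
+        Examples
+        --------
+        >>> g = Group()
+        >>> g['ms_run'][1] = {'location': 'file://...'}
+        >>> g.get_path('ms_run[1]')
+        {'location': 'file://...'}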
+        '''
+        tokens = extract_path(path)
+        if not tokens:
+            return self.get(path, default)
+        layer = self
+        for k, i in tokens[:-1]:
+            i = int(i)
+            layer = layer.get(k)
+            if layer is None:
+                return None
+            layer = layer.get(i)
+            if layer is None:
+                return None
+        k, i = tokens[-1]
+        i = int(i)
+        layer = layer.get(k)
+        if layer is None:
+            return default
+        value = layer.get(i, default)
+        return value
+
+    def __missing__(self, key):
+        value = self.__class__()
+        self[key] = value
+        return value
+
+
+@add_metaclass(MetadataPropertyAnnotator)
+class MzTab(_MzTabParserBase):
+    """Parser for mzTab format files.
+
+    Attributes
+    ----------
+    comments : list
+        A list of comments across the file
+    file : _file_obj
+        A file stream wrapper for the file to be read
+    metadata : OrderedDict
+        A mapping of the metadata entries describing the file.
+    peptide_table : _MzTabTable or pd.DataFrame
+        The table of peptides. Not commonly used.
+    protein_table : _MzTabTable or pd.DataFrame
+        The table of protein identifications.
+    small_molecule_table : _MzTabTable or pd.DataFrame
+        The table of small molecule identifications.
+    spectrum_match_table : _MzTabTable or pd.DataFrame
+        The table of spectrum-to-peptide match identifications.
+    table_format : 'df', 'dict', or callable
+        The structure type to replace each table with. The string
+        'df' will use pd.DataFrame instances. 'dict' will create
+        a dictionary of dictionaries for each table. A callable
+        will be called on each raw _MzTabTable object.
+
+    Additional components of :attr:`metadata` are exposed as properties, returning
+    single values or aggregated collections of objects.
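+
+    Examples
+    --------
+    A hypothetical usage sketch; ``example.mztab`` is a placeholder path and
+    the shown attribute values depend on the file:
+
+    >>> tables = MzTab('example.mztab', table_format='dict')  # doctest: +SKIP
+    >>> tables.version  # doctest: +SKIP
+    '1.0.0'
+    >>> psms = tables['PSM']  # doctest: +SKIP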
+    """
+
+    __metadata_properties__ = [
+        ('mzTab-version', 'version', ()),
+        ('mzTab-mode', 'mode', 'P'),
+        ('mzTab-type', 'type', 'P'),
+        ('mzTab-ID', 'id', 'M'),
+        'title',
+        'description',
+        ('ms_run[]', 'ms_runs', 'MP'),
+        ('instrument[]', 'instruments', ()),
+        ('software[]', 'software', ()),
+        ('publication[]', 'publications', ()),
+        ('contact[]', 'contacts', ()),
+        ('uri[]', 'uris', ()),
+        ('external_study_uri[]', 'external_study_uris', ()),
+        ('quantification_method', 'quantification_method', 'M'),
+        ('sample[]', 'samples', ()),
+        ('assay[]', 'assays', ()),
+        ('study_variable[]', 'study_variables', 'M'),
+        ('custom[]', 'custom', ()),
+        ('cv[]', 'cvs', 'M'),
+        ('database[]', 'databases', 'M'),
+
+        ('psm_search_engine_score[]', 'psm_search_engine_scores', ()),
+        ('protein_search_engine_score[]', 'protein_search_engine_scores', ()),
+        ('fixed_mod[]', 'fixed_mods', 'P'),
+        ('variable_mod[]', 'variable_mods', 'P'),
+        'colunit_protein',
+        'colunit_peptide',
+        'colunit_psm',
+        'colunit_small_molecule',
+        'false_discovery_rate',
+
+        ('derivatization_agent[]', 'derivatization_agents', ()),
+        ('small_molecule-quantification_unit',
+         'small_molecule_quantification_unit', 'M'),
+        ('small_molecule_feature-quantification_unit', 'small_molecule_feature_quantification_unit', 'M'),
+        ('small_molecule-identification_reliability',
+         'small_molecule_identification_reliability', ()),
+        ('id_confidence_measure[]', 'id_confidence_measures', 'M'),
+        ('colunit-small_molecule', 'colunit_small_molecule', ()),
+        ('colunit-small_molecule_feature', 'colunit_small_molecule_feature', ()),
+        ('colunit-small_molecule_evidence', 'colunit_small_molecule_evidence', ()),
+
+        ('sample_processing[]', 'sample_processing', ())
+    ]
+
+    def __init__(self, path, encoding='utf8', table_format=DATA_FRAME_FORMAT):
+        if table_format == DATA_FRAME_FORMAT:
+            _require_pandas()
+        # Must be defined in order for metadata properties to work
+        self.variant = None
+        self.file = _file_obj(path, mode='r', encoding=encoding)
+        self.metadata = OrderedDict()
+        self.comments = []
+        self._table_format = table_format
+        self._init_tables()
+        self._parse()
+        self._determine_schema_version()
+        self._transform_tables()
+
+    @property
+    def table_format(self):
+        return self._table_format
+
+    def __getitem__(self, key):
+        key = key.lower().strip()
+        if key in ('psm', ):
+            return self.spectrum_match_table
+        if key in ('pep', ):
+            return self.peptide_table
+        if key in ('prt', ):
+            return self.protein_table
+        if key in ('sml', ):
+            return self.small_molecule_table
+        if key in ('smf', ):
+            return self.small_molecule_feature_table
+        if key in ('sme', ):
+            return self.small_molecule_evidence_table
+        else:
+            raise KeyError(key)
+
+    def __iter__(self):
+        if self.variant == "P":
+            yield 'PRT', self.protein_table
+            yield 'PEP', self.peptide_table
+            yield 'PSM', self.spectrum_match_table
+            yield 'SML', self.small_molecule_table
+        elif self.variant == "M":
+            yield 'SML', self.small_molecule_table
+            yield 'SMF', self.small_molecule_feature_table
+            yield 'SME', self.small_molecule_evidence_table
+
+    def _init_tables(self):
+        self.protein_table = _MzTabTable("protein")
+        self.peptide_table = _MzTabTable("peptide")
+        self.spectrum_match_table = _MzTabTable('psm')
+        self.small_molecule_table = _MzTabTable('small molecule')
+        self.small_molecule_feature_table = _MzTabTable('small molecule feature')
+        self.small_molecule_evidence_table = _MzTabTable('small molecule evidence')
+
+    def _transform_tables(self):
+        if self._table_format == DATA_FRAME_FORMAT:
+            self.protein_table = self.protein_table.as_df('accession')
+            self.peptide_table = self.peptide_table.as_df()
+            self.spectrum_match_table = self.spectrum_match_table.as_df('PSM_ID')
+            self.small_molecule_table = self.small_molecule_table.as_df()
+            self.small_molecule_feature_table = self.small_molecule_feature_table.as_df()
+            self.small_molecule_evidence_table = self.small_molecule_evidence_table.as_df()
+        elif self._table_format in (DICT_FORMAT, dict):
+            self.protein_table = self.protein_table.as_dict()
+            self.peptide_table = self.peptide_table.as_dict()
+            self.spectrum_match_table = self.spectrum_match_table.as_dict()
+            self.small_molecule_table = self.small_molecule_table.as_dict()
+            self.small_molecule_feature_table = self.small_molecule_feature_table.as_dict()
+            self.small_molecule_evidence_table = self.small_molecule_evidence_table.as_dict()
+        elif callable(self._table_format):
+            self.protein_table = self._table_format(self.protein_table)
+            self.peptide_table = self._table_format(self.peptide_table)
+            self.spectrum_match_table = self._table_format(self.spectrum_match_table)
+            self.small_molecule_table = self._table_format(self.small_molecule_table)
+            self.small_molecule_feature_table = self._table_format(self.small_molecule_feature_table)
+            self.small_molecule_evidence_table = self._table_format(self.small_molecule_evidence_table)
+
+    def _parse(self):
+        for i, line in enumerate(self.file):
+            line = line.strip()
+            tokens = line.split("\t")
+            if not tokens:
+                continue
+            if tokens[0] == ("MTD"):
+                name = tokens[1]
+                value = self._cast_value(tokens[2])
+                self.metadata[name] = value
+            elif tokens[0] == 'COM':
+                self.comments.append(self._cast_value(tokens[1]))
+            # headers
+            elif tokens[0] == "PRH":
+                self.protein_table.header = tokens[1:]
+            elif tokens[0] == "PEH":
+                self.peptide_table.header = tokens[1:]
+            elif tokens[0] == "PSH":
+                self.spectrum_match_table.header = tokens[1:]
+            elif tokens[0] == "SMH":
+                self.small_molecule_table.header = tokens[1:]
+            elif tokens[0] == "SFH":
+                self.small_molecule_feature_table.header = tokens[1:]
+            elif tokens[0] == "SEH":
+                self.small_molecule_evidence_table.header = tokens[1:]
+            # rows
+            elif tokens[0] == "PRT":
+                self.protein_table.add(tokens[1:])
+            elif tokens[0] == "PEP":
+                self.peptide_table.add(tokens[1:])
+            elif tokens[0] == "PSM":
+                self.spectrum_match_table.add(tokens[1:])
+            elif tokens[0] == "SML":
+                self.small_molecule_table.add(tokens[1:])
+            elif tokens[0] == "SMF":
+                self.small_molecule_feature_table.add(tokens[1:])
+            elif tokens[0] == "SME":
+                self.small_molecule_evidence_table.add(tokens[1:])
+
+    def _determine_schema_version(self):
+        if self.version is not None:
+            version = str(self.version)
+        else:
+            warnings.warn("The mzTab-version metadata header was missing. Assuming the schema version is 1.0.0")
+            version = "1.0.0"
+            self.version = version
+        match = re.search(r"(?P<schema_version>\d+(?:\.\d+(?:\.\d+)?)?)(?:-(?P<schema_variant>[MP]))?", version)
+        if match is None:
+            warnings.warn("mzTab-version does not match the expected pattern: %r" % version)
+            version_parsed = '1.0.0'
+            variant = 'P'
+        else:
+            version_parsed, variant = match.groups()
+        if variant is None:
+            variant = "P"
+        self.num_version = [int(v) for v in version_parsed.split(".")]
+        # Pad self.num_version to three components
+        while len(self.num_version) < 3:
+            self.num_version.append(0)
+        self.variant = variant
+
+    def keys(self):
+        return OrderedDict(list(self)).keys()
+
+    def values(self):
+        return OrderedDict(list(self)).values()
+
+    def items(self):
+        return OrderedDict(list(self)).items()
diff --git a/pyteomics/mzxml.py b/pyteomics/mzxml.py
new file mode 100644
index 0000000000000000000000000000000000000000..28c67e357ba9e0c77ba2dafebd121cda67a2c963
--- /dev/null
+++ b/pyteomics/mzxml.py
@@ -0,0 +1,328 @@
+"""
+mzxml - reader for mass spectrometry data in mzXML format
+=========================================================
+
+Summary
+-------
+
+**mzXML** is a (formerly) standard XML format for raw mass spectrometry data storage,
+intended to be replaced with **mzML**.
+
+This module provides a minimalistic way to extract information from mzXML
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`MzXML`)
+to iterate over entries in ``<scan>`` elements.
+:py:class:`MzXML` also supports direct indexing with scan IDs.
+
+Data access
+-----------
+
+  :py:class:`MzXML` - a class representing a single mzXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through spectra in mzXML file. Data from a
+  single scan are converted to a human-readable dict. Spectra themselves are
+  stored under 'm/z array' and 'intensity array' keys.
+
+  :py:func:`chain` - read multiple mzXML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+Deprecated functions
+--------------------
+
+  :py:func:`version_info` - get version information about the mzXML file.
+  You can just read the corresponding attribute of the :py:class:`MzXML` object.
+
+  :py:func:`iterfind` - iterate over elements in an mzXML file.
+  You can just call the corresponding method of the :py:class:`MzXML` object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml` and :py:mod:`numpy`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2016 Joshua Klein, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import heapq
+
+from . import xml, auxiliary as aux, _schema_defaults
+import numpy as np
+
+
+def _decode_peaks(info, peaks_data):
+    """Decode the interleaved base 64 encoded, potentially
+    compressed, raw data points.
+
+    Parameters
+    ----------
+    info : dict
+        The current context
+    peaks_data : str
+        The textually encoded peak data
+
+    Returns
+    -------
+    np.ndarray
+        A structured array with fields 'm/z array'
+        and 'intensity array'.
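+
+    Examples
+    --------
+    A hedged sketch with two hand-made peaks (32-bit, big-endian,
+    uncompressed); the exact accepted input type depends on
+    :py:func:`aux._decode_base64_data_array`:
+
+    >>> import base64, struct
+    >>> raw = struct.pack('>4f', 100.0, 1.0, 200.0, 2.0)  # interleaved m/z, intensity
+    >>> data = _decode_peaks({'precision': '32'}, base64.b64encode(raw))  # doctest: +SKIP
+    >>> data['m/z array'].tolist()  # doctest: +SKIP
+    [100.0, 200.0]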
+    """
+    compressed = (info.get('compressionType') == 'zlib')
+    dt = np.float32 if info['precision'] == '32' else np.float64
+    dtype = np.dtype([('m/z array', dt), ('intensity array', dt)]).newbyteorder('>')
+    data = aux._decode_base64_data_array(peaks_data, dtype, compressed)
+    return data
+
+
+class IteratorQueue(object):
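+    """Wrap a scan iterator and yield scans in ascending scan-number order.
+
+    In mzXML, MS2+ scans can be nested inside their parent MS1 ``<scan>``
+    element, so iterative parsing may emit them out of numeric order. Scans
+    are buffered in a heap keyed by the ``num`` attribute and flushed up to
+    the most recent MS1 scan, which acts as an ordering barrier.
+    """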
+    def __init__(self, iterator):
+        q = list()
+        heapq.heapify(q)
+        self.queue = q
+        self.iterator = iterator
+        self.last_index = -1
+        self.producer = self.consume(iterator)
+
+    def insert_item(self, scan):
+        heapq.heappush(self.queue, (int(scan['num']), scan))
+
+    def __iter__(self):
+        return self.producer
+
+    def consume(self, iterator):
+        for scan in iterator:
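+            # nested child scans are also yielded separately by the iterator,
+            # so drop the copies stored under the parent's 'scan' key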
+            scan.pop("scan", None)
+            self.insert_item(scan)
+            if scan['msLevel'] == 1:
+                barrier = int(scan['num'])
+                while True:
+                    idx, item = heapq.heappop(self.queue)
+                    if idx >= barrier:
+                        self.insert_item(item)
+                        break
+                    yield item
+        while self.queue:
+            idx, item = heapq.heappop(self.queue)
+            yield item
+
+
+class MzXML(aux.BinaryArrayConversionMixin, aux.TimeOrderedIndexedReaderMixin, xml.MultiProcessingXML, xml.IndexSavingXML):
+    """Parser class for mzXML files."""
+    _root_element = 'mzXML'
+    _default_iter_tag = 'scan'
+    _indexed_tags = {'scan'}
+    _indexed_tag_keys = {'scan': 'num'}
+    _default_version = None
+    _default_schema = _schema_defaults._mzxml_schema_defaults
+    _default_id_attr = 'num'
+
+    def __init__(self, *args, **kwargs):
+        self.decode_binary = kwargs.pop('decode_binary', True)
+        super(MzXML, self).__init__(*args, **kwargs)
+
+    def __getstate__(self):
+        state = super(MzXML, self).__getstate__()
+        state['decode_binary'] = self.decode_binary
+        return state
+
+    def __setstate__(self, state):
+        super(MzXML, self).__setstate__(state)
+        self.decode_binary = state['decode_binary']
+
+    def _get_info_smart(self, element, **kw):
+        name = xml._local_name(element)
+
+        kwargs = dict(kw)
+        rec = kwargs.pop('recursive', None)
+        if name in {'mzXML'}:
+            info = self._get_info(element,
+                                  recursive=(
+                                      rec if rec is not None else False),
+                                  **kwargs)
+        else:
+            info = self._get_info(element,
+                                  recursive=(rec if rec is not None else True),
+                                  **kwargs)
+        if 'num' in info and isinstance(info, dict):
+            info['id'] = info['num']
+        if 'peaks' in info and isinstance(info, dict):
+            self._decode_peaks(info)
+        return info
+
+    def _determine_compression(self, info):
+        if info.get('compressionType') == 'zlib':
+            return 'zlib compression'
+        return "no compression"
+
+    def _determine_dtype(self, info):
+        dt = np.float32 if info['precision'] == '32' else np.float64
+        endianess = ">" if info['byteOrder'] in ('network', "big") else "<"
+        dtype = np.dtype(
+            [('m/z array', dt), ('intensity array', dt)]).newbyteorder(endianess)
+        return dtype
+
+    def _finalize_record_conversion(self, array, record):
+        key = record.key
+        return self._convert_array(key, array[key])
+
+    def _decode_peaks(self, info):
+        # handle cases where peaks is the encoded binary data which must be
+        # unpacked
+        if not isinstance(info['peaks'], (dict, list)):
+            compression_type = self._determine_compression(info)
+            dtype = self._determine_dtype(info)
+            binary = info.pop('peaks')
+            if not self.decode_binary:
+                for k in self._array_keys:
+                    record = self._make_record(binary, compression_type, dtype, k)
+                    info[k] = record
+            else:
+                peak_data = self.decode_data_array(binary, compression_type, dtype)
+                for k in self._array_keys:
+                    info[k] = self._convert_array(k, peak_data[k])
+        # otherwise we've already decoded the arrays and we're just passing
+        # them up the hierarchy
+        else:
+            if not self.decode_binary:
+                arrays = info.pop('peaks')[0]
+                for k in self._array_keys:
+                    info[k] = arrays[k]
+            else:
+                peak_data = info.pop('peaks')[0]
+                for k in self._array_keys:
+                    info[k] = self._convert_array(k, peak_data.get(k, np.array([])))
+
+    def iterfind(self, path, **kwargs):
+        if path == 'scan':
+            generator = super(MzXML, self).iterfind(path, **kwargs)
+            for item in IteratorQueue(generator):
+                yield item
+        else:
+            for item in super(MzXML, self).iterfind(path, **kwargs):
+                yield item
+
+    def _get_time(self, scan):
+        return scan['retentionTime']
+
+
+def read(source, read_schema=False, iterative=True, use_index=False, dtype=None,
+         huge_tree=False, decode_binary=True):
+    """Parse `source` and iterate through spectra.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target mzXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzXML header. Otherwise, use default
+        parameters. Enabling this requires an Internet connection and
+        may produce the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        spectrum elements. Default is :py:const:`False`.
+
+    dtype : type or dict, optional
+        dtype to convert arrays to, one for both arrays or a dict with keys
+        'm/z array' and 'intensity array'.
+
+    decode_binary : bool, optional
+        Defines whether binary data should be decoded and included in the output
+        (under "m/z array", "intensity array", etc.).
+        Default is :py:const:`True`.
+
+    huge_tree : bool, optional
+        This option is passed to the `lxml` parser and defines whether
+        security checks for XML tree depth and node size should be disabled.
+        Default is :py:const:`False`.
+        Enable this option for trusted files to avoid XMLSyntaxError exceptions
+        (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`).
+
+    Returns
+    -------
+    out : iterator
+       An iterator over the dicts with spectrum properties.
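+
+    Examples
+    --------
+    A minimal sketch (``spectra.mzXML`` is a placeholder path):
+
+    >>> with read('spectra.mzXML') as reader:  # doctest: +SKIP
+    ...     for spectrum in reader:
+    ...         print(spectrum['num'], len(spectrum['m/z array']))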
+    """
+
+    return MzXML(source, read_schema=read_schema, iterative=iterative,
+                 use_index=use_index, dtype=dtype, huge_tree=huge_tree,
+                 decode_binary=decode_binary)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified XPath.
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create an :py:class:`MzXML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the mzXML header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    decode_binary : bool, optional
+        Defines whether binary data should be decoded and included in the output
+        (under "m/z array", "intensity array", etc.).
+        Default is :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return MzXML(source, **kwargs).iterfind(path, **kwargs)
+
+version_info = xml._make_version_info(MzXML)
+
+
+# chain = aux._make_chain(read, 'read')
+chain = aux.ChainBase._make_chain(MzXML)
diff --git a/pyteomics/openms/__init__.py b/pyteomics/openms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b9c0338f46370b775594e9fe369eaa9e3d790a9
--- /dev/null
+++ b/pyteomics/openms/__init__.py
@@ -0,0 +1 @@
+from . import featurexml, trafoxml, idxml
diff --git a/pyteomics/openms/featurexml.py b/pyteomics/openms/featurexml.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dcfd099d01652443b076fccf6dedd12daaf934f
--- /dev/null
+++ b/pyteomics/openms/featurexml.py
@@ -0,0 +1,115 @@
+"""
+featurexml - reader for featureXML files
+========================================
+
+Summary
+-------
+
+**featureXML** is a format specified in the
+`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
+It defines a list of LC-MS features observed in an experiment.
+
+This module provides a minimalistic way to extract information from **featureXML**
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`FeatureXML`)
+to iterate over entries in ``<feature>`` elements.
+:py:class:`FeatureXML` also supports direct indexing with feature IDs.
+
+Data access
+-----------
+
+  :py:class:`FeatureXML` - a class representing a single featureXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through features in a featureXML file. Data from a
+  single feature are converted to a human-readable dict.
+
+  :py:func:`chain` - read multiple featureXML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+--------------------------------------------------------------------------------
+"""
+
+from .. import xml, auxiliary as aux, _schema_defaults, version
+
+class FeatureXML(xml.MultiProcessingXML):
+    """Parser class for featureXML files."""
+    file_format = 'featureXML'
+    _root_element = 'featureMap'
+    _default_schema = _schema_defaults._featurexml_schema_defaults
+    _default_version = '1.9'
+    _default_iter_tag = 'feature'
+    _structures_to_flatten = {}
+    _indexed_tags = {'feature'}
+    _schema_location_param = 'noNamespaceSchemaLocation'
+
+    _offending_keys = {'ints': {
+        ('PeptideIdentification', 'spectrum_reference'),
+        ('UnassignedPeptideIdentification', 'spectrum_reference'),
+        ('quality', 'quality')
+        }}
+    _missing_keys = {'floats': {('quality', 'quality')}}
+
+    def _get_info_smart(self, element, **kw):
+        kw['recursive'] = kw.get('recursive', True)
+        info = self._get_info(element, **kw)
+        return info
+
+    @xml._keepstate
+    def _get_schema_info(self, read_schema=True):
+        schema_info = super(FeatureXML, self)._get_schema_info(read_schema)
+        if not read_schema:
+            return schema_info
+        file_version, schema = self.version_info
+        if version.VersionInfo(file_version) < version.VersionInfo(self._default_version):
+            for k, s in self._offending_keys.items():
+                if k in schema_info:
+                    for elem in s:
+                        try:
+                            schema_info[k].remove(elem)
+                        except KeyError:
+                            pass
+            for t, s in self._missing_keys.items():
+                schema_info.setdefault(t, set()).update(s)
+        return schema_info
+
+
+def read(source, read_schema=True, iterative=True, use_index=False):
+    """Parse `source` and iterate through features.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target featureXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the file header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        feature elements. Default is :py:const:`False`.
+
+    Returns
+    -------
+    out : iterator
+       An iterator over the dicts with feature properties.
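+
+    Examples
+    --------
+    A minimal sketch (``features.featureXML`` is a placeholder path):
+
+    >>> with read('features.featureXML') as reader:  # doctest: +SKIP
+    ...     for feature in reader:
+    ...         print(feature['id'])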
+    """
+
+    return FeatureXML(source, read_schema=read_schema, iterative=iterative, use_index=use_index)
+
+chain = aux._make_chain(read, 'read')
diff --git a/pyteomics/openms/idxml.py b/pyteomics/openms/idxml.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d71d7a4e61ef46c8343f902819874fa09abe572
--- /dev/null
+++ b/pyteomics/openms/idxml.py
@@ -0,0 +1,430 @@
+"""
+idxml - idXML file reader
+=========================
+
+Summary
+-------
+
+**idXML** is a format specified in the
+`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
+It defines a list of peptide identifications.
+
+This module provides a minimalistic way to extract information from idXML
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`IDXML`) to iterate over entries in
+``<PeptideIdentification>`` elements. Note that each entry can contain more than one PSM
+(peptide-spectrum match). They are accessible under the ``'PeptideHit'`` key.
+:py:class:`IDXML` objects also support direct indexing by element ID.
+
+Data access
+-----------
+
+  :py:class:`IDXML` - a class representing a single idXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through peptide-spectrum matches in an idXML
+  file. Data from a single PSM group are converted to a human-readable dict.
+  Basically creates an :py:class:`IDXML` object and reads it.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`DataFrame` - read idXML files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+  :py:func:`filter` - read a chain of idXML files and filter to a certain
+  FDR using TDA.
+
+  :py:func:`filter.chain` - chain a series of filters applied independently to
+  several files.
+
+  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+  independently to an iterable of files.
+
+  :py:func:`filter_df` - filter idXML files and return a :py:class:`pandas.DataFrame`.
+
+  :py:func:`is_decoy` - determine if a "SpectrumIdentificationResult" should be
+  consiudered decoy.
+
+  :py:func:`fdr` - estimate the false discovery rate of a set of identifications
+  using the target-decoy approach.
+
+  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
+  set using the target-decoy approach.
+
+Deprecated functions
+--------------------
+
+  :py:func:`version_info` - get information about idXML version and schema.
+  You can just read the corresponding attribute of the :py:class:`IDXML`
+  object.
+
+  :py:func:`get_by_id` - get an element by its ID and extract the data from it.
+  You can just call the corresponding method of the :py:class:`IDXML`
+  object.
+
+  :py:func:`iterfind` - iterate over elements in an idXML file.
+  You can just call the corresponding method of the :py:class:`IDXML`
+  object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2020 Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+
+import warnings
+from .. import auxiliary as aux
+from .. import xml, _schema_defaults
+
+
+class IDXML(xml.IndexedXML):
+    """Parser class for idXML files."""
+    file_format = 'idXML'
+    _root_element = 'IdXML'
+    _default_schema = _schema_defaults._idxml_schema_defaults
+    _default_version = '1.5'
+    _default_iter_tag = 'PeptideIdentification'
+    _structures_to_flatten = {}
+    _indexed_tags = {'ProteinHit'}
+    _schema_location_param = 'noNamespaceSchemaLocation'
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('retrieve_refs', True)
+        super(IDXML, self).__init__(*args, **kwargs)
+
+    def _get_info_smart(self, element, **kwargs):
+        """Extract the info in a smart way depending on the element type"""
+        name = xml._local_name(element)
+        kwargs = dict(kwargs)
+        rec = kwargs.pop("recursive", None)
+
+        # Try not to recursively unpack the root element
+        # unless the user really wants to.
+        if name == self._root_element:
+            info = self._get_info(element, recursive=(rec if rec is not None else False), **kwargs)
+        else:
+            info = self._get_info(element, recursive=(rec if rec is not None else True), **kwargs)
+        for k in ['start', 'end']:
+            v = info.get(k)
+            if isinstance(v, list) and len(v) == 2:
+                info[k] = [int(x) for x in v[0].split()]
+        for k in ['aa_before', 'aa_after']:
+            if k in info:
+                info[k] = info[k].split()
+        return info
+
+    def _retrieve_refs(self, info, **kwargs):
+        """Retrieves and embeds the data for each attribute in `info` that
+        ends in _ref. Removes the id attribute from `info`"""
+        for k, v in dict(info).items():
+            if k[-5:] == '_refs':
+                try:
+                    by_id = [self.get_by_id(x, retrieve_refs=True) for x in v.split()]
+                except KeyError:
+                    warnings.warn('Ignoring unresolved reference: ' + v)
+                else:
+                    for x in by_id:
+                        x.pop('id', None)
+                    info[k[:-5]] = by_id
+                    del info[k]
+
+
+def read(source, **kwargs):
+    """Parse `source` and iterate through peptide-spectrum matches.
+
+    .. note:: This function is provided for backward compatibility only.
+        It simply creates an :py:class:`IDXML` instance using
+        provided arguments and returns it.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target IDXML file or the file object itself.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    retrieve_refs : bool, optional
+        If :py:const:`True`, additional information from references will be
+        automatically added to the results. The file processing time will
+        increase. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the IDXML header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    build_id_cache : bool, optional
+        Defines whether a cache of element IDs should be built and stored on the
+        created :py:class:`IDXML` instance. Default value is the value of
+        `retrieve_refs`.
+
+        .. note:: This parameter is ignored when ``use_index`` is ``True`` (default).
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored.
+
+    indexed_tags : container of bytes, optional
+        Defines which elements need to be indexed. Default is ``{'ProteinHit'}``.
+
+    Returns
+    -------
+    out : IDXML
+       An iterator over the dicts with PSM properties.
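+
+    Examples
+    --------
+    A minimal sketch (``results.idXML`` is a placeholder path):
+
+    >>> with read('results.idXML') as reader:  # doctest: +SKIP
+    ...     for pep_id in reader:
+    ...         for hit in pep_id['PeptideHit']:
+    ...             print(hit['sequence'], hit['score'])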
+    """
+    kwargs = kwargs.copy()
+    kwargs.setdefault('retrieve_refs', True)
+    kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs'))
+    return IDXML(source, **kwargs)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create an :py:class:`IDXML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    retrieve_refs : bool, optional
+        If :py:const:`True`, additional information from references will be
+        automatically added to the results. The file processing time will
+        increase. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the IDXML header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    build_id_cache : bool, optional
+        Defines whether a cache of element IDs should be built and stored on the
+        created :py:class:`IDXML` instance. Default value is the value of
+        `retrieve_refs`.
+
+    Returns
+    -------
+    out : iterator
+    """
+    kwargs = kwargs.copy()
+    kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs'))
+    return IDXML(source, **kwargs).iterfind(path, **kwargs)
+
+
+version_info = xml._make_version_info(IDXML)
+
+
+def get_by_id(source, elem_id, **kwargs):
+    """Parse `source` and return the element with `id` attribute equal
+    to `elem_id`. Returns :py:const:`None` if no such element is found.
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`get_by_id` calls on one file, you should
+        create an :py:class:`IDXML` object and use its
+        :py:meth:`!get_by_id` method.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target idXML file or the file object itself.
+
+    elem_id : str
+        The value of the `id` attribute to match.
+
+    Returns
+    -------
+    out : :py:class:`dict` or :py:const:`None`
+    """
+    return IDXML(source, **kwargs).get_by_id(elem_id, **kwargs)
+
+
+chain = aux.ChainBase._make_chain(IDXML)
+
+
+def is_decoy(psm, prefix=None):
+    """Given a PSM dict, return :py:const:`True` if it is marked as decoy,
+    and :py:const:`False` otherwise.
+
+    Parameters
+    ----------
+    psm : dict
+        A dict, as yielded by :py:func:`read`.
+    prefix : ignored
+
+    Returns
+    -------
+    out : bool
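+
+    Examples
+    --------
+    >>> is_decoy({'PeptideHit': [{'target_decoy': 'decoy'}]})
+    True
+    >>> is_decoy({'PeptideHit': [{'target_decoy': 'target'}]})
+    False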
+    """
+    return psm['PeptideHit'][0]['target_decoy'] == 'decoy'
+
+
+def DataFrame(*args, **kwargs):
+    """Read idXML files into a :py:class:`pandas.DataFrame`.
+
+    Requires :py:mod:`pandas`.
+
+    .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'.
+
+    Parameters
+    ----------
+    *args
+        Passed to :py:func:`chain`
+
+    **kwargs
+        Passed to :py:func:`chain`
+
+    sep : str or None, keyword only, optional
+        Some values related to PSMs (such as protein information) are variable-length
+        lists. If `sep` is a :py:class:`str`, they will be packed into single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
+    """
+    import pandas as pd
+    data = []
+
+    sep = kwargs.pop('sep', None)
+    with chain(*args, **kwargs) as f:
+        for item in f:
+            info = {}
+            for k, v in item.items():
+                if isinstance(v, (str, int, float)):
+                    info[k] = v
+            peptide_hit = item.get('PeptideHit', [None])[0]
+            if peptide_hit is not None:
+                info.update((k, v) for k, v in peptide_hit.items() if isinstance(v, (str, int, float)))
+                protein = peptide_hit.get('protein')
+                if protein:
+                    accessions, isd, starts, ends, scores, aa_bs, aa_as = [], [], [], [], [], [], []
+                    for d, start, end, aab, aaa in zip(protein, peptide_hit['start'], peptide_hit['end'], peptide_hit['aa_before'], peptide_hit['aa_after']):
+                        accessions.append(d.get('accession'))
+                        isd.append(d.get('target_decoy'))
+                        scores.append(d.get('score'))
+                        starts.append(start)
+                        ends.append(end)
+                        aa_bs.append(aab)
+                        aa_as.append(aaa)
+
+                    isd = all(x == 'decoy' for x in isd)
+                    if sep is not None:
+                        if all(isinstance(acc, str) for acc in accessions):
+                            accessions = sep.join(accessions)
+                        if all(isinstance(aaa, str) for aaa in aa_as):
+                            aa_as = sep.join(aa_as)
+                        if all(isinstance(aab, str) for aab in aa_bs):
+                            aa_bs = sep.join(aa_bs)
+                    if all(acc is None for acc in accessions):
+                        accessions = None
+
+                    info.update((k, v) for k, v in protein[0].items() if isinstance(v, (str, int, float, list)))
+                    info['accession'] = accessions
+                    info['is decoy'] = isd
+                    info['start'] = starts
+                    info['end'] = ends
+                    info['aa_before'] = aa_bs
+                    info['aa_after'] = aa_as
+            data.append(info)
+    df = pd.DataFrame(data)
+    return df
+
+
+def filter_df(*args, **kwargs):
+    """Read idXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
+    Positional arguments can be idXML files or DataFrames.
+
+    Requires :py:mod:`pandas`.
+
+    .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'.
+
+    Parameters
+    ----------
+    key : str / iterable / callable, keyword only, optional
+        Peptide identification score. Default is 'score'. You will probably need to change it.
+    is_decoy : str / iterable / callable, keyword only, optional
+        Default is 'is decoy'.
+    *args
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+    **kwargs
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
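+
+    Examples
+    --------
+    A hypothetical call (``results.idXML`` is a placeholder path):
+
+    >>> df = filter_df('results.idXML', fdr=0.01)  # doctest: +SKIP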
+    """
+    import pandas as pd
+    kwargs.setdefault('key', 'score')
+    if all(isinstance(arg, pd.DataFrame) for arg in args):
+        df = pd.concat(args)
+    else:
+        df = DataFrame(*args, **kwargs)
+    if 'is_decoy' not in kwargs:
+        kwargs['is_decoy'] = 'is decoy'
+    return aux.filter(df, **kwargs)
+
+
+fdr = aux._make_fdr(is_decoy, None)
+_key = lambda x: x['PeptideHit'][0]['score']
+qvalues = aux._make_qvalues(chain, is_decoy, None, _key)
+filter = aux._make_filter(chain, is_decoy, None, _key, qvalues)
+filter.chain = aux._make_chain(filter, 'filter', True)
diff --git a/pyteomics/openms/trafoxml.py b/pyteomics/openms/trafoxml.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42c56979dc2a3ddc7c0076f467091c2da3c0563
--- /dev/null
+++ b/pyteomics/openms/trafoxml.py
@@ -0,0 +1,82 @@
+"""
+trafoxml - reader for trafoXML files
+========================================
+
+Summary
+-------
+
+**trafoXML** is a format specified in the
+`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
+It defines a transformation, which is a result of retention time alignment.
+
+This module provides a minimalistic way to extract information from **trafoXML**
+files. You can use the old functional interface (:py:func:`read`) or the new
+object-oriented interface (:py:class:`TrafoXML`)
+to iterate over entries in ``<Pair>`` elements.
+
+Data access
+-----------
+
+  :py:class:`TrafoXML` - a class representing a single trafoXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through pairs in a trafoXML file. Data from a
+  single trafo are converted to a human-readable dict.
+
+  :py:func:`chain` - read multiple trafoXML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+--------------------------------------------------------------------------------
+"""
+
+from .. import xml, auxiliary as aux, _schema_defaults
+
+class TrafoXML(xml.XML):
+    """Parser class for trafoXML files."""
+    file_format = 'trafoXML'
+    _root_element = 'TrafoXML'
+    _default_schema = _schema_defaults._trafoxml_schema_defaults
+    _default_version = '1.0'
+    _default_iter_tag = 'Pair'
+    _schema_location_param = 'noNamespaceSchemaLocation'
+
+    def _get_info_smart(self, element, **kw):
+        kw['recursive'] = kw.get('recursive', True)
+        info = self._get_info(element, **kw)
+        return info
+
+def read(source, read_schema=True, iterative=True):
+    """Parse `source` and iterate through pairs.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target trafoXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the file header (default). Otherwise, use default
+        parameters. Disable this to avoid waiting on slow network connections or
+        if you don't like to get the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+       An iterator over the dicts with pair properties.
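+
+    Examples
+    --------
+    A minimal sketch (``alignment.trafoXML`` is a placeholder path):
+
+    >>> for pair in read('alignment.trafoXML'):  # doctest: +SKIP
+    ...     print(pair['from'], pair['to'])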
+    """
+
+    return TrafoXML(source, read_schema=read_schema, iterative=iterative)
+
+chain = aux._make_chain(read, 'read')
\ No newline at end of file
diff --git a/pyteomics/parser.py b/pyteomics/parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..72ef9fc2ad8cced5dcb500041357b450214b863b
--- /dev/null
+++ b/pyteomics/parser.py
@@ -0,0 +1,1148 @@
+"""
+parser - operations on modX peptide sequences
+=============================================
+
+modX is a simple extension of the `IUPAC one-letter peptide sequence
+representation <http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html>`_.
+
+The labels (or codes) for the 20 standard amino acids in modX are the same as
+in IUPAC nomenclature. A label for a modified amino acid has a general
+form of 'modX', i.e.:
+
+- it starts with an arbitrary number of lower-case symbols or numbers
+  (a modification);
+
+- it ends with a single upper-case symbol (an amino acid residue).
+
+Valid examples of modX amino acid labels are: 'G', 'pS', 'oxM'. This rule
+keeps labels both human-readable and machine-parseable.
+
+Besides the sequence of amino acid residues, modX has a rule to specify
+terminal modifications of a polypeptide. Such a label should start or
+end with a hyphen. The default N-terminal amine group and C-terminal
+carboxyl group may not be shown explicitly.
+
+Therefore, valid examples of peptide sequences in modX are: "GAGA",
+"H-PEPTIDE-OH", "H-TEST-NH2". It is not recommmended to specify only one
+terminal group.
+
+Operations on polypeptide sequences
+-----------------------------------
+
+  :py:func:`parse` - convert a sequence string into a list of
+  amino acid residues.
+
+  :py:func:`to_string` - convert a parsed sequence to a string.
+
+  :py:func:`to_proforma` - convert a (parsed) *modX* sequence to ProForma.
+
+  :py:func:`amino_acid_composition` - get numbers of each amino acid
+  residue in a peptide.
+
+  :py:func:`cleave`, :py:func:`icleave`, :py:func:`xcleave` - cleave a polypeptide using a given rule of
+  enzymatic digestion.
+
+  :py:func:`num_sites` - count the number of cleavage sites in a sequence.
+
+  :py:func:`isoforms` - generate all unique modified peptide sequences
+  given the initial sequence and modifications.
+
+Auxiliary commands
+------------------
+
+  :py:func:`coverage` - calculate the sequence coverage of a protein by peptides.
+
+  :py:func:`length` - calculate the number of amino acid
+  residues in a polypeptide.
+
+  :py:func:`valid` - check if a sequence can be parsed successfully.
+
+  :py:func:`fast_valid` - check if a sequence consists of known one-letter
+  codes.
+
+  :py:func:`is_modX` - check if supplied code corresponds to a modX label.
+
+  :py:func:`is_term_mod` - check if supplied code corresponds to a
+  terminal modification.
+
+Data
+----
+
+  :py:data:`std_amino_acids` - a list of the 20 standard amino acid IUPAC codes.
+
+  :py:data:`std_nterm` - the standard N-terminal modification (the
+  unmodified group is a single atom of hydrogen).
+
+  :py:data:`std_cterm` - the standard C-terminal modification (the
+  unmodified group is hydroxyl).
+
+  :py:data:`std_labels` - a list of all standard sequence
+  elements, amino acid residues and terminal modifications.
+
+  :py:data:`expasy_rules` and :py:data:`psims_rules` - two dicts with the regular expressions of
+  cleavage rules for the most popular proteolytic enzymes.
+
+-------------------------------------------------------------------------------
+
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import re
+from collections import deque
+import itertools as it
+import warnings
+from .auxiliary import PyteomicsError, memoize, BasicComposition, cvstr, cvquery
+
+
+std_amino_acids = ['Q', 'W', 'E', 'R', 'T', 'Y', 'I', 'P', 'A', 'S',
+                   'D', 'F', 'G', 'H', 'K', 'L', 'C', 'V', 'N', 'M']
+"""modX labels for the 20 standard amino acids."""
+
+std_nterm = 'H-'
+"""modX label for the unmodified N-terminus."""
+
+std_cterm = '-OH'
+"""modX label for the unmodified C-terminus."""
+
+std_labels = std_amino_acids + [std_nterm, std_cterm]
+"""modX labels for the standard amino acids and unmodified termini."""
+
+_nterm_mod = r'[^-]+-$'
+_cterm_mod = r'-[^-]+$'
+
+
+def is_term_mod(label):
+    """Check if `label` corresponds to a terminal modification.
+
+    Parameters
+    ----------
+    label : str
+
+    Returns
+    -------
+    out : bool
+
+    Examples
+    --------
+    >>> is_term_mod('A')
+    False
+    >>> is_term_mod('Ac-')
+    True
+    >>> is_term_mod('-customGroup')
+    True
+    >>> is_term_mod('this-group-')
+    False
+    >>> is_term_mod('-')
+    False
+    """
+    return (re.match(_nterm_mod, label) or re.match(_cterm_mod, label)) is not None
+
+
+def match_modX(label):
+    """Check if `label` is a valid 'modX' label.
+
+    Parameters
+    ----------
+    label : str
+
+    Returns
+    -------
+    out : re.match or None
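+
+    Examples
+    --------
+    >>> match_modX('pS').groups()
+    ('p', 'S')
+    >>> match_modX('pSer') is None
+    True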
+    """
+    return re.match(_modX_single, label)
+
+
+def is_modX(label):
+    """Check if `label` is a valid 'modX' label.
+
+    Parameters
+    ----------
+    label : str
+
+    Returns
+    -------
+    out : bool
+
+    Examples
+    --------
+    >>> is_modX('M')
+    True
+    >>> is_modX('oxM')
+    True
+    >>> is_modX('oxMet')
+    False
+    >>> is_modX('160C')
+    True
+    """
+    return bool(match_modX(label))
+
+
+def length(sequence, **kwargs):
+    """Calculate the number of amino acid residues in a polypeptide
+    written in modX notation.
+
+    Parameters
+    ----------
+    sequence : str or list or dict
+        A string with a polypeptide sequence, a list with a parsed sequence or
+        a dict of amino acid composition.
+    labels : list, optional
+        A list of allowed labels for amino acids and terminal modifications.
+
+    Returns
+    -------
+    out : int
+
+    Examples
+    --------
+    >>> length('PEPTIDE')
+    7
+    >>> length('H-PEPTIDE-OH')
+    7
+    """
+    if not sequence:
+        return 0
+
+    if isinstance(sequence, str) or isinstance(sequence, list):
+        if isinstance(sequence, str):
+            parsed_sequence = parse(sequence, **kwargs)
+        else:
+            parsed_sequence = sequence
+        num_term_groups = 0
+        if is_term_mod(parsed_sequence[0]):
+            num_term_groups += 1
+        if is_term_mod(parsed_sequence[-1]):
+            num_term_groups += 1
+        return len(parsed_sequence) - num_term_groups
+    elif isinstance(sequence, dict):
+        return sum(amount for aa, amount in sequence.items() if not is_term_mod(aa))
+
+    raise PyteomicsError('Unsupported type of sequence.')
+
+
+def _split_label(label):
+    try:
+        mod, X = match_modX(label).groups()
+    except AttributeError:
+        raise PyteomicsError('Cannot split a non-modX label: %s' % label)
+    if not mod:
+        return (X,)
+    else:
+        return mod, X
+
+
+_modX_sequence = re.compile(r'^([^-]+-)?((?:[^A-Z-]*[A-Z])+)(-[^-]+)?$')
+_modX_group = re.compile(r'[^A-Z-]*[A-Z]')
+_modX_split = re.compile(r'([^A-Z-]*)([A-Z])')
+_modX_single = re.compile(r'^([^A-Z-]*)([A-Z])$')
+
+
+def parse(sequence, show_unmodified_termini=False, split=False, allow_unknown_modifications=False, **kwargs):
+    """Parse a sequence string written in modX notation into a list of
+    labels or (if `split` argument is :py:const:`True`) into a list of
+    tuples representing amino acid residues and their modifications.
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+    show_unmodified_termini : bool, optional
+        If :py:const:`True` then the unmodified N- and C-termini are explicitly
+        shown in the returned list. Default value is :py:const:`False`.
+    split : bool, optional
+        If :py:const:`True` then the result will be a list of tuples with 1 to 4
+        elements: terminal modification, modification, residue. Default value is
+        :py:const:`False`.
+    allow_unknown_modifications : bool, optional
+        If :py:const:`True` then do not raise an exception when an unknown
+        modification of a known amino acid residue is found in the sequence.
+        This also includes terminal groups.
+        Default value is :py:const:`False`.
+
+        .. note::
+            Since version 2.5, this parameter has effect only if `labels`
+            are provided.
+    labels : container, optional
+        A container of allowed labels for amino acids,
+        modifications and terminal modifications.
+        If not provided, no checks will be done.
+        Separate labels for modifications (such as 'p' or 'ox')
+        can be supplied, which means they are applicable to all residues.
+
+        .. warning::
+            If `show_unmodified_termini` is set to :py:const:`True`, standard
+            terminal groups need to be present in `labels`.
+
+        .. warning::
+            Avoid using sequences with only one terminal group, as they are
+            ambiguous. If you provide one, `labels` (or :py:const:`std_labels`)
+            will be used to resolve the ambiguity.
+
+    Returns
+    -------
+    out : list
+        List of tuples with labels of modifications and amino acid residues.
+
+    Examples
+    --------
+    >>> parse('PEPTIDE', split=True)
+    [('P',), ('E',), ('P',), ('T',), ('I',), ('D',), ('E',)]
+    >>> parse('H-PEPTIDE')
+    ['P', 'E', 'P', 'T', 'I', 'D', 'E']
+    >>> parse('PEPTIDE', show_unmodified_termini=True)
+    ['H-', 'P', 'E', 'P', 'T', 'I', 'D', 'E', '-OH']
+    >>> parse('TEpSToxM', labels=std_labels + ['pS', 'oxM'])
+    ['T', 'E', 'pS', 'T', 'oxM']
+    >>> parse('zPEPzTIDzE', True, True, labels=std_labels+['z'])
+    [('H-', 'z', 'P'), ('E',), ('P',), ('z', 'T'), ('I',), ('D',), ('z', 'E', '-OH')]
+    >>> parse('Pmod1EPTIDE')
+    ['P', 'mod1E', 'P', 'T', 'I', 'D', 'E']
+    """
+    sequence = str(sequence)
+
+    try:
+        n, body, c = re.match(_modX_sequence, sequence).groups()
+    except AttributeError:
+        raise PyteomicsError('Not a valid modX sequence: ' + sequence)
+
+    # Check for allowed labels, if they were explicitly given
+    labels = kwargs.get('labels')
+    # labels help save the day when only one terminal group is given
+    if c is None and n is not None:
+        if labels is None:
+            labels = std_labels
+        # we can try to resolve the ambiguity
+        if n != std_nterm and n not in labels:
+            # n is the body then
+            c = '-' + body
+            body = n[:-1]
+            n = None
+
+    # Actual parsing
+    if split:
+        parsed_sequence = [g if g[0] else (g[1],) for g in re.findall(
+            _modX_split, body)]
+    else:
+        parsed_sequence = re.findall(_modX_group, body)
+    nterm, cterm = (n or std_nterm), (c or std_cterm)
+
+    # Check against `labels` if given
+    if labels is not None:
+        labels = set(labels)
+        for term, std_term in zip([n, c], [std_nterm, std_cterm]):
+            if term and term not in labels and not allow_unknown_modifications:
+                raise PyteomicsError('Unknown label: {}'.format(term))
+        for group in parsed_sequence:
+            if split:
+                mod, X = group if len(group) == 2 else ('', group[0])
+            else:
+                mod, X = re.match(_modX_split, group).groups()
+            if ((not mod) and X not in labels) or not ((mod + X in labels) or (
+                X in labels and (
+                    mod in labels or allow_unknown_modifications))):
+                raise PyteomicsError('Unknown label: {}'.format(group))
+
+    # Append terminal labels
+    if show_unmodified_termini or nterm != std_nterm:
+        if split:
+            parsed_sequence[0] = (nterm,) + parsed_sequence[0]
+        else:
+            parsed_sequence.insert(0, nterm)
+    if show_unmodified_termini or cterm != std_cterm:
+        if split:
+            parsed_sequence[-1] = parsed_sequence[-1] + (cterm,)
+        else:
+            parsed_sequence.append(cterm)
+
+    return parsed_sequence
+
+
+def valid(*args, **kwargs):
+    """Try to parse sequence and catch the exceptions.
+    All parameters are passed to :py:func:`parse`.
+
+    Returns
+    -------
+    out : bool
+        :py:const:`True` if the sequence was parsed successfully, and
+        :py:const:`False` otherwise.
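+
+    Examples
+    --------
+    A couple of simple checks (no `labels` given, so only modX syntax is
+    validated):
+
+    >>> valid('PEPTIDE')
+    True
+    >>> valid('PEPTIDE*')
+    False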
+    """
+    try:
+        parse(*args, **kwargs)
+    except PyteomicsError:
+        return False
+    return True
+
+
+def fast_valid(sequence, labels=set(std_labels)):
+    """Iterate over `sequence` and check if all items are in `labels`.
+    With strings, this only works as expected on sequences without
+    modifications or terminal groups.
+
+    Parameters
+    ----------
+    sequence : iterable (expectedly, str)
+        The sequence to check. A valid sequence would be a string of
+        labels, all present in `labels`.
+    labels : iterable, optional
+        An iterable of known labels.
+
+    Returns
+    -------
+    out : bool
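+
+    Examples
+    --------
+    With the default `labels`, any plain one-letter sequence passes, while
+    modified residues do not:
+
+    >>> fast_valid('PEPTIDE')
+    True
+    >>> fast_valid('PEpTIDE')
+    False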
+    """
+    return set(sequence).issubset(labels)
+
+
+def to_string(parsed_sequence, show_unmodified_termini=True):
+    """Create a string from a parsed sequence.
+
+    Parameters
+    ----------
+    parsed_sequence : iterable
+        Expected to be in one of the formats returned by
+        :py:func:`parse`, i.e. list of labels or list of tuples.
+    show_unmodified_termini : bool, optional
+        Defines the behavior towards standard terminal groups in the input.
+        :py:const:`True` means that they will be preserved if present (default).
+        :py:const:`False` means that they will be removed. Standard terminal
+        groups will not be added if not shown in `parsed_sequence`,
+        regardless of this setting.
+
+    Returns
+    -------
+    sequence : str
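+
+    Examples
+    --------
+    Round-tripping the output of :py:func:`parse`:
+
+    >>> to_string(parse('PEPTIDE', show_unmodified_termini=True))
+    'H-PEPTIDE-OH'
+    >>> to_string(parse('PEPTIDE', split=True), show_unmodified_termini=False)
+    'PEPTIDE'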
+    """
+    parsed_sequence = list(parsed_sequence)
+    labels = []
+    nterm = parsed_sequence[0]
+    cterm = parsed_sequence[-1]
+
+    if isinstance(nterm, str):
+        if nterm != std_nterm or show_unmodified_termini:
+            labels.append(nterm)
+        labels.extend(parsed_sequence[1:-1])
+        if len(parsed_sequence) > 1 and (cterm != std_cterm or show_unmodified_termini):
+            labels.append(cterm)
+    else:
+        if len(parsed_sequence) == 1:
+            g = nterm
+            if nterm[0] == std_nterm and not show_unmodified_termini:
+                g = g[1:]
+            if nterm[-1] == std_cterm and not show_unmodified_termini:
+                g = g[:-1]
+            return ''.join(g)
+        if nterm[0] != std_nterm or show_unmodified_termini:
+            labels.append(''.join(nterm))
+        else:
+            labels.append(''.join(nterm[1:]))
+        labels.extend(''.join(g) for g in parsed_sequence[1:-1])
+        if len(parsed_sequence) > 1:
+            if cterm[-1] != std_cterm or show_unmodified_termini:
+                labels.append(''.join(cterm))
+            else:
+                labels.append(''.join(cterm[:-1]))
+    return ''.join(labels)
+
+
+tostring = to_string
+
+
+def to_proforma(sequence, **kwargs):
+    """Converts a (parsed) *modX* sequence to a basic ProForma string.
+    Modifications are represented as masses if they are given in :arg:`aa_mass`,
+    as chemical formulas (via :arg:`aa_comp`), or as names (using :arg:`mod_names`).
+
+    Parameters
+    ----------
+    sequence : str or list
+        A *modX* sequence, possibly in the parsed form.
+    aa_mass : dict, keyword only, optional
+        Used to render modifications as mass shifts.
+    aa_comp : dict, keyword only, optional
+        Used to render modifications as chemical formulas.
+    mod_names : dict or callable, keyword only, optional
+        Used to get the rendered name of modification from the mod label.
+    prefix : str, keyword only, optional
+        Prepend all modification names with the given prefix.
+
+    Returns
+    -------
+    out : str
+        A ProForma sequence.
+
+    Examples
+    --------
+    >>> to_proforma('PEPTIDE')
+    'PEPTIDE'
+    >>> to_proforma('Ac-oxMYPEPTIDE-OH', aa_mass={'Ac-': 42.010565}, mod_names={'ox': 'Oxidation'}, prefix='U:')
+    '[+42.0106]-M[U:Oxidation]YPEPTIDE'
+    >>> to_proforma('oxidationMYPEPTIDE')  # last fallback is to just capitalize the label
+    'M[Oxidation]YPEPTIDE'
+    """
+    from . import proforma
+    from .mass.mass import std_aa_mass, std_aa_comp
+
+    if isinstance(sequence, str):
+        return to_proforma(parse(sequence), **kwargs)
+
+    aa_mass = kwargs.get('aa_mass', std_aa_mass)
+    aa_comp = kwargs.get('aa_comp', std_aa_comp)
+    mod_names = kwargs.get('mod_names', {})
+    prefix = kwargs.get('prefix', '')
+
+    if isinstance(mod_names, dict):
+        get_name = mod_names.get
+    else:
+        get_name = mod_names
+
+    def get_tag(label):
+        if label in aa_mass:
+            return [proforma.MassModification(aa_mass[label])]
+        if label in aa_comp:
+            return [proforma.FormulaModification(''.join('{}{}'.format(k, v if v not in {0, 1} else '') for k, v in aa_comp[label].items()))]
+        name = get_name(label)
+        if not name:
+            warnings.warn("Unable to resolve label `{}`. "
+                "The ProForma string may be invalid. Specify `mod_names`, `aa_mass` or `aa_comp`.".format(label))
+            name = label.capitalize()
+        return [proforma.GenericModification(prefix + name)]
+
+    i, j = 0, len(sequence)
+    nterm = cterm = None
+    pro_sequence = []
+    if isinstance(sequence[0], str):  # regular parsed sequence
+        if is_term_mod(sequence[0]) and sequence[0] != std_nterm:
+            nterm = get_tag(sequence[0])
+            i = 1
+        if is_term_mod(sequence[-1]) and sequence[-1] != std_cterm:
+            cterm = get_tag(sequence[-1])
+            j -= 1
+        for label in sequence[i:j]:
+            if len(label) == 1:
+                pro_sequence.append((label, None))
+            else:
+                mod, aa = _split_label(label)
+                pro_sequence.append((aa, get_tag(mod)))
+    else:  # split sequence
+        if is_term_mod(sequence[0][0]) and sequence[0][0] != std_nterm:
+            nterm = get_tag(sequence[0][0])
+        if is_term_mod(sequence[-1][-1]) and sequence[-1][-1] != std_cterm:
+            cterm = get_tag(sequence[-1][-1])
+        if len(sequence) == 1:
+            pro_sequence = [(sequence[0][-2] if cterm else sequence[0][-1], get_tag(sequence[0][1]) if len(sequence[0]) == 4 else None)]
+        else:
+            pro_sequence.append((sequence[0][-1], get_tag(sequence[0][-2]) if len(sequence[0]) == 3 else None))
+            for group in sequence[1:-1]:
+                pro_sequence.append((group[-1], get_tag(group[0]) if len(group) == 2 else None))
+            if len(sequence[-1]) == 1 or (len(sequence[-1]) == 2 and cterm):
+                pro_sequence.append((sequence[-1][0], None))
+            else:
+                pro_sequence.append((sequence[-1][1], get_tag(sequence[-1][0])))
+
+    return proforma.to_proforma(pro_sequence, n_term=nterm, c_term=cterm)
+
+
+def amino_acid_composition(sequence, show_unmodified_termini=False, term_aa=False, allow_unknown_modifications=False, **kwargs):
+    """Calculate amino acid composition of a polypeptide.
+
+    Parameters
+    ----------
+    sequence : str or list
+        The sequence of a polypeptide or a list with a parsed sequence.
+    show_unmodified_termini : bool, optional
+        If :py:const:`True` then the unmodified N- and C-terminus are explicitly
+        shown in the returned dict. Default value is :py:const:`False`.
+    term_aa : bool, optional
+        If :py:const:`True` then the terminal amino acid residues are
+        artificially modified with `nterm` or `cterm` modification.
+        Default value is :py:const:`False`.
+    allow_unknown_modifications : bool, optional
+        If :py:const:`True` then do not raise an exception when an unknown
+        modification of a known amino acid residue is found in the sequence.
+        Default value is :py:const:`False`.
+    labels : list, optional
+        A list of allowed labels for amino acids and terminal modifications.
+
+    Returns
+    -------
+    out : dict
+        A dictionary of amino acid composition.
+
+    Examples
+    --------
+    >>> amino_acid_composition('PEPTIDE') == \
+    {'I': 1, 'P': 2, 'E': 2, 'T': 1, 'D': 1}
+    True
+    >>> amino_acid_composition('PEPTDE', term_aa=True) == \
+    {'ctermE': 1, 'E': 1, 'D': 1, 'P': 1, 'T': 1, 'ntermP': 1}
+    True
+    >>> amino_acid_composition('PEPpTIDE', labels=std_labels+['pT']) == \
+    {'I': 1, 'P': 2, 'E': 2, 'D': 1, 'pT': 1}
+    True
+    """
+    labels = kwargs.get('labels')
+
+    if isinstance(sequence, str):
+        parsed_sequence = parse(sequence, show_unmodified_termini,
+            allow_unknown_modifications=allow_unknown_modifications,
+            labels=labels)
+    elif isinstance(sequence, list):
+        if sequence and isinstance(sequence[0], tuple):
+            parsed_sequence = parse(tostring(sequence, True),
+                show_unmodified_termini,
+                allow_unknown_modifications=allow_unknown_modifications,
+                labels=labels)
+        else:
+            parsed_sequence = sequence
+    else:
+        raise PyteomicsError('Unsupported type of a sequence. '
+                'Must be str or list, not %s' % type(sequence))
+
+    aa_dict = BasicComposition()
+
+    # Process terminal amino acids.
+    if term_aa:
+        nterm_aa_position = 1 if is_term_mod(parsed_sequence[0]) else 0
+        cterm_aa_position = (
+            len(parsed_sequence) - 2 if is_term_mod(parsed_sequence[-1])
+            else len(parsed_sequence) - 1)
+        if len(parsed_sequence) > 1:
+            aa_dict['cterm' + parsed_sequence.pop(cterm_aa_position)] = 1
+        aa_dict['nterm' + parsed_sequence.pop(nterm_aa_position)] = 1
+
+    # Process core amino acids.
+    for aa in parsed_sequence:
+        aa_dict[aa] += 1
+
+    return aa_dict
+
+
+@memoize()
+def cleave(*args, **kwargs):
+    """Cleaves a polypeptide sequence using a given rule.
+
+    .. seealso::
+        :func:`icleave` and :func:`xcleave`, which produce both peptides and their indices.
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+
+        .. note::
+            The sequence is expected to be in one-letter uppercase notation.
+            Otherwise, some of the cleavage rules in :py:data:`expasy_rules`
+            will not work as expected.
+
+    rule : str or compiled regex
+        A key present in :py:data:`expasy_rules`, :py:data:`psims_rules` (or an MS ontology accession) or a
+        `regular expression <https://docs.python.org/library/re.html#regular-expression-syntax>`_
+        describing the site of cleavage. It is recommended
+        to design the regex so that it matches only the residue whose C-terminal
+        bond is to be cleaved. All additional requirements should be specified
+        using `lookaround assertions
+        <http://www.regular-expressions.info/lookaround.html>`_.
+        :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents.
+
+        .. seealso:: The `regex` argument.
+
+    missed_cleavages : int, optional
+        Maximum number of allowed missed cleavages. Defaults to 0.
+    min_length : int or None, optional
+        Minimum peptide length. Defaults to :py:const:`None`.
+
+        .. note ::
+            This checks for string length, which is only correct for one-letter
+            notation and not for full *modX*. Use :py:func:`length` manually if
+            you know what you are doing and apply :py:func:`cleave` to *modX*
+            sequences.
+
+    max_length : int or None, optional
+        Maximum peptide length. Defaults to :py:const:`None`. See note above.
+
+    semi : bool, optional
+        Include products of semi-specific cleavage. Default is :py:const:`False`.
+        This effectively cuts every peptide at every position and adds results to the output.
+
+    exception : str or compiled RE or None, optional
+        Exceptions to the cleavage rule. If specified, should be a key present in :py:data:`expasy_rules`
+        or a regular expression. Cleavage sites matching `rule` will be checked against `exception` and omitted
+        if they match.
+
+    regex : bool, optional
+        If :py:const:`True`, the cleavage rule is always interpreted as a regex. Otherwise, a matching value
+        is looked up in :py:data:`expasy_rules` and :py:data:`psims_rules`.
+
+    Returns
+    -------
+    out : set
+        A set of unique (!) peptides.
+
+    Examples
+    --------
+    >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'}
+    True
+    >>> cleave('AKAKBK', 'trypsin', 0) == {'AK', 'BK'}
+    True
+    >>> cleave('AKAKBK', 'MS:1001251', 0) == {'AK', 'BK'}
+    True
+    >>> cleave('GKGKYKCK', 'Trypsin/P', 2) == \
+    {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'}
+    True
+
+    """
+    return set(p for i, p in icleave(*args, **kwargs))
+
+
+def icleave(sequence, rule, missed_cleavages=0, min_length=None, max_length=None, semi=False, exception=None, regex=False):
+    """Like :py:func:`cleave`, but the result is an iterator and includes peptide indices.
+    Refer to :py:func:`cleave` for explanation of parameters.
+
+    Returns
+    -------
+    out : iterator
+        An iterator over (index, sequence) pairs.
+
+    """
+    if not regex:
+        if rule in expasy_rules:
+            rule = expasy_rules[rule]
+        elif rule in psims_rules:
+            rule = psims_rules[rule]
+        elif rule in _psims_index:
+            rule = _psims_index[rule]
+        elif re.search(r'[a-z]', rule):
+            warnings.warn('Interpreting the rule as a regular expression: {}. Did you mistype the rule? '
+                'Specify `regex=True` to silence this warning.'.format(rule))
+    exception = expasy_rules.get(exception, exception)
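+    # Keep a sliding window of the last `ml` cleavage positions: pairing each
+    # older position in the window with the newest one yields peptides with
+    # 0 to `missed_cleavages` missed cleavages.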
+    ml = missed_cleavages + 2
+    trange = range(ml)
+    cleavage_sites = deque([0], maxlen=ml)
+    if min_length is None:
+        min_length = 1
+    if max_length is None:
+        max_length = len(sequence)
+    cl = 1
+    if exception is not None:
+        exceptions = {x.end() for x in re.finditer(exception, sequence)}
+    for end in it.chain([x.end() for x in re.finditer(rule, sequence)], [None]):
+        if exception is not None and end in exceptions:
+            continue
+        cleavage_sites.append(end)
+        if cl < ml:
+            cl += 1
+        for j in trange[:cl - 1]:
+            seq = sequence[cleavage_sites[j]:cleavage_sites[-1]]
+            lenseq = len(seq)
+            if end is not None:
+                start = end - lenseq
+            else:
+                start = len(sequence) - lenseq
+            if seq and min_length <= lenseq <= max_length:
+                yield (start, seq)
+                if semi:
+                    for k in range(min_length, min(lenseq, max_length)):
+                        yield (start, seq[:k])
+                    for k in range(max(1, lenseq - max_length), lenseq - min_length + 1):
+                        yield (start + k, seq[k:])
+
+
+def xcleave(*args, **kwargs):
+    """Like :py:func:`icleave`, but returns a list.
+
+    Returns
+    -------
+    out : list
+        A list of (index, sequence) pairs.
+
+    Examples
+    --------
+    >>> xcleave('AKAKBK', 'trypsin', 1)
+    [(0, 'AK'), (0, 'AKAK'), (2, 'AK'), (2, 'AKBK'), (4, 'BK')]
+    """
+    return list(icleave(*args, **kwargs))
+
+
+def num_sites(sequence, rule, **kwargs):
+    """Count the number of sites where `sequence` can be cleaved using
+    the given `rule` (e.g. number of miscleavages for a peptide).
+
+    Parameters
+    ----------
+    sequence : str
+        The sequence of a polypeptide.
+    rule : str or compiled regex
+        A key present in :py:data:`expasy_rules`, :py:data:`psims_rules`
+        (or an MS ontology accession) or a regular expression describing
+        the site of cleavage. See :py:func:`cleave` for details.
+    exception : str or compiled RE or None, optional
+        Exceptions to the cleavage rule. If specified, should be a key present in
+        :py:data:`expasy_rules` or a regular expression. Cleavage sites matching
+        `rule` will be checked against `exception` and omitted if they match.
+
+    Returns
+    -------
+    out : int
+        Number of cleavage sites.
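+
+    Examples
+    --------
+    A peptide with two internal tryptic cleavage sites:
+
+    >>> num_sites('RKCDE', 'trypsin')
+    2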
+    """
+    return sum(1 for _ in icleave(sequence, rule, **kwargs)) - 1
+
+
+expasy_rules = {
+    'arg-c':         r'R',
+    'asp-n':         r'\w(?=D)',
+    'bnps-skatole':  r'W',
+    'caspase 1':     r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])',
+    'caspase 2':     r'(?<=DVA)D(?=[^PEDQKR])',
+    'caspase 3':     r'(?<=DMQ)D(?=[^PEDQKR])',
+    'caspase 4':     r'(?<=LEV)D(?=[^PEDQKR])',
+    'caspase 5':     r'(?<=[LW]EH)D',
+    'caspase 6':     r'(?<=VE[HI])D(?=[^PEDQKR])',
+    'caspase 7':     r'(?<=DEV)D(?=[^PEDQKR])',
+    'caspase 8':     r'(?<=[IL]ET)D(?=[^PEDQKR])',
+    'caspase 9':     r'(?<=LEH)D',
+    'caspase 10':    r'(?<=IEA)D',
+    'chymotrypsin high specificity': r'([FY](?=[^P]))|(W(?=[^MP]))',
+    'chymotrypsin low specificity':
+        r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))',
+    'clostripain':   r'R',
+    'cnbr':          r'M',
+    'enterokinase':  r'(?<=[DE]{3})K',
+    'factor xa':     r'(?<=[AFGILTVM][DE]G)R',
+    'formic acid':   r'D',
+    'glutamyl endopeptidase': r'E',
+    'granzyme b':    r'(?<=IEP)D',
+    'hydroxylamine': r'N(?=G)',
+    'iodosobenzoic acid': r'W',
+    'lysc':          r'K',
+    'ntcb':          r'\w(?=C)',
+    'pepsin ph1.3':  r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|'
+                     r'((?<=[^HKR][^P])[FL](?=\w[^P]))',
+    'pepsin ph2.0':  r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|'
+                     r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))',
+    'proline endopeptidase': r'(?<=[HKR])P(?=[^P])',
+    'proteinase k':  r'[AEFILTVWY]',
+    'staphylococcal peptidase i': r'(?<=[^E])E',
+    'thermolysin':   r'[^DE](?=[AFILMV][^P])',
+    'thrombin':      r'((?<=G)R(?=G))|'
+                     r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))',
+    'trypsin':       r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))',
+    'trypsin_exception': r'((?<=[CD])K(?=D))|((?<=C)K(?=[HY]))|((?<=C)R(?=K))|((?<=R)R(?=[HR]))',
+}
+"""
+This dict contains regular expressions for cleavage rules of the most
+popular proteolytic enzymes. The rules were taken from the
+`PeptideCutter tool
+<http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_
+at Expasy.
+
+.. note::
+    'trypsin_exception' can be used as `exception` argument when calling
+    :py:func:`cleave` with 'trypsin' `rule`::
+
+        >>> parser.cleave('PEPTIDKDE', parser.expasy_rules['trypsin'])
+        {'DE', 'PEPTIDK'}
+        >>> parser.cleave('PEPTIDKDE', parser.expasy_rules['trypsin'], \
+exception=parser.expasy_rules['trypsin_exception'])
+        {'PEPTIDKDE'}
+"""
+
+
+psims_rules = {
+    cvstr('2-iodobenzoate', 'MS:1001918'): r'(?<=W)',
+    cvstr('Arg-C', 'MS:1001303'): r'(?<=R)(?!P)',
+    cvstr('Asp-N', 'MS:1001304'): r'(?=[BD])',
+    cvstr('Asp-N ambic', 'MS:1001305'): r'(?=[DE])',
+    cvstr('CNBr', 'MS:1001307'): r'(?<=M)',
+    cvstr('Chymotrypsin', 'MS:1001306'): r'(?<=[FYWL])(?!P)',
+    cvstr('Formic acid', 'MS:1001308'): r'((?<=D))|((?=D))',
+    cvstr('Lys-C', 'MS:1001309'): r'(?<=K)(?!P)',
+    cvstr('Lys-C/P', 'MS:1001310'): r'(?<=K)',
+    cvstr('PepsinA', 'MS:1001311'): r'(?<=[FL])',
+    cvstr('TrypChymo', 'MS:1001312'): r'(?<=[FYWLKR])(?!P)',
+    cvstr('Trypsin', 'MS:1001251'): r'(?<=[KR])(?!P)',
+    cvstr('Trypsin/P', 'MS:1001313'): r'(?<=[KR])',
+    cvstr('V8-DE', 'MS:1001314'): r'(?<=[BDEZ])(?!P)',
+    cvstr('V8-E', 'MS:1001315'): r'(?<=[EZ])(?!P)',
+    cvstr('glutamyl endopeptidase', 'MS:1001917'): r'(?<=[^E]E)',
+    cvstr('leukocyte elastase', 'MS:1001915'): r'(?<=[ALIV])(?!P)',
+    cvstr('proline endopeptidase', 'MS:1001916'): r'(?<=[HKR]P)(?!P)',
+}
+"""
+This dict contains regular expressions for cleavage rules of the most
+popular proteolytic enzymes. The rules were taken from the PSI `MS ontology
+<http://purl.obolibrary.org/obo/MS_1001045>`_.
+
+You can use names or accessions to access the rules.
+Use :py:func:`pyteomics.auxiliary.cvquery` for accession access::
+
+    >>> from pyteomics.auxiliary import cvquery
+    >>> from pyteomics.parser import psims_rules
+    >>> cvquery(psims_rules, 'MS:1001918')
+    '(?<=W)'
+
+"""
+
+_psims_index = cvquery(psims_rules)
+
+
+def isoforms(sequence, **kwargs):
+    """
+    Apply variable and fixed modifications to the polypeptide and yield
+    the unique modified sequences.
+
+    Parameters
+    ----------
+
+    sequence : str
+        Peptide sequence to modify.
+
+    variable_mods : dict, optional
+        A dict of variable modifications in the following format:
+        :py:const:`{'label1': ['X', 'Y', ...], 'label2': ['X', 'A', 'B', ...]}`
+
+        Keys in the dict are modification labels (terminal modifications allowed).
+        Values are iterables of residue labels (one letter each) or
+        :py:const:`True`. If a value for a modification is :py:const:`True`,
+        it is applicable to any residue (useful for terminal modifications).
+        You can use values such as 'ntermX' or 'ctermY' to specify that a
+        modification only occurs when the residue is in the terminal position.
+        This is *not needed* for terminal modifications.
+
+        .. note:: Several variable modifications can occur on amino acids of the
+                  same type, but in the output each amino acid residue will be
+                  modified at most once (apart from terminal modifications).
+
+    fixed_mods : dict, optional
+        A dict of fixed modifications in the same format.
+
+        **Note**: if a residue is affected by a fixed modification, no variable
+        modifications will be applied to it (apart from terminal modifications).
+
+    labels : list, optional
+        A list of amino acid labels containing all the labels present in
+        `sequence`. Modified entries will be added automatically.
+        Defaults to :py:data:`std_labels`.
+        Not required since version 2.5.
+
+    max_mods : int or None, optional
+        Number of modifications that can occur simultaneously on a peptide,
+        excluding fixed modifications. If :py:const:`None` or if ``max_mods``
+        is greater than the number of modification sites, all possible
+        isoforms are generated. Default is :py:const:`None`.
+
+    override : bool, optional
+        Defines how to handle the residues that are modified in the input.
+        :py:const:`False` means that they will be preserved (default).
+        :py:const:`True` means they will be treated as unmodified.
+
+    show_unmodified_termini : bool, optional
+        If :py:const:`True` then the unmodified N- and C-termini are explicitly
+        shown in the returned sequences. Default value is :py:const:`False`.
+
+    format : str, optional
+        If :py:const:`'str'` (default), an iterator over sequences is returned.
+        If :py:const:`'split'`, the iterator will yield results in the same
+        format as :py:func:`parse` with the 'split' option, with unmodified
+        terminal groups shown.
+
+    Returns
+    -------
+
+    out : iterator over strings or lists
+        All possible unique polypeptide sequences resulting from
+        the specified modifications are yielded one by one.
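+
+    Examples
+    --------
+    Minimal examples with arbitrary modification labels 'p' and 'x':
+
+    >>> sorted(isoforms('PEPTIDE', variable_mods={'p': ['T']}))
+    ['PEPTIDE', 'PEPpTIDE']
+    >>> list(isoforms('PEPTIDE', fixed_mods={'x': ['P']}))
+    ['xPExPTIDE']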
+    """
+    def main(group):  # return the index and the residue (capital letter) in `group`
+        if group[-1][0] == '-':
+            i = -2
+        else:
+            i = -1
+        return len(group) + i, group[i]
+
+    def apply_mod(label, mod):
+        # `label` is assumed to be a tuple (see split option of `parse`)
+        # unmodified termini are assumed shown
+        # if the modification is not applicable, `None` is returned
+        group = list(label)
+        m = main(group)[0]
+        c = True  # whether the change is applied in the end
+        if m == 0 and not is_term_mod(mod):
+            group.insert(0, mod)
+        elif mod[0] == '-' and (group[-1] == std_cterm or (group[-1][0] == '-' and override)):
+            group[-1] = mod
+        elif mod[-1] == '-' and (group[0] == std_nterm or (group[0][-1] == '-' and override)):
+            group[0] = mod
+        elif not is_term_mod(mod):
+            if m and group[m - 1][-1] != '-':
+                if override:
+                    group[m - 1] = mod
+                else:
+                    c = False
+            else:
+                group.insert(m, mod)
+        else:
+            c = False
+        if c:
+            return tuple(group)
+
+    variable_mods = kwargs.get('variable_mods', {})
+    varmods_term, varmods_non_term = [], []
+    for m, r in sorted(variable_mods.items()):
+        if is_term_mod(m):
+            varmods_term.append((m, r))
+        else:
+            varmods_non_term.append((m, r))
+    fixed_mods = kwargs.get('fixed_mods', {})
+    parse_kw = {}
+    if 'labels' in kwargs:
+        parse_kw['labels'] = list(kwargs['labels']) + list(fixed_mods)
+    parsed = parse(sequence, True, True, **parse_kw)
+    override = kwargs.get('override', False)
+    show_unmodified_termini = kwargs.get('show_unmodified_termini', False)
+    max_mods = kwargs.get('max_mods')
+    format_ = kwargs.get('format', 'str')
+
+    # Apply fixed modifications
+    for cmod, res in fixed_mods.items():
+        for i, group in enumerate(parsed):
+            if res is True or main(group)[1] in res:
+                parsed[i] = apply_mod(group, cmod) or parsed[i]
+
+    # Create a list of possible states for each group
+    # Start with N-terminal mods and regular mods on the N-terminal residue
+    states = [[parsed[0]]]
+    m0 = main(parsed[0])[1]
+    for m, r in varmods_non_term:
+        if r is True or m0 in r or 'nterm' + m0 in r or len(parsed) == 1 and 'cterm' + m0 in r:
+            applied = apply_mod(parsed[0], m)
+            if applied is not None:
+                states[0].append(applied)
+    more_states = []
+    for m, r in varmods_term:
+        if r is True or m0 in r:
+            if m[-1] == '-' or len(parsed) == 1:
+                for group in states[0]:
+                    applied = apply_mod(group, m)
+                    if applied is not None:
+                        more_states.append(applied)
+    states[0].extend(more_states)
+
+    # Continue with regular mods
+    for group in parsed[1:-1]:
+        gstates = [group]
+        for m, r in varmods_non_term:
+            if r is True or group[-1] in r:
+                applied = apply_mod(group, m)
+                if applied is not None:
+                    gstates.append(applied)
+        states.append(gstates)
+
+    # Finally add C-terminal mods and regular mods on the C-terminal residue
+    if len(parsed) > 1:
+        states.append([parsed[-1]])
+        m1 = main(parsed[-1])[1]
+        for m, r in varmods_non_term:
+            if r is True or m1 in r or 'cterm' + m1 in r or len(parsed) == 1 and 'nterm' + m1 in r:
+                applied = apply_mod(parsed[-1], m)
+                if applied is not None:
+                    states[-1].append(applied)
+        more_states = []
+        for m, r in varmods_term:
+            if r is True or m1 in r:
+                if m[0] == '-' or len(parsed) == 1:
+                    for group in states[-1]:
+                        applied = apply_mod(group, m)
+                        if applied is not None:
+                            more_states.append(applied)
+        states[-1].extend(more_states)
+
+    sites = [s for s in enumerate(states) if len(s[1]) > 1]
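+    # `sites` holds the indices of groups with more than one possible state.
+    # When `max_mods` caps the number of simultaneous variable modifications,
+    # enumerate combinations of at most `max_mods` modified sites instead of
+    # taking the full Cartesian product of all states.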
+    if max_mods is None or max_mods > len(sites):
+        possible_states = it.product(*states)
+    else:
+        def state_lists():
+            for m in range(max_mods + 1):
+                for comb in it.combinations(sites, m):
+                    skel = [[s[0]] for s in states]
+                    for i, e in comb:
+                        skel[i] = e[1:]
+                    yield skel
+        possible_states = it.chain.from_iterable(it.product(*skel) for skel in state_lists())
+
+    if format_ == 'split':
+        def strip_std_terms():
+            for ps in possible_states:
+                ps = list(ps)
+                if not show_unmodified_termini:
+                    if ps[0][0] == std_nterm:
+                        ps[0] = ps[0][1:]
+                    if ps[-1][-1] == std_cterm:
+                        ps[-1] = ps[-1][:-1]
+                yield ps
+        return strip_std_terms()
+    elif format_ == 'str':
+        return (tostring(form, show_unmodified_termini)
+            for form in possible_states)
+    else:
+        raise PyteomicsError('Unsupported value of "format": {}'.format(format_))
+
+
+def coverage(protein, peptides):
+    """Calculate how much of `protein` is covered by `peptides`.
+    Peptides can overlap. If a peptide is found multiple times in `protein`,
+    it contributes more to the overall coverage.
+
+    Requires :py:mod:`numpy`.
+
+    .. note::
+        Modifications and terminal groups are discarded.
+
+    Parameters
+    ----------
+    protein : str
+        A protein sequence.
+    peptides : iterable
+        An iterable of peptide sequences.
+
+    Returns
+    -------
+    out : float
+        The sequence coverage, between 0 and 1.
+
+    Examples
+    --------
+    >>> coverage('PEPTIDES'*100, ['PEP', 'EPT'])
+    0.5
+    """
+    import numpy as np
+    protein = re.sub(r'[^A-Z]', '', protein)
+    mask = np.zeros(len(protein), dtype=np.int8)
+    for peptide in peptides:
+        indices = [m.start() for m in re.finditer(
+            '(?={})'.format(re.sub(r'[^A-Z]', '', peptide)), protein)]
+        for i in indices:
+            mask[i:i + len(peptide)] = 1
+    return mask.sum(dtype=float) / mask.size
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/pyteomics/peff.py b/pyteomics/peff.py
new file mode 100644
index 0000000000000000000000000000000000000000..90ffcc5a992256521f2e01af7822ec80406cfe0f
--- /dev/null
+++ b/pyteomics/peff.py
@@ -0,0 +1,277 @@
+"""
+peff - PSI Extended FASTA Format
+================================
+
+PEFF is a forthcoming standard from PSI-HUPO formalizing and extending the
+encoding of protein features and annotations for building search spaces for
+proteomics. See `The PEFF specification <http://www.psidev.info/peff>`_ for
+more up-to-date information on the standard.
+
+Data manipulation
+-----------------
+
+Classes
+.......
+
+The PEFF parser inherits several properties from the implementation in the
+:mod:`~.fasta` module, building on top of the :class:`~.TwoLayerIndexedFASTA` reader.
+
+Available classes:
+
+  :py:class:`IndexedPEFF` - Parse a PEFF format file in binary mode, supporting
+  direct indexing by header string or by tag.
+
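+A minimal usage sketch (``db.peff`` is a placeholder path; any local PEFF
+file should work)::
+
+    from pyteomics import peff
+    reader = peff.IndexedPEFF('db.peff')
+    for entry in reader:
+        # entry.description is a parsed Header; its Tag holds the unique
+        # accession from the definition line
+        print(entry.description.Tag, len(entry.sequence))
+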
+"""
+
+#   Copyright 2018 Joshua Klein, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import re
+try:
+    from collections.abc import Sequence as SequenceABC, Mapping
+except ImportError:
+    from collections import Sequence as SequenceABC, Mapping
+from collections import OrderedDict, defaultdict
+
+from .fasta import TwoLayerIndexedFASTA
+
+
+class Header(Mapping):
+    """Hold parsed properties of a key-value pair like a sequence's
+    definition line.
+
+    This object supports the :class:`Mapping` interface, and
+    keys may be accessed by attribute access notation.
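+
+    For example, with a hand-made mapping:
+
+    >>> h = Header({'Tag': 'NX_P46777-1', 'PName': 'ribosomal protein'})
+    >>> h.PName
+    'ribosomal protein'
+    >>> h['Tag']
+    'NX_P46777-1'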
+    """
+    def __init__(self, mapping, original=None):
+        self._mapping = mapping
+
+    def __getitem__(self, key):
+        return self._mapping[key]
+
+    def __iter__(self):
+        return iter(self._mapping)
+
+    def items(self):
+        return self._mapping.items()
+
+    def keys(self):
+        return self._mapping.keys()
+
+    def values(self):
+        return self._mapping.values()
+
+    def __len__(self):
+        return len(self._mapping)
+
+    def __contains__(self, key):
+        return key in self._mapping
+
+    def __getattr__(self, key):
+        if key == "_mapping":
+            raise AttributeError(key)
+        try:
+            return self._mapping[key]
+        except KeyError:
+            raise AttributeError(key)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({mapping})".format(self=self, mapping=dict(self._mapping))
+
+    def __hash__(self):
+        return hash(self.defline)
+
+    def __eq__(self, other):
+        try:
+            return self._mapping == other._mapping
+        except AttributeError:
+            return str(self) == str(other)
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __dir__(self):
+        base = set(dir(super(Header, self)))
+        keys = set(self._mapping.keys())
+        return list(base | keys)
+
+
+class IndexedPEFF(TwoLayerIndexedFASTA):
+    """Creates an :py:class:`IndexedPEFF` object.
+
+    Parameters
+    ----------
+    source : str or file
+        The file to read. If a file object, it must be opened in *rb* mode.
+    parse : bool, optional
+        Defines whether the descriptions should be parsed in the produced tuples.
+        Default is :py:const:`True`.
+    kwargs : passed to the :py:class:`TwoLayerIndexedFASTA` constructor.
+    """
+
+    kv_pattern = re.compile(r"\\(?P<key>\S+)=(?P<value>.+?)(?:\s(?=\\)|$)")
+    header_pattern = re.compile(r"^>?(\S+):(\S+)")
+    has_feature_index = re.compile(r"^\(?(\d+):")
+    header_group = 2
+
+    class _PEFFFeature(SequenceABC):
+        def __init__(self, *fields, **kwargs):
+            self.fields = tuple(fields)
+            self.id = kwargs.get('id')
+            self.feature_type = kwargs.get("feature_type")
+
+        def __eq__(self, other):
+            return tuple(self) == tuple(other)
+
+        def __ne__(self, other):
+            return not (self == other)
+
+        def __getitem__(self, i):
+            return self.fields[i]
+
+        def __len__(self):
+            return len(self.fields)
+
+        def __repr__(self):
+            return repr(tuple(self))
+
+        def __str__(self):
+            return "(%s%s)" % (
+                '%r:' % self.id if self.id is not None else '',
+                '|'.join(map(str, self)), )
+
+    def __init__(self, source, ignore_comments=False, **kwargs):
+        super(IndexedPEFF, self).__init__(
+            source, ignore_comments=ignore_comments, parser=self.parser,
+            header_pattern=self.header_pattern, **kwargs)
+        self.header_blocks = []
+        self.comments = []
+        self.version = None
+        self.number_of_entries = 0
+        self._parse_header()
+
+    def _parse_header(self):
+        self.seek(0)
+        line = self.readline().decode("ascii")
+        if not line.startswith("# PEFF"):
+            raise ValueError("Not a PEFF File")
+        self.version = tuple(map(int, line.strip()[7:].split(".")))
+        current_block = defaultdict(list)
+        in_header = True
+        while in_header:
+            line = self.readline().decode("ascii")
+            if not line.startswith("#"):
+                in_header = False
+            line = line.strip()[2:]
+            if '=' in line:
+                key, value = line.split("=", 1)
+                if key == "GeneralComment":
+                    self.comments.append(value)
+                else:
+                    current_block[key].append(value)
+            if line.startswith("//"):
+                if current_block:
+                    self.header_blocks.append(
+                        Header(OrderedDict((k, v if len(v) > 1 else v[0])
+                                           for k, v in current_block.items())))
+                current_block = defaultdict(list)
+        number_of_entries = 0
+        for block in self.header_blocks:
+            try:
+                number_of_entries += int(block['NumberOfEntries'])
+            except KeyError:
+                pass
+        self.number_of_entries = number_of_entries
+
+    def _extract_parenthesis_list(self, text):
+        chunks = []
+        chunk = []
+        paren_level = 0
+        i = 0
+        n = len(text)
+        while i < n:
+            c = text[i]
+            i += 1
+            if c == "(":
+                if paren_level > 0:
+                    chunk.append(c)
+                paren_level += 1
+            elif c == ")":
+                if paren_level > 1:
+                    chunk.append(c)
+                paren_level -= 1
+                if paren_level == 0:
+                    if chunk:
+                        chunks.append(chunk)
+                    chunk = []
+            else:
+                chunk.append(c)
+        chunks = list(map(''.join, chunks))
+        return chunks
+
+    def _split_pipe_separated_tuple(self, text):
+        parts = text.split("|")
+        return parts
+
+    def _coerce_types(self, key, value):
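+        # Strip an optional leading feature index (e.g. "1:"), split
+        # pipe-separated tuples into a _PEFFFeature, and coerce each scalar
+        # to int or float where possible (see _coerce_value).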
+        value = value.strip()
+        feature_id_match = self.has_feature_index.search(value)
+        if feature_id_match:
+            feature_id = int(feature_id_match.group(1))
+            value = self.has_feature_index.sub('', value)
+        else:
+            feature_id = None
+        if "|" in value:
+            value = self._split_pipe_separated_tuple(value)
+            result = []
+            for i, v in enumerate(value):
+                result.append(self._coerce_value(key, v, i))
+            return self._PEFFFeature(*result, feature_type=key, id=feature_id)
+        else:
+            return self._coerce_value(key, value, 0)
+
+    def _coerce_value(self, key, value, index):
+        try:
+            return int(value)
+        except ValueError:
+            pass
+        try:
+            return float(value)
+        except ValueError:
+            pass
+        return str(value)
+
+    def parser(self, line):
+        match = self.header_pattern.match(line)
+        if not match:
+            raise ValueError(
+                "Failed to parse {!r} using {!r}".format(
+                    line, self))
+        storage = OrderedDict()
+        prefix = None
+        db_uid = None
+        if line.startswith(">"):
+            line = line[1:]
+        prefix, line = line.split(":", 1)
+        db_uid, line = line.split(" ", 1)
+        storage['Prefix'] = prefix
+        storage['Tag'] = db_uid
+        for key, value in self.kv_pattern.findall(line):
+            if not (value.startswith("(") or " (" in value):
+                storage[key] = self._coerce_types(key, value)
+            else:
+                # multi-value
+                storage[key] = [self._coerce_types(key, v) for v in self._extract_parenthesis_list(value)]
+        return Header(storage)
diff --git a/pyteomics/pepxml.py b/pyteomics/pepxml.py
new file mode 100644
index 0000000000000000000000000000000000000000..813f5749fcabc19c150717230c1e85e0b570c4c2
--- /dev/null
+++ b/pyteomics/pepxml.py
@@ -0,0 +1,573 @@
+"""
+pepxml - pepXML file reader
+===========================
+
+Summary
+-------
+
+`pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_
+was the first widely accepted format for proteomics search engines' output.
+Even though it is being replaced by the community standard
+`mzIdentML <http://www.psidev.info/index.php?q=node/454>`_, it is still
+in common use.
+
+This module provides minimalistic infrastructure for access to data stored in
+pepXML files. The most important function is :py:func:`read`, which
+reads peptide-spectrum matches and related information and saves them into
+human-readable dicts. This function relies on the terminology of the underlying
+`lxml library <http://lxml.de/>`_.
+
+Data access
+-----------
+
+  :py:class:`PepXML` - a class representing a single pepXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through peptide-spectrum matches in a pepXML
+  file. Data for a single spectrum are converted to an easy-to-use dict.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`DataFrame` - read pepXML files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+  :py:func:`filter` - filter PSMs from a chain of pepXML files to a specific FDR
+  using TDA.
+
+  :py:func:`filter.chain` - chain a series of filters applied independently to
+  several files.
+
+  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+  independently to an iterable of files.
+
+  :py:func:`filter_df` - filter pepXML files and return a :py:class:`pandas.DataFrame`.
+
+  :py:func:`fdr` - estimate the false discovery rate of a PSM set using the
+  target-decoy approach.
+
+  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
+  set using the target-decoy approach.
+
+  :py:func:`is_decoy` - determine whether a PSM is decoy or not.
+
+Miscellaneous
+-------------
+
+  :py:func:`roc_curve` - get a receiver operating characteristic (ROC) curve
+  (minimum PeptideProphet probability in a sample vs. false discovery rate)
+  of PeptideProphet analysis.
+
+Deprecated functions
+--------------------
+
+  :py:func:`iterfind` - iterate over elements in a pepXML file.
+  You can just call the corresponding method of the :py:class:`PepXML`
+  object.
+
+  :py:func:`version_info` - get information about pepXML version and schema.
+  You can just read the corresponding attribute of the :py:class:`PepXML`
+  object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from lxml import etree
+from . import xml, auxiliary as aux, _schema_defaults
+
+
+class PepXML(xml.MultiProcessingXML, xml.IndexSavingXML):
+    """Parser class for pepXML files."""
+    file_format = 'pepXML'
+    _root_element = 'msms_pipeline_analysis'
+    _default_schema = _schema_defaults._pepxml_schema_defaults
+    _default_version = '1.15'
+    _default_iter_tag = 'spectrum_query'
+    _indexed_tags = {'spectrum_query'}
+    _indexed_tag_keys = {'spectrum_query': 'spectrum'}
+    _default_id_attr = 'spectrum'
+    _structures_to_flatten = {'search_score_summary', 'modification_info'}
+    # attributes which contain unconverted values
+    _convert_items = {'float': {'calc_neutral_pep_mass', 'massdiff',
+            'probability', 'variable', 'static'},
+        'int': {'start_scan', 'end_scan', 'index', 'num_matched_peptides'},
+        'bool': {'is_rejected'},
+        'floatarray': {'all_ntt_prob'}}.items()
+
+    def _get_info_smart(self, element, **kwargs):
+        """Extract the info in a smart way depending on the element type"""
+        try:
+            name = kwargs.pop('ename')
+        except KeyError:
+            name = xml._local_name(element)
+        rec = kwargs.pop('recursive', None)
+        if name == self._root_element:
+            info = self._get_info(element, ename=name, recursive=(rec if rec is not None else False), **kwargs)
+        else:
+            info = self._get_info(element, ename=name, recursive=(rec if rec is not None else True), **kwargs)
+
+        def safe_float(s):
+            try:
+                return float(s)
+            except ValueError:
+                if s.startswith('+-0'):
+                    return 0
+                return s
+
+        converters = {'float': safe_float, 'int': int,
+                'bool': lambda x: x.lower() in {'1', 'true'},
+                'floatarray': lambda x: list(map(float, x[1:-1].split(',')))}
+        for k, v in dict(info).items():
+            for t, s in self._convert_items:
+                if k in s:
+                    del info[k]
+                    info[k] = converters[t](v)
+        for k in {'search_score', 'parameter'}:
+            if k in info and isinstance(info[k], list) and all(
+                    isinstance(x, dict) and len(x) == 1 for x in info[k]):
+                scores = {}
+                for score in info[k]:
+                    name, value = score.popitem()
+                    try:
+                        scores[name] = float(value)
+                    except ValueError:
+                        scores[name] = value
+                info[k] = scores
+        if 'search_result' in info and len(info['search_result']) == 1:
+            info.update(info['search_result'][0])
+            del info['search_result']
+        if 'protein' in info and 'peptide' in info:
+            info['proteins'] = [{'protein': info.pop('protein'),
+                'protein_descr': info.pop('protein_descr', None)}]
+            for add_key in {'peptide_prev_aa', 'peptide_next_aa', 'protein_mw'}:
+                if add_key in info:
+                    info['proteins'][0][add_key] = info.pop(add_key)
+            info['proteins'][0]['num_tol_term'] = info.pop('num_tol_term', 0)
+            if 'alternative_protein' in info:
+                info['proteins'].extend(info['alternative_protein'])
+                del info['alternative_protein']
+        if 'peptide' in info and 'modified_peptide' not in info:
+            info['modified_peptide'] = info['peptide']
+        if 'peptide' in info:
+            info['modifications'] = info.pop('mod_aminoacid_mass', [])
+            if 'mod_nterm_mass' in info:
+                info['modifications'].insert(0, {'position': 0,
+                    'mass': float(info.pop('mod_nterm_mass'))})
+            if 'mod_cterm_mass' in info:
+                info['modifications'].append({'position': 1 + len(info['peptide']),
+                    'mass': float(info.pop('mod_cterm_mass'))})
+        if 'modified_peptide' in info and info['modified_peptide'] == info.get(
+                'peptide'):
+            if not info.get('modifications'):
+                info['modifications'] = []
+            else:
+                mp = info['modified_peptide']
+                for mod in sorted(info['modifications'],
+                        key=lambda m: m['position'],
+                        reverse=True):
+                    if mod['position'] not in {0, 1+len(info['peptide'])}:
+                        p = mod['position']
+                        mp = mp[:p] + '[{}]'.format(int(mod['mass'])) + mp[p:]
+                info['modified_peptide'] = mp
+        if 'search_hit' in info:
+            info['search_hit'].sort(key=lambda x: x['hit_rank'])
+        return info
+
+
+def read(source, read_schema=False, iterative=True, **kwargs):
+    """Parse `source` and iterate through peptide-spectrum matches.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target pepXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the pepXML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you want to avoid the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    Returns
+    -------
+    out : PepXML
+        An iterator over dicts with PSM properties.
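+
+    Examples
+    --------
+    A minimal usage sketch (``tests/test.pep.xml`` is a placeholder path)::
+
+        with read('tests/test.pep.xml') as psms:
+            for psm in psms:
+                print(psm['spectrum'])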
+    """
+
+    return PepXML(source, read_schema=read_schema, iterative=iterative, **kwargs)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create an :py:class:`PepXML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, keyword only, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, keyword only, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. Default value is :py:const:`True`.
+
+    read_schema : bool, keyword only, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the pepXML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you want to avoid the related warnings.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return PepXML(source, **kwargs).iterfind(path, **kwargs)
+
+
+version_info = xml._make_version_info(PepXML)
+
+
+def roc_curve(source):
+    """Parse source and return a ROC curve for peptideprophet analysis.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target pepXML file or the file object itself.
+
+    Returns
+    -------
+    out : list
+        A list of ROC points.
+    """
+
+    parser = etree.XMLParser(remove_comments=True, ns_clean=True)
+    tree = etree.parse(source, parser=parser)
+
+    roc_curve = []
+    for roc_error_data in tree.xpath(
+        "/*[local-name()='msms_pipeline_analysis'] \
+        //*[local-name()='analysis_summary' and @analysis='peptideprophet'] \
+        //*[local-name()='peptideprophet_summary'] \
+        //*[local-name()='roc_error_data']"):
+        for element in roc_error_data.xpath("*[local-name()='roc_data_point' or local-name()='error_point']"):
+            data_point = dict(element.attrib)
+            for key in data_point:
+                data_point[key] = float(data_point[key])
+            data_point["charge"] = roc_error_data.attrib["charge"]
+            data_point["tag"] = etree.QName(element).localname
+            roc_curve.append(data_point)
+
+    return roc_curve
+
+
+chain = aux.ChainBase._make_chain(read)
+
+
+def _is_decoy_prefix(psm, prefix='DECOY_'):
+    """Given a PSM dict, return :py:const:`True` if all protein names for
+    the PSM start with ``prefix``, and :py:const:`False` otherwise. This
+    function might not work for some pepXML flavours. Use the source to get the
+    idea and suit it to your needs.
+
+    Parameters
+    ----------
+    psm : dict
+        A dict, as yielded by :py:func:`read`.
+    prefix : str, optional
+        A prefix used to mark decoy proteins. Default is `'DECOY_'`.
+
+    Returns
+    -------
+    out : bool
+    """
+    return all(protein['protein'].startswith(prefix)
+            for protein in psm['search_hit'][0]['proteins'])
+
+
+def _is_decoy_suffix(psm, suffix='_DECOY'):
+    return all(protein['protein'].endswith(suffix)
+            for protein in psm['search_hit'][0]['proteins'])
+
+
+is_decoy = _is_decoy_prefix
+fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix)
+_key = lambda x: min(sh['search_score']['expect'] for sh in x['search_hit'])
+qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, _key)
+filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, _key, qvalues)
+filter.chain = aux._make_chain(filter, 'filter', True)
+
+
+def DataFrame(*args, **kwargs):
+    """Read pepXML output files into a :py:class:`pandas.DataFrame`.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    *args
+        pepXML file names or objects. Passed to :py:func:`chain`.
+
+    **kwargs
+        Passed to :py:func:`chain`.
+
+    sep : str or None, keyword only, optional
+        Some values related to PSMs (such as protein information) are variable-length
+        lists. If `sep` is a :py:class:`str`, they will be packed into single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+
+    recursive : bool, keyword only, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, keyword only, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. Default value is :py:const:`True`.
+
+    read_schema : bool, keyword only, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the pepXML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you want to avoid the related warnings.
+
+    pd_kwargs : dict, optional
+        Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor.
+
+    Returns
+    -------
+    out : pandas.DataFrame
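+
+    Examples
+    --------
+    A minimal usage sketch (the file name is hypothetical):
+
+    >>> df = DataFrame('interact.pep.xml', sep=';')  # doctest: +SKIP
+    >>> df['peptide'].head()  # doctest: +SKIP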
+    """
+    import pandas as pd
+    kwargs = kwargs.copy()
+    sep = kwargs.pop('sep', None)
+    pd_kwargs = kwargs.pop('pd_kwargs', {})
+    def gen_items():
+        with chain(*args, **kwargs) as f:
+            for item in f:
+                info = {}
+                for k, v in item.items():
+                    if isinstance(v, (str, int, float)):
+                        info[k] = v
+                if 'search_hit' in item:
+                    sh = item['search_hit'][0]
+                    proteins = sh.pop('proteins')
+                    prot_dict = {}
+                    for p in proteins:
+                        for k in p:
+                            prot_dict[k] = []
+                    for p in proteins:
+                        for k, v in prot_dict.items():
+                            v.append(p.get(k))
+                    if sep is None:
+                        info.update(prot_dict)
+                    else:
+                        for k, v in prot_dict.items():
+                            info[k] = sep.join(str(val) if val is not None else '' for val in v)
+                    info.update(sh.pop('search_score'))
+                    mods = sh.pop('modifications', [])
+                    formatted_mods = ['{0[mass]:.3f}@{0[position]}'.format(x) for x in mods]
+                    if sep is not None:
+                        info['modifications'] = sep.join(formatted_mods)
+                    else:
+                        info['modifications'] = formatted_mods
+                    for k, v in sh.items():
+                        if isinstance(v, (str, int, float)):
+                            info[k] = v
+                    if 'analysis_result' in sh:
+                        for ar in sh['analysis_result']:
+                            if ar['analysis'] == 'peptideprophet':
+                                try:
+                                    info.update(ar['peptideprophet_result']['parameter'])
+                                except KeyError:
+                                    pass
+                                info['peptideprophet_probability'] = ar['peptideprophet_result']['probability']
+                                info['peptideprophet_ntt_prob'] = ar['peptideprophet_result']['all_ntt_prob']
+                            elif ar['analysis'] == 'interprophet':
+                                info.update(ar['interprophet_result']['parameter'])
+                                info['interprophet_probability'] = ar['interprophet_result']['probability']
+                                info['interprophet_ntt_prob'] = ar['interprophet_result']['all_ntt_prob']
+                yield info
+    return pd.DataFrame(gen_items(), **pd_kwargs)
+
+
+def filter_df(*args, **kwargs):
+    """Read pepXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
+    Positional arguments can be pepXML files or DataFrames. Keyword parameter `fdr` is also required.
+    Other parameters are optional.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    positional args
+        pepXML file names, file objects, or DataFrames. Passed to :py:func:`DataFrame`.
+    fdr : float, keyword only, 0 <= fdr <= 1
+        Desired FDR level.
+    key : str / iterable / callable, keyword only, optional
+        PSM score. Default is 'expect'.
+    is_decoy : str / iterable / callable, keyword only, optional
+        Default is to check if all strings in the "protein" column start with `'DECOY_'`.
+    sep : str or None, keyword only, optional
+        Some values related to PSMs (such as protein information) are variable-length
+        lists. If `sep` is a :py:class:`str`, they will be packed into a single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+    reverse : bool, keyword only, optional
+        If :py:const:`True`, then PSMs are sorted in descending order,
+        i.e. the value of the key function is higher for better PSMs.
+        Default is :py:const:`False`.
+    decoy_prefix : str, optional
+        If the default `is_decoy` function works for you, this parameter specifies which
+        protein name prefix to use to detect decoy matches. If you provide your own
+        `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect.
+        Default is `"DECOY_"`.
+    decoy_suffix : str, optional
+        If the default `is_decoy` function works for you, this parameter specifies which
+        protein name suffix to use to detect decoy matches. If you provide your own
+        `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.
+    remove_decoy : bool, keyword only, optional
+        Defines whether decoy matches should be removed from the output.
+        Default is :py:const:`True`.
+
+        .. note:: If set to :py:const:`False`, then by default the decoy
+           PSMs will be taken into account when estimating FDR. Refer to the
+           documentation of :py:func:`fdr` for math; basically, if
+           `remove_decoy` is :py:const:`True`, then formula 1 is used
+           to control output FDR, otherwise it's formula 2. This can be
+           changed by overriding the `formula` argument.
+
+    formula : int, keyword only, optional
+        Can be either 1 or 2, defines which formula should be used for FDR
+        estimation. Default is 1 if `remove_decoy` is :py:const:`True`,
+        else 2 (see :py:func:`fdr` for definitions).
+    ratio : float, keyword only, optional
+        The size ratio between the decoy and target databases. Default is
+        1. In theory, the "size" of the database is the number of
+        theoretical peptides eligible for assignment to spectra that are
+        produced by *in silico* cleavage of that database.
+    correction : int or float, keyword only, optional
+        Possible values are 0, 1 and 2, or floating point numbers between 0 and 1.
+
+        0 (default): no correction;
+
+        1: enable "+1" correction. This accounts for the probability that a false
+        positive scores better than the first excluded decoy PSM;
+
+        2: this also corrects that probability for finite size of the sample,
+        so the correction will be slightly less than "+1".
+
+        If a floating point number
+        is given, then instead of the expectation value for the number of false PSMs,
+        the confidence value is used. The value of `correction` is then interpreted as
+        desired confidence level. E.g., if correction=0.95, then the calculated q-values
+        do not exceed the "real" q-values with 95% probability.
+
+        See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation.
+
+    pep : callable / array-like / iterable / str, keyword only, optional
+        If callable, a function used to determine the posterior error probability (PEP).
+        Should accept exactly one argument (PSM) and return a float.
+        If array-like, should contain float values for all given PSMs.
+        If string, it is used as a field name (PSMs must be in a record array
+        or a :py:class:`DataFrame`).
+
+        .. note:: If this parameter is given, then PEP values will be used to calculate
+           q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with:
+           `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`.
+           `key` can still be provided. Without `key`, PSMs will be sorted by PEP.
+
+    q_label : str, optional
+        Field name for q-value in the output. Default is ``'q'``.
+
+    score_label : str, optional
+        Field name for score in the output. Default is ``'score'``.
+
+    decoy_label : str, optional
+        Field name for the decoy flag in the output. Default is ``'is decoy'``.
+
+    pep_label : str, optional
+        Field name for PEP in the output. Default is ``'PEP'``.
+
+    Returns
+    -------
+    out : pandas.DataFrame
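+
+    Examples
+    --------
+    A minimal usage sketch (the file name is hypothetical):
+
+    >>> flt = filter_df('interact.pep.xml', fdr=0.01)  # doctest: +SKIP
+    >>> flt['q'].max() <= 0.01  # doctest: +SKIP
+    True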
+    """
+    import pandas as pd
+    sep = kwargs.get('sep')
+    kwargs.setdefault('key', 'expect')
+    if all(isinstance(arg, pd.DataFrame) for arg in args):
+        if len(args) > 1:
+            df = pd.concat(args)
+        else:
+            df = args[0]
+    else:
+        read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs}
+        df = DataFrame(*args, **read_kw)
+    if 'is_decoy' not in kwargs:
+        if sep is not None:
+            if 'decoy_suffix' in kwargs:
+                kwargs['is_decoy'] = df['protein'].str.split(sep).apply(
+                    lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s))
+            else:
+                kwargs['is_decoy'] = df['protein'].str.split(sep).apply(
+                    lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s))
+        else:
+            if 'decoy_suffix' in kwargs:
+                kwargs['is_decoy'] = df['protein'].apply(
+                    lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s))
+            else:
+                kwargs['is_decoy'] = df['protein'].apply(
+                    lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s))
+    return aux.filter(df, **kwargs)
diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py
new file mode 100644
index 0000000000000000000000000000000000000000..c24792baf8c2005fd618ff33ed0a79eba920591b
--- /dev/null
+++ b/pyteomics/proforma.py
@@ -0,0 +1,2372 @@
+'''
+proforma - Proteoform and Peptidoform Notation
+==============================================
+
+ProForma is a notation for defining modified amino acid sequences using
+a set of controlled vocabularies, as well as encoding uncertain or partial
+information about localization. See the `ProForma specification <https://www.psidev.info/proforma>`_
+for up-to-date information.
+
+For more details, see the :mod:`pyteomics.proforma` documentation online.
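+
+A minimal usage sketch (assuming the :py:func:`parse` function defined later
+in this module; resolving controlled-vocabulary terms may require network
+access)::
+
+    >>> from pyteomics import proforma
+    >>> sequence, properties = proforma.parse('EM[U:Oxidation]EVEES[U:21]PEK')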
+'''
+
+import re
+import warnings
+from collections import deque, namedtuple
+from functools import partial
+from array import array as _array
+
+try:
+    from enum import Enum
+except ImportError:
+    # Python 2 doesn't have a builtin Enum type
+    Enum = object
+
+from .mass import Composition, std_aa_mass, Unimod, nist_mass, calculate_mass, std_ion_comp, mass_charge_ratio
+from .auxiliary import PyteomicsError, BasicComposition
+from .auxiliary.utils import add_metaclass
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+try:
+    from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache, load_unimod)
+    _has_psims = True
+except ImportError:
+    def _needs_psims(name):
+        raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims`" % name)
+
+    load_psimod = partial(_needs_psims, 'PSIMOD')
+    load_xlmod = partial(_needs_psims, 'XLMOD')
+    load_gno = partial(_needs_psims, 'GNO')
+    load_unimod = partial(_needs_psims, 'UNIMOD')
+    obo_cache = None
+    _has_psims = False
+
+_WATER_MASS = calculate_mass(formula="H2O")
+
+std_aa_mass = std_aa_mass.copy()
+std_aa_mass['X'] = 0
+
+element_symbols = set(nist_mass)
+element_symbols.remove("e*")
+element_symbols.add('e')
+
+
+class ProFormaError(PyteomicsError):
+    def __init__(self, message, index=None, parser_state=None, **kwargs):
+        super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state)
+        self.message = message
+        self.index = index
+        self.parser_state = parser_state
+
+
+class PrefixSavingMeta(type):
+    '''A subclass-registering-metaclass that provides easy
+    lookup of subclasses by prefix attributes.
+    '''
+
+    def __new__(mcs, name, parents, attrs):
+        new_type = type.__new__(mcs, name, parents, attrs)
+        prefix = attrs.get("prefix_name")
+        if prefix:
+            new_type.prefix_map[prefix.lower()] = new_type
+        short = attrs.get("short_prefix")
+        if short:
+            new_type.prefix_map[short.lower()] = new_type
+        return new_type
+
+    def find_by_tag(self, tag_name):
+        if tag_name is None:
+            raise ValueError("tag_name cannot be None!")
+        tag_name = tag_name.lower()
+        return self.prefix_map[tag_name]
+
+
+class TagTypeEnum(Enum):
+    unimod = 0
+    psimod = 1
+    massmod = 2
+    generic = 3
+    info = 4
+    gnome = 5
+    xlmod = 6
+
+    formula = 7
+    glycan = 8
+
+    localization_marker = 9
+    position_label = 10
+    group_placeholder = 999
+
+
+class ModificationTagStyle(Enum):
+    Unset = 0
+    ShortId = 1
+    LongId = 2
+    ShortName = 3
+    LongName = 4
+
+
+_sentinel = object()
+
+
+class ModificationMassNotFoundError(ProFormaError):
+    pass
+
+
+class UnknownMonosaccharideError(ProFormaError):
+    pass
+
+
+@add_metaclass(PrefixSavingMeta)
+class TagBase(object):
+    '''A base class for all tag types.
+
+    Attributes
+    ----------
+    type: Enum
+        An element of :class:`TagTypeEnum` saying what kind of tag this is.
+    value: object
+        The data stored in this tag, usually an externally controlled name.
+    extra: list
+        Any extra tags that were nested within this tag. Usually limited to INFO
+        tags but may be other synonymous controlled vocabulary terms.
+    group_id: str or None
+        A short label denoting which group, if any, this tag belongs to.
+    '''
+    __slots__ = ("type", "value", "extra", "group_id")
+
+    prefix_name = None
+    short_prefix = None
+    prefix_map = {}
+
+    def __init__(self, type, value, extra=None, group_id=None):
+        self.type = type
+        self.value = value
+        self.extra = extra
+        self.group_id = group_id
+
+    def __str__(self):
+        part = self._format_main()
+        had_marker = False
+        if self.extra:
+            rest = []
+            for e in self.extra:
+                rest.append(str(e))
+                had_marker |= isinstance(e, GroupLabelBase) and e.group_id == self.group_id
+            label = '|'.join([part] + rest)
+        else:
+            label = part
+        if self.group_id and not had_marker:
+            label = '%s%s' % (label, self.group_id)
+        return '%s' % label
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})"
+        return template.format(self=self)
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        if isinstance(other, str):
+            return str(self) == other
+        return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \
+            and (self.group_id == other.group_id)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def find_tag_type(self, tag_type):
+        '''Search this tag or tag collection for elements with a particular
+        tag type and return them.
+
+        Parameters
+        ----------
+        tag_type : TagTypeEnum
+            A label from :class:`TagTypeEnum`, or an equivalent type.
+
+        Returns
+        -------
+        matches : list
+            The list of all tags in this object which match the requested tag type.
+        '''
+        out = []
+        if self.type == tag_type:
+            out.append(self)
+        if not self.extra:
+            return out
+        for e in self.extra:
+            if e.type == tag_type:
+                out.append(e)
+        return out
+
+    @classmethod
+    def parse(cls, buffer):
+        return process_tag_tokens(buffer)
+
+
+class GroupLabelBase(TagBase):
+    __slots__ = ()
+
+    def __str__(self):
+        part = self._format_main()
+        if self.extra:
+            rest = [str(e) for e in self.extra]
+            label = '|'.join([part] + rest)
+        else:
+            label = part
+        return '%s' % label
+
+
+class PositionLabelTag(GroupLabelBase):
+    '''A tag to mark that a position is involved in a group in some way, but does
+    not imply any specific semantics.
+    '''
+    __slots__ = ()
+
+    def __init__(self, value=None, extra=None, group_id=None):
+        assert group_id is not None
+        value = group_id
+        super(PositionLabelTag, self).__init__(
+            TagTypeEnum.position_label, value, extra, group_id)
+
+    def _format_main(self):
+        return "{self.group_id}".format(self=self)
+
+
+class LocalizationMarker(GroupLabelBase):
+    '''A tag to mark a particular localization site
+    '''
+    __slots__ = ()
+
+    def __init__(self, value, extra=None, group_id=None):
+        assert group_id is not None
+        super(LocalizationMarker, self).__init__(
+            TagTypeEnum.localization_marker, float(value), extra, group_id)
+
+    def _format_main(self):
+        return "{self.group_id}({self.value:.4g})".format(self=self)
+
+
+class InformationTag(TagBase):
+    '''A tag carrying free text describing the location
+    '''
+    __slots__ = ()
+
+    prefix_name = "INFO"
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(InformationTag, self).__init__(
+            TagTypeEnum.info, str(value), extra, group_id)
+
+    def _format_main(self):
+        return str(self.value)
+
+
+class ModificationResolver(object):
+    def __init__(self, name, **kwargs):
+        self.name = name.lower()
+        self.symbol = self.name[0]
+        self._database = None
+
+    def load_database(self):
+        raise NotImplementedError()
+
+    @property
+    def database(self):
+        if not self._database:
+            self._database = self.load_database()
+        return self._database
+
+    @database.setter
+    def database(self, database):
+        self._database = database
+
+    def parse_identifier(self, identifier):
+        """Parse a string that is either a CV prefixed identifier or name.
+
+        Parameters
+        ----------
+        identifier : str
+            The identifier string to parse, removing CV prefix as needed.
+
+        Returns
+        -------
+        name : str, optional
+            A textual identifier embedded in the qualified identifier, if any, otherwise
+            :const:`None`.
+        id : int, optional
+            An integer ID embedded in the qualified identifier, if any, otherwise
+            :const:`None`.
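+
+        Examples
+        --------
+        A quick sketch of both outcomes:
+
+        >>> resolver = ModificationResolver('unimod')
+        >>> resolver.parse_identifier('UNIMOD:35')
+        (None, 35)
+        >>> resolver.parse_identifier('Oxidation')
+        ('Oxidation', None)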
+        """
+        tokens = identifier.split(":", 1)
+        if len(tokens) > 1:
+            prefix = tokens[0].lower()
+            if prefix == self.name or prefix == self.symbol:
+                identifier = tokens[1]
+
+        if identifier.isdigit():
+            id = int(identifier)
+            name = None
+        else:
+            name = identifier
+            id = None
+        return name, id
+
+    def resolve(self, name=None, id=None, **kwargs):
+        raise NotImplementedError()
+
+    def __call__(self, name=None, id=None, **kwargs):
+        return self.resolve(name, id, **kwargs)
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash(self.name)
+
+
+class UnimodResolver(ModificationResolver):
+    def __init__(self, **kwargs):
+        super(UnimodResolver, self).__init__("unimod", **kwargs)
+        self._database = kwargs.get("database")
+        self.strict = kwargs.get("strict", True)
+
+    def load_database(self):
+        if _has_psims:
+            return obo_cache.resolve("http://www.unimod.org/obo/unimod.obo")
+        return Unimod()
+
+    def resolve(self, name=None, id=None, **kwargs):
+        strict = kwargs.get("strict", self.strict)
+        exhaustive = kwargs.get("exhaustive", True)
+        if name is not None:
+            defn = self.database.by_title(name, strict=strict)
+            if not defn:
+                defn = self.database.by_name(name, strict=strict)
+            if not defn and exhaustive and strict:
+                defn = self.database.by_title(name, strict=False)
+                if not defn:
+                    defn = self.database.by_name(name, strict=False)
+            if defn and isinstance(defn, list):
+                warnings.warn(
+                    "Multiple matches found for {!r} in Unimod, taking the first, {}.".format(
+                        name, defn[0]['record_id']))
+                defn = defn[0]
+            if not defn:
+                raise KeyError(name)
+        elif id is not None:
+            defn = self.database[id]
+            if not defn:
+                raise KeyError(id)
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        if isinstance(defn, dict):
+            return {
+                'composition': defn['composition'],
+                'name': defn['title'],
+                'id': defn['record_id'],
+                'mass': defn['mono_mass'],
+                'provider': self.name,
+                "source": self
+            }
+        else:
+            name = defn.ex_code_name
+            if not name:
+                name = defn.code_name
+            return {
+                "composition": defn.composition,
+                "name": name,
+                "id": defn.id,
+                "mass": defn.monoisotopic_mass,
+                "provider": self.name,
+                "source": self
+            }
+
+
+class PSIModResolver(ModificationResolver):
+    def __init__(self, **kwargs):
+        super(PSIModResolver, self).__init__('psimod', **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_psimod()
+
+    def resolve(self, name=None, id=None, **kwargs):
+        if name is not None:
+            defn = self.database[name]
+        elif id is not None:
+            defn = self.database['MOD:{:05d}'.format(id)]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        try:
+            mass = float(defn.DiffMono)
+        except (KeyError, TypeError, ValueError):
+            raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn))
+        if defn.DiffFormula is not None:
+            composition = Composition()
+            diff_formula_tokens = defn.DiffFormula.strip().split(" ")
+            for i in range(0, len(diff_formula_tokens), 2):
+                element = diff_formula_tokens[i]
+                count = diff_formula_tokens[i + 1]
+                if count:
+                    count = int(count)
+                if element.startswith("("):
+                    j = element.index(")")
+                    isotope = element[1:j]
+                    element = "%s[%s]" % (element[j + 1:], isotope)
+                composition[element] += count
+        else:
+            composition = None
+            warnings.warn("No formula was found for %r in PSI-MOD, composition will be missing" % ((name, id), ))
+        return {
+            'mass': mass,
+            'composition': composition,
+            'name': defn.name,
+            'id': defn.id,
+            'provider': self.name,
+            "source": self
+        }
+
+
+class XLMODResolver(ModificationResolver):
+    def __init__(self, **kwargs):
+        super(XLMODResolver, self).__init__('xlmod', **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_xlmod()
+
+    def resolve(self, name=None, id=None, **kwargs):
+        if name is not None:
+            defn = self.database[name]
+        elif id is not None:
+            defn = self.database['XLMOD:{:05d}'.format(id)]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        try:
+            mass = float(defn['monoIsotopicMass'])
+        except (KeyError, TypeError, ValueError):
+            raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn))
+        if 'deadEndFormula' in defn:
+            composition = Composition(defn['deadEndFormula'].replace(" ", '').replace("D", "H[2]"))
+        elif 'bridgeFormula' in defn:
+            composition = Composition(
+                defn['bridgeFormula'].replace(" ", '').replace("D", "H[2]"))
+        else:
+            # Avoid a NameError when neither formula property is present.
+            composition = None
+        return {
+            'mass': mass,
+            'composition': composition,
+            'name': defn.name,
+            'id': defn.id,
+            'provider': self.name,
+            "source": self
+        }
+
+# TODO: Implement resolve walking up the graph to get the mass. Can't really
+# get any more information without glypy/glyspace interaction
+class GNOResolver(ModificationResolver):
+    mass_pattern = re.compile(r"(\d+(?:\.\d+)) Da")
+
+    def __init__(self, **kwargs):
+        super(GNOResolver, self).__init__('gnome', **kwargs)
+        self._database = kwargs.get("database")
+
+    def load_database(self):
+        return load_gno()
+
+    def get_mass_from_glycan_composition(self, term):
+        '''Parse the Byonic-style glycan composition from property GNO:00000202
+        to get the counts of each monosaccharide and use that to calculate mass.
+
+        The mass computed here is exact and dehydrated, distinct from the rounded-off
+        mass that :meth:`get_mass_from_term` will produce by walking up the CV term
+        hierarchy. However, not all glycan compositions are representable in GNO:00000202
+        format, so this may silently be absent or incomplete, hence the double-check in
+        :meth:`get_mass_from_term`.
+
+        Parameters
+        ----------
+        term : psims.controlled_vocabulary.Entity
+            The CV entity being parsed.
+
+        Returns
+        -------
+        mass : float or :const:`None`
+            If a glycan composition is found on the term, the computed
+            mass will be returned. Otherwise the :const:`None` is returned
+        '''
+        val = term.get('GNO:00000202')
+        monosaccharides = BasicComposition()
+        composition = Composition()
+        if val:
+            tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val)
+            mass = 0.0
+            for symbol, count in tokens:
+                count = int(count)
+                try:
+                    mono_mass, mono_comp, symbol = GlycanModification.valid_monosaccharides[symbol]
+                    mass += mono_mass * count
+                    composition += mono_comp * count
+                    monosaccharides[symbol] += count
+                except KeyError:
+                    continue
+            return mass, monosaccharides, composition
+        return None, None, None
+
+    def get_mass_from_term(self, term, raw_mass):
+        '''Walk up the term hierarchy and find the mass group
+        term near the root of the tree, and return the most accurate
+        mass available for the provided term.
+
+        The mass group term's mass is rounded to two decimal places, leading
+        to relatively large errors.
+
+        Parameters
+        ----------
+        term : psims.controlled_vocabulary.Entity
+            The CV entity being parsed.
+        raw_mass : float or :const:`None`
+            The exact mass computed from the glycan composition, if available,
+            used to sanity-check the rounded mass found in the term hierarchy.
+
+        Returns
+        -------
+        mass : float or :const:`None`
+            If a root node is found along the term's lineage, the computed
+            mass will be returned. Otherwise :const:`None` is returned.
+            The returned mass may be approximate.
+        '''
+        root_id = 'GNO:00000001'
+        parent = term.parent()
+        if isinstance(parent, list):
+            parent = parent[0]
+        while parent.id != root_id:
+            next_parent = parent.parent()
+            if isinstance(next_parent, list):
+                next_parent = next_parent[0]
+            if next_parent.id == root_id:
+                break
+            parent = next_parent
+        match = self.mass_pattern.search(parent.name)
+        if not match:
+            return None
+        # This will have a small mass error.
+        rough_mass = float(match.group(1)) - _WATER_MASS
+        if raw_mass is not None and abs(rough_mass - raw_mass) < 1:
+            return raw_mass
+        warnings.warn(
+            ("An accurate glycan composition could not be inferred from %s. "
+             "Only a rough approximation is available.") % (term, ))
+        return rough_mass
+
+    def resolve(self, name=None, id=None, **kwargs):
+        if name is not None:
+            term = self.database[name]
+        elif id is not None:
+            term = self.database[id]
+        else:
+            raise ValueError("Must provide one of `name` or `id`")
+        raw_mass, monosaccharides, composition = self.get_mass_from_glycan_composition(term)
+
+        rec = {
+            "name":term.name,
+            "id": term.id,
+            "provider": self.name,
+            "composition": composition,
+            "monosaccharides": monosaccharides,
+            "mass": self.get_mass_from_term(term, raw_mass),
+            "source": self
+        }
+        return rec
+
+
+class GenericResolver(ModificationResolver):
+
+    def __init__(self, resolvers, **kwargs):
+        super(GenericResolver, self).__init__('generic', **kwargs)
+        self.resolvers = list(resolvers)
+
+    def load_database(self):
+        return None
+
+    def parse_identifier(self, identifier):
+        """Parse a string that is either a CV prefixed identifier or name.
+
+        Does no parsing, as a :class:`GenericModification` is never qualified.
+
+        Parameters
+        ----------
+        identifier : str
+            The identifier string to parse, removing CV prefix as needed.
+
+        Returns
+        -------
+        name : str, optional
+            A textual identifier embedded in the qualified identifier, if any, otherwise
+            :const:`None`.
+        id : int, optional
+            An integer ID embedded in the qualified identifier, if any, otherwise
+            :const:`None`.
+        """
+        return identifier, None
+
+    def resolve(self, name=None, id=None, **kwargs):
+        defn = None
+        for resolver in self.resolvers:
+            try:
+                defn = resolver(name=name, id=id, **kwargs)
+                break
+            except KeyError:
+                continue
+            except ModificationMassNotFoundError:
+                warnings.warn("Could not resolve the mass for %r in %r" % ((name, id), resolver))
+                continue
+        if defn is None:
+            if name is None:
+                raise KeyError(id)
+            elif id is None:
+                raise KeyError(name)
+            else:
+                raise ValueError("Must provide one of `name` or `id`")
+        return defn
+
+
+class ModificationBase(TagBase):
+    '''A base class for all modification tags with marked prefixes.
+
+    While :class:`ModificationBase` is hashable, its equality testing
+    brings in additional tag-related information. For pure modification
+    identity comparison, use :attr:`key` to get a :class:`ModificationToken`
+    free of these concerns.
+    '''
+
+    _tag_type = None
+    __slots__ = ('_definition', 'style')
+
+    def __init__(self, value, extra=None, group_id=None, style=None):
+        if style is None:
+            style = ModificationTagStyle.Unset
+        super(ModificationBase, self).__init__(
+            self._tag_type, value, extra, group_id)
+        self._definition = None
+        self.style = style
+
+    def __eq__(self, other):
+        if isinstance(other, ModificationToken):
+            return other == self
+        return super(ModificationBase, self).__eq__(other)
+
+    def __hash__(self):
+        return hash((self.id, self.provider))
+
+    @property
+    def key(self):
+        '''Get a safe-to-hash-and-compare :class:`ModificationToken`
+        representing this modification without tag-like properties.
+
+        Returns
+        -------
+        ModificationToken
+        '''
+        return ModificationToken(self.value, self.id, self.provider, self.__class__)
+
+    @property
+    def definition(self):
+        '''A :class:`dict` of properties describing this modification, given
+        by the providing controlled vocabulary. This value is cached, and
+        should not be modified.
+
+        Returns
+        -------
+        dict
+        '''
+        if self._definition is None:
+            self._definition = self.resolve()
+        return self._definition
+
+    @property
+    def mass(self):
+        '''The monoisotopic mass shift this modification applies
+
+        Returns
+        -------
+        float
+        '''
+        return self.definition['mass']
+
+    @property
+    def composition(self):
+        '''The chemical composition shift this modification applies'''
+        return self.definition.get('composition')
+
+    @property
+    def id(self):
+        '''The unique identifier given to this modification by its provider
+
+        Returns
+        -------
+        str or int
+        '''
+        return self.definition.get('id')
+
+    @property
+    def name(self):
+        '''The primary name of this modification from its provider.
+
+        Returns
+        -------
+        str
+        '''
+        return self.definition.get('name')
+
+    @property
+    def provider(self):
+        '''The name of the controlled vocabulary that provided this
+        modification.
+
+        Returns
+        -------
+        str
+        '''
+        return self.definition.get('provider')
+
+    def _populate_from_definition(self, definition):
+        self._definition = definition
+
+    def _format_main(self):
+        if self.style == ModificationTagStyle.Unset or self.style is None:
+            return "{self.prefix_name}:{self.value}".format(self=self)
+        elif self.style == ModificationTagStyle.LongId:
+            return "{self.prefix_name}:{self.id}".format(self=self)
+        elif self.style == ModificationTagStyle.ShortId:
+            return "{self.short_prefix}:{self.id}".format(self=self)
+        elif self.style == ModificationTagStyle.LongName:
+            return "{self.prefix_name}:{self.name}".format(self=self)
+        elif self.style == ModificationTagStyle.ShortName:
+            return "{self.short_prefix}:{self.name}".format(self=self)
+        else:
+            warnings.warn("Unknown formatting style {!r}".format(self.style))
+            return "{self.prefix_name}:{self.value}".format(self=self)
+
+    def resolve(self):
+        '''Find the term and return its properties
+        '''
+        keys = self.resolver.parse_identifier(self.value)
+        return self.resolver(*keys)
+
+
+class MassModification(TagBase):
+    '''A modification defined purely by a signed mass shift in Daltons.
+
+    The value of a :class:`MassModification` is always a :class:`float`
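+
+    A quick formatting sketch:
+
+    >>> str(MassModification('+15.995'))
+    '+15.995'
+    >>> MassModification('+15.995').mass
+    15.995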
+    '''
+    __slots__ = ('_significant_figures', )
+
+    prefix_name = "Obs"
+
+    def __init__(self, value, extra=None, group_id=None):
+        if isinstance(value, str):
+            sigfigs = len(value.split('.')[-1].rstrip('0'))
+        else:
+            sigfigs = 4
+        self._significant_figures = sigfigs
+        super(MassModification, self).__init__(
+            TagTypeEnum.massmod, float(value), extra, group_id)
+
+    def _format_main(self):
+        if self.value >= 0:
+            return ('+{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.')
+        else:
+            return ('{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.')
+
+    @property
+    def provider(self):
+        return None
+
+    @property
+    def id(self):
+        return self._format_main()
+
+    @property
+    def key(self):
+        '''Get a safe-to-hash-and-compare :class:`ModificationToken`
+        representing this modification without tag-like properties.
+
+        Returns
+        -------
+        ModificationToken
+        '''
+        return ModificationToken(self.value, self.id, self.provider, self.__class__)
+
+    @property
+    def mass(self):
+        return self.value
+
+    def __eq__(self, other):
+        if isinstance(other, ModificationToken):
+            return other == self
+        return super(MassModification, self).__eq__(other)
+
+    def __hash__(self):
+        return hash((self.id, self.provider))
+
+
+class FormulaModification(ModificationBase):
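+    '''A modification defined by a chemical formula, written in ProForma
+    notation. Isotopes are rewritten to Pyteomics notation before the
+    formula is parsed.
+
+    A quick local-resolution sketch (no CV lookup involved; the formula
+    here is arbitrary):
+
+    >>> round(FormulaModification('C2H2O').mass, 3)
+    42.011
+    '''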
+    prefix_name = "Formula"
+
+    isotope_pattern = re.compile(r'\[(?P<isotope>\d+)(?P<element>[A-Z][a-z]*)(?P<quantity>[\-+]?\d+)\]')
+    _tag_type = TagTypeEnum.formula
+
+    def _normalize_isotope_notation(self, match):
+        '''Rewrite ProForma isotope notation to Pyteomics-compatible
+        isotope notation.
+
+        Parameters
+        ----------
+        match : Match
+            The matched isotope notation string parsed by the regular expression.
+
+        Returns
+        -------
+        reformatted : str
+            The re-written isotope notation.
+        '''
+        parts = match.groupdict()
+        return "{element}[{isotope}]{quantity}".format(**parts)
+
+    def resolve(self):
+        normalized = self.value.replace(' ', '')
+        # If there is a [ character in the formula, we know there are isotopes which
+        # need to be normalized.
+        if '[' in normalized:
+            normalized = self.isotope_pattern.sub(self._normalize_isotope_notation, normalized)
+        composition = Composition(formula=normalized)
+        return {
+            "mass": composition.mass(),
+            "composition": composition,
+            "name": self.value
+        }
+
+
+monosaccharide_description = namedtuple('monosaccharide_description', ('mass', 'composition', "symbol"))
+
+
+class GlycanModification(ModificationBase):
+    prefix_name = "Glycan"
+
+    _tag_type = TagTypeEnum.glycan
+
+    valid_monosaccharides = {
+        "Hex": monosaccharide_description(162.0528, Composition("C6H10O5"), 'Hex'),
+        "HexNAc": monosaccharide_description(203.0793, Composition("C8H13N1O5"), 'HexNAc'),
+        "HexS": monosaccharide_description(242.009, Composition("C6H10O8S1"), 'HexS'),
+        "HexP": monosaccharide_description(242.0191, Composition("C6H11O8P1"), 'HexP'),
+        "HexNAcS": monosaccharide_description(283.0361, Composition("C8H13N1O8S1"), 'HexNAcS'),
+        "dHex": monosaccharide_description(146.0579, Composition("C6H10O4"), 'dHex'),
+        "NeuAc": monosaccharide_description(291.0954, Composition("C11H17N1O8"), 'NeuAc'),
+        "NeuGc": monosaccharide_description(307.0903, Composition("C11H17N1O9"), 'NeuGc'),
+        "Pen": monosaccharide_description(132.0422, Composition("C5H8O4"), 'Pen'),
+        "Fuc": monosaccharide_description(146.0579, Composition("C6H10O4"), 'Fuc')
+    }
+
+    valid_monosaccharides['Neu5Ac'] = valid_monosaccharides['NeuAc']
+    valid_monosaccharides['Neu5Gc'] = valid_monosaccharides['NeuGc']
+    valid_monosaccharides['Pent'] = valid_monosaccharides['Pen']
+    valid_monosaccharides['d-Hex'] = valid_monosaccharides['dHex']
+
+    monomer_tokenizer = re.compile(
+        r"|".join(sorted(valid_monosaccharides.keys(), key=len, reverse=True)))
+    tokenizer = re.compile(r"(%s|[A-Za-z]+)\s*(\d*)\s*" % monomer_tokenizer.pattern)
+
+    @property
+    def monosaccharides(self):
+        return self.definition.get('monosaccharides')
+
+    def resolve(self):
+        composite = BasicComposition()
+        for tok, cnt in self.tokenizer.findall(self.value):
+            if cnt:
+                cnt = int(cnt)
+            else:
+                cnt = 1
+            if tok not in self.valid_monosaccharides:
+                parts = self.monomer_tokenizer.findall(tok)
+                t = 0
+                for p in parts:
+                    if p not in self.valid_monosaccharides:
+                        break
+                    t += len(p)
+                if t != len(tok):
+                    raise ValueError("{tok!r} is not a valid monosaccharide name".format(tok=tok))
+                else:
+                    for p in parts[:-1]:
+                        sym = self.valid_monosaccharides[p].symbol
+                        composite[sym] += 1
+                    sym = self.valid_monosaccharides[parts[-1]].symbol
+                    composite[sym] += cnt
+            else:
+                sym = self.valid_monosaccharides[tok].symbol
+                composite[sym] += cnt
+        mass = 0
+        chemcomp = Composition()
+        for key, cnt in composite.items():
+            try:
+                m, c, sym = self.valid_monosaccharides[key]
+            except KeyError:
+                raise UnknownMonosaccharideError(key)
+            mass += m * cnt
+            chemcomp += c * cnt
+        return {
+            "mass": mass,
+            "composition": chemcomp,
+            "name": self.value,
+            "monosaccharides": composite
+        }
+
+
+class UnimodModification(ModificationBase):
+    __slots__ = ()
+
+    resolver = UnimodResolver()
+
+    prefix_name = "UNIMOD"
+    short_prefix = "U"
+    _tag_type = TagTypeEnum.unimod
+
+
+class PSIModModification(ModificationBase):
+    __slots__ = ()
+
+    resolver = PSIModResolver()
+
+    prefix_name = "MOD"
+    short_prefix = 'M'
+    _tag_type = TagTypeEnum.psimod
+
+
+class GNOmeModification(ModificationBase):
+    __slots__ = ()
+
+    resolver = GNOResolver()
+
+    prefix_name = "GNO"
+    short_prefix = 'G'
+    _tag_type = TagTypeEnum.gnome
+
+    @property
+    def monosaccharides(self):
+        return self.definition.get('monosaccharides')
+
+
+class XLMODModification(ModificationBase):
+    __slots__ = ()
+
+    resolver = XLMODResolver()
+
+    prefix_name = "XLMOD"
+    # short_prefix = 'XL'
+    _tag_type = TagTypeEnum.xlmod
+
+
+class GenericModification(ModificationBase):
+    __slots__ = ()
+    _tag_type = TagTypeEnum.generic
+    resolver = GenericResolver([
+        # Do exact matching here first. Then default to non-strict matching as a final
+        # correction effort.
+        partial(UnimodModification.resolver, exhaustive=False),
+        PSIModModification.resolver,
+        XLMODModification.resolver,
+        GNOmeModification.resolver,
+        # Some really common names aren't actually found in the XML exactly, so default
+        # to non-strict matching now to avoid masking other sources here.
+        partial(UnimodModification.resolver, strict=False)
+    ])
+
+    def __init__(self, value, extra=None, group_id=None):
+        super(GenericModification, self).__init__(
+            value, extra, group_id)
+
+    def _format_main(self):
+        return self.value
+
+    def resolve(self):
+        '''Find the term, searching through all available vocabularies and
+        return the first match's properties
+        '''
+        keys = self.resolver.parse_identifier(self.value)
+        defn = self.resolver(*keys)
+        if defn is not None:
+            return defn
+        raise KeyError(keys)
+
+
+def set_unimod_path(path):
+    '''Set the path to load the Unimod database from for resolving
+    ProForma Unimod modifications.
+
+    .. note::
+
+        This method ensures that the Unimod modification database loads
+        quickly from a local database file instead of downloading a new
+        copy from the internet.
+
+    Parameters
+    ----------
+    path : str or file-like object
+        A path to or file-like object for the "unimod.xml" file.
+
+    Returns
+    -------
+    :class:`~pyteomics.mass.mass.Unimod`
+    '''
+    db = Unimod(path)
+    UnimodModification.resolver.database = db
+    return db
+
+
+class ModificationToken(object):
+    '''Describes a particular modification from a particular provider, independent
+    of a :class:`TagBase`'s state.
+
+    This class is meant to be used in place of a :class:`ModificationBase` object
+    when equality testing and hashing are desired without involving extra
+    tag-related properties.
+
+    :class:`ModificationToken` is comparable and hashable, and can be compared with
+    :class:`ModificationBase` subclass instances safely. It can be called to create
+    a new instance of the :class:`ModificationBase` it is equal to.
+
+    Attributes
+    ----------
+    name : str
+        The name of the modification being represented, as the user specified it.
+    id : int or str
+        Whatever unique identifier the providing controlled vocabulary gave to this
+        modification
+    provider : str
+        The name of the providing controlled vocabulary.
+    source_cls : type
+        A sub-class of :class:`ModificationBase` that will be used to fulfill this
+        token if requested, providing it a resolver.
+    '''
+    __slots__ = ('name', 'id', 'provider', 'source_cls')
+
+    def __init__(self, name, id, provider, source_cls):
+        self.name = name
+        self.id = id
+        self.provider = provider
+        self.source_cls = source_cls
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        if isinstance(other, (ModificationToken, ModificationBase, MassModification)):
+            return self.id == other.id and self.provider == other.provider
+        return False
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash((self.id, self.provider))
+
+    def __call__(self):
+        '''Create a new :class:`ModificationBase`
+        instance from the provided :attr:`name`
+        against :attr:`source_cls`'s resolver.
+
+        Returns
+        -------
+        ModificationBase
+        '''
+        return self.source_cls(self.name)
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.name!r}, {self.id!r}, {self.provider!r}, {self.source_cls})"
+        return template.format(self=self)
+
+
+def split_tags(tokens):
+    '''Split a token array into discrete sets of tag
+    tokens.
+
+    Parameters
+    ----------
+    tokens: list
+        The characters of the tag token buffer
+
+    Returns
+    -------
+    list of list:
+        The tokens for each contained tag
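+
+    Examples
+    --------
+    A quick sketch, joining the token lists back into strings for readability:
+
+    >>> [''.join(t) for t in split_tags(list('Oxidation|INFO:rare'))]
+    ['Oxidation', 'INFO:rare']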
+    '''
+    starts = [0]
+    ends = []
+    for i, c in enumerate(tokens):
+        if c == '|':
+            ends.append(i)
+            starts.append(i + 1)
+        elif (i != 0 and c == '#'):
+            ends.append(i)
+            starts.append(i)
+    ends.append(len(tokens))
+    out = []
+    for i, start in enumerate(starts):
+        end = ends[i]
+        tag = tokens[start:end]
+        if len(tag) == 0:
+            continue
+        # Short circuit on INFO tags which can't be broken
+        # if (tag[0] == 'i' and tag[:5] == ['i', 'n', 'f', 'o', ':']) or (tag[0] == 'I' and tag[:5] == ['I', 'N', 'F', 'O', ':']):
+        #     tag = tokens[start:]
+        #     out.append(tag)
+        #     break
+        out.append(tag)
+    return out
+
+
+def find_prefix(tokens):
+    '''Find the prefix, if any, of the tag defined by `tokens`,
+    delimited by ":".
+
+    Parameters
+    ----------
+    tokens: list
+        The tag tokens to search
+
+    Returns
+    -------
+    prefix: str or None
+        The prefix string, if found
+    rest: str
+        The rest of the tokens, merged as a string
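+
+    Examples
+    --------
+    A quick sketch of both outcomes:
+
+    >>> find_prefix(list('U:Oxidation'))
+    ('U', 'Oxidation')
+    >>> find_prefix(list('Oxidation'))
+    (None, 'Oxidation')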
+    '''
+    for i, c in enumerate(tokens):
+        if c == ':':
+            return ''.join(tokens[:i]), ''.join(tokens[i + 1:])
+    return None, ''.join(tokens)
+
+
+def process_marker(tokens):
+    '''Process a marker, which is a tag whose value starts with #.
+
+    Parameters
+    ----------
+    tokens: list
+        The tag tokens to parse
+
+    Returns
+    -------
+    PositionLabelTag or LocalizationMarker
+    '''
+    if ''.join(tokens[1:3]) == 'XL':  # works for both str and list input
+        return PositionLabelTag(None, group_id=''.join(tokens))
+    else:
+        group_id = None
+        value = None
+        for i, c in enumerate(tokens):
+            if c == '(':
+                group_id = ''.join(tokens[:i])
+                if tokens[-1] != ')':
+                    raise Exception(
+                        "Localization marker with score missing closing parenthesis")
+                value = float(''.join(tokens[i + 1:-1]))
+                return LocalizationMarker(value, group_id=group_id)
+        else:
+            group_id = ''.join(tokens)
+            return PositionLabelTag(group_id=group_id)
+
+
+def process_tag_tokens(tokens):
+    '''Convert a tag token buffer into a parsed :class:`TagBase` instance
+    of the appropriate sub-type with zero or more sub-tags.
+
+    Parameters
+    ----------
+    tokens: list
+        The tokens to parse
+
+    Returns
+    -------
+    TagBase:
+        The parsed tag
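+
+    Examples
+    --------
+    Parsing is purely local; resolving a tag's mass may later trigger a CV
+    lookup. A quick sketch:
+
+    >>> tag = process_tag_tokens(list('U:Oxidation#g1'))
+    >>> tag.value, tag.group_id
+    ('Oxidation', '#g1')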
+    '''
+    parts = split_tags(tokens)
+    main_tag = parts[0]
+    if main_tag[0] in ('+', '-'):
+        main_tag = ''.join(main_tag)
+        main_tag = MassModification(main_tag)
+    elif main_tag[0] == '#':
+        main_tag = process_marker(main_tag)
+    else:
+        prefix, value = find_prefix(main_tag)
+        if prefix is None:
+            main_tag = GenericModification(''.join(value))
+        else:
+            try:
+                tag_type = TagBase.find_by_tag(prefix)
+                main_tag = tag_type(value)
+            except KeyError:
+                main_tag_str = ''.join(main_tag)
+                main_tag = GenericModification(main_tag_str)
+
+    if len(parts) > 1:
+        extras = []
+        for part in parts[1:]:
+            prefix, value = find_prefix(part)
+            if prefix is None:
+                if value[0] == "#":
+                    marker = process_marker(value)
+                    if isinstance(marker, PositionLabelTag):
+                        main_tag.group_id = ''.join(value)
+                    else:
+                        main_tag.group_id = marker.group_id
+                        extras.append(marker)
+                else:
+                    extras.append(GenericModification(''.join(value)))
+            else:
+                try:
+                    tag_type = TagBase.find_by_tag(prefix)
+                    extra_tag = tag_type(value)
+                except KeyError:
+                    part_str = ''.join(part)
+                    extra_tag = GenericModification(part_str)
+                extras.append(extra_tag)
+        main_tag.extra = extras
+    return main_tag
+
+
+class ModificationRule(object):
+    '''Define a fixed modification rule which dictates a modification tag is
+    always applied at one or more amino acid residues.
+
+    Attributes
+    ----------
+    modification_tag: TagBase
+        The modification to apply
+    targets: list
+        The list of amino acids this applies to
+    '''
+    __slots__ = ('modification_tag', 'targets')
+
+    def __init__(self, modification_tag, targets=None):
+        self.modification_tag = modification_tag
+        self.targets = targets
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return self.modification_tag == other.modification_tag and self.targets == other.targets
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        targets = ','.join(self.targets)
+        return "<[{self.modification_tag}]@{targets}>".format(self=self, targets=targets)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self)
+
+
+class StableIsotope(object):
+    '''Define a fixed isotope that is applied globally to all amino acids.
+
+    Attributes
+    ----------
+    isotope: str
+        The stable isotope string, of the form [<isotope-number>]<element> or a special
+        isotopoform's name.
+    '''
+    __slots__ = ('isotope', )
+
+    def __init__(self, isotope):
+        self.isotope = isotope
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return self.isotope == other.isotope
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        return "<{self.isotope}>".format(self=self)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.isotope})".format(self=self)
+
+
+class IntersectionEnum(Enum):
+    no_overlap = 0
+    full_contains_interval = 1
+    full_contained_in_interval = 2
+    start_overlap = 3
+    end_overlap = 4
+
+
+class TaggedInterval(object):
+    '''Define a fixed interval over the associated sequence which contains the localization
+    of the associated tag or denotes a region of general sequence order ambiguity.
+
+    Attributes
+    ----------
+    start: int
+        The starting position (inclusive) of the interval along the primary sequence
+    end: int
+        The ending position (exclusive) of the interval along the primary sequence
+    tags: list[TagBase]
+        The tags being localized
+    ambiguous : bool
+        Whether the interval is ambiguous or not
+    '''
+    __slots__ = ('start', 'end', 'tags', 'ambiguous')
+
+    def __init__(self, start, end=None, tags=None, ambiguous=False):
+        self.start = start
+        self.end = end
+        self.tags = tags
+        self.ambiguous = ambiguous
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return self.start == other.start and self.end == other.end and self.tags == other.tags
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __str__(self):
+        return "({self.start}-{self.end}){self.tags!r}".format(self=self)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.start}, {self.end}, {self.tags})".format(self=self)
+
+    def as_slice(self):
+        return slice(self.start, self.end)
+
+    def contains(self, i):
+        return self.start <= i < self.end
+
+    def __contains__(self, i):
+        return self.contains(i)
+
+    def copy(self):
+        # Preserve the ambiguity flag along with the coordinates and tags.
+        return self.__class__(self.start, self.end, self.tags, self.ambiguous)
+
+    def _check_slice(self, qstart, qend, warn_ambiguous):
+        # Fully contained interval
+        valid = qstart <= self.start and qend >= self.end
+        case = IntersectionEnum.full_contained_in_interval if valid else IntersectionEnum.no_overlap
+        if not valid:
+            # Spans the beginning but not the end
+            valid = qstart <= self.start and qend > self.start
+            if valid:
+                case = IntersectionEnum.start_overlap
+                if warn_ambiguous:
+                    warnings.warn("Slice bisecting interval %s" % (self, ))
+
+        if not valid:
+            # Spans the end but not the beginning
+            valid = qstart < self.end and qend > self.end
+            if valid:
+                case = IntersectionEnum.end_overlap
+                if warn_ambiguous:
+                    warnings.warn("Slice bisecting interval %s" % (self, ))
+
+        if not valid:
+            # Contained interval
+            valid = qstart >= self.start and qend < self.end
+            if valid:
+                case = IntersectionEnum.full_contains_interval
+                if warn_ambiguous:
+                    warnings.warn("Slice bisecting interval %s" % (self, ))
+        return valid, case
+
+    def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True):
+        if end is None:
+            qend = self.end + 1
+        else:
+            qend = end
+        if start is None:
+            qstart = self.start - 1
+        else:
+            qstart = start
+
+        valid, intersection_type = self._check_slice(qstart, qend, warn_ambiguous)
+        if self.ambiguous and intersection_type not in (IntersectionEnum.full_contained_in_interval, IntersectionEnum.no_overlap):
+            raise ValueError("Cannot bisect an ambiguous interval")
+        if not valid:
+            return None
+        new = self.copy()
+        if start is not None:
+            diff = self.start - start
+            if diff < 0:
+                diff = 0
+            new.start = diff
+        if end is not None:
+            width = min(new.end, end) - self.start
+        else:
+            width = self.end - max(start, self.start)
+        new.end = new.start + width
+        return new
+
+
+class ChargeState(object):
+    '''Describes the charge and adduct types of the structure.
+
+    Attributes
+    ----------
+    charge : int
+        The total charge state as a signed number.
+    adducts : list[str]
+        Each charge carrier associated with the molecule.
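+
+    Examples
+    --------
+    A small sketch of the textual form (the adduct strings are illustrative):
+
+    >>> str(ChargeState(2, ['+Na+', '+H+']))
+    '2[+Na+,+H+]'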
+    '''
+    __slots__ = ("charge", "adducts")
+
+    def __init__(self, charge, adducts=None):
+        if adducts is None:
+            adducts = []
+        self.charge = charge
+        self.adducts = adducts
+
+    def __str__(self):
+        tokens = [str(self.charge)]
+        if self.adducts:
+            tokens.append("[")
+            tokens.append(','.join(str(adduct) for adduct in self.adducts))
+            tokens.append("]")
+        return ''.join(tokens)
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.charge}, {self.adducts})"
+        return template.format(self=self)
+
+
+class TokenBuffer(object):
+    '''A token buffer that wraps the accumulation and reset logic
+    of a list of :class:`str` objects.
+
+    Implements a subset of the Sequence protocol.
+
+    Attributes
+    ----------
+    buffer: list
+        The list of tokens accumulated since the last parsing.
+    '''
+    def __init__(self, initial=None):
+        self.buffer = list(initial or [])
+        self.boundaries = []
+
+    def append(self, c):
+        '''Append a new character to the buffer.
+
+        Parameters
+        ----------
+        c: str
+            The character appended
+        '''
+        self.buffer.append(c)
+
+    def reset(self):
+        '''Discard the content of the current buffer.
+        '''
+        if self.buffer:
+            self.buffer = []
+        if self.boundaries:
+            self.boundaries = []
+
+    def __bool__(self):
+        return bool(self.buffer)
+
+    def __iter__(self):
+        return iter(self.buffer)
+
+    def __getitem__(self, i):
+        return self.buffer[i]
+
+    def __len__(self):
+        return len(self.buffer)
+
+    def tokenize(self):
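+        '''Split the buffer at the recorded boundaries and return
+        the resulting list of token lists.'''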
+        i = 0
+        pieces = []
+        for k in self.boundaries + [len(self)]:
+            piece = self.buffer[i:k]
+            i = k
+            pieces.append(piece)
+        return pieces
+
+    def _transform(self, value):
+        return value
+
+    def process(self):
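+        '''Parse the accumulated tokens, splitting at recorded boundaries
+        when present, and reset the buffer.'''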
+        if self.boundaries:
+            value = [self._transform(v) for v in self.tokenize()]
+        else:
+            value = self._transform(self.buffer)
+        self.reset()
+        return value
+
+    def bound(self):
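+        '''Record a token boundary at the current buffer position and
+        return that position.'''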
+        k = len(self)
+        self.boundaries.append(k)
+        return k
+
+    def __call__(self):
+        return self.process()
+
+
+class NumberParser(TokenBuffer):
+    '''A buffer which accumulates tokens until it is asked to parse them into
+    :class:`int` instances.
+    '''
+
+    def _transform(self, value):
+        return int(''.join(value))
+
+
+class StringParser(TokenBuffer):
+    '''A buffer which accumulates tokens until it is asked to parse them into
+    :class:`str` instances.
+    '''
+
+    def _transform(self, value):
+        return ''.join(value)
+
+
+class TagParser(TokenBuffer):
+    '''A buffer which accumulates tokens until it is asked to parse them into
+    :class:`TagBase` instances.
+
+    Implements a subset of the Sequence protocol.
+
+    Attributes
+    ----------
+    buffer: list
+        The list of tokens accumulated since the last parsing.
+    group_ids: set
+        The set of all group IDs that have been produced so far.
+    '''
+
+    def __init__(self, initial=None, group_ids=None):
+        super(TagParser, self).__init__(initial)
+        if group_ids:
+            self.group_ids = set(group_ids)
+        else:
+            self.group_ids = set()
+
+    def _transform(self, value):
+        tag = process_tag_tokens(value)
+        if tag.group_id:
+            self.group_ids.add(tag.group_id)
+        return tag
+
+    def process(self):
+        value = super(TagParser, self).process()
+        if not isinstance(value, list):
+            value = [value]
+        return value
+
+
+class ParserStateEnum(Enum):
+    before_sequence = 0
+    tag_before_sequence = 1
+    global_tag = 2
+    fixed_spec = 3
+    labile_tag = 4
+    sequence = 5
+    tag_in_sequence = 6
+    interval_tag = 7
+    tag_after_sequence = 8
+    stable_isotope = 9
+    post_tag_before = 10
+    unlocalized_count = 11
+    post_global = 12
+    post_global_aa = 13
+    post_interval_tag = 14
+    post_tag_after = 15
+    charge_state_start = 16
+    charge_state_number = 17
+    charge_state_adduct_start = 18
+    charge_state_adduct_end = 19
+    inter_chain_cross_link_start = 20
+    chimeric_start = 21
+    interval_initial = 22
+    done = 999
+
+
+BEFORE = ParserStateEnum.before_sequence
+TAG_BEFORE = ParserStateEnum.tag_before_sequence
+FIXED = ParserStateEnum.fixed_spec
+GLOBAL = ParserStateEnum.global_tag
+ISOTOPE = ParserStateEnum.stable_isotope
+LABILE = ParserStateEnum.labile_tag
+SEQ = ParserStateEnum.sequence
+TAG = ParserStateEnum.tag_in_sequence
+INTERVAL_TAG = ParserStateEnum.interval_tag
+INTERVAL_INIT = ParserStateEnum.interval_initial
+TAG_AFTER = ParserStateEnum.tag_after_sequence
+POST_TAG_BEFORE = ParserStateEnum.post_tag_before
+POST_TAG_AFTER = ParserStateEnum.post_tag_after
+UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count
+POST_GLOBAL = ParserStateEnum.post_global
+POST_GLOBAL_AA = ParserStateEnum.post_global_aa
+POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag
+CHARGE_START = ParserStateEnum.charge_state_start
+CHARGE_NUMBER = ParserStateEnum.charge_state_number
+ADDUCT_START = ParserStateEnum.charge_state_adduct_start
+ADDUCT_END = ParserStateEnum.charge_state_adduct_end
+DONE = ParserStateEnum.done
+
+VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB")
+
+def parse(sequence):
+    '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a
+    mapping of sequence-spanning modifiers.
+
+    .. note::
+        This is a state machine parser, but with certain sub-state paths
+        unrolled to avoid an explosion of formal intermediary states.
+
+    Parameters
+    ----------
+    sequence: str
+        The sequence to parse
+
+    Returns
+    -------
+    parsed_sequence: list[tuple[str, list[TagBase]]]
+        The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence
+    modifiers: dict
+        A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized
+        modifications, tagged intervals, and group IDs
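+
+    Examples
+    --------
+    A minimal sketch of the output shape, using a tag-free sequence with a
+    charge state:
+
+    >>> positions, modifiers = parse('PEPTIDE/2')
+    >>> len(positions), modifiers['charge_state'].charge
+    (7, 2)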
+    '''
+    labile_modifications = []
+    fixed_modifications = []
+    unlocalized_modifications = []
+    intervals = []
+    isotopes = []
+
+    n_term = None
+    c_term = None
+
+    i = 0
+    n = len(sequence)
+
+    positions = []
+    state = BEFORE
+    depth = 0
+
+    current_aa = None
+    current_tag = TagParser()
+    current_interval = None
+    current_unlocalized_count = NumberParser()
+    current_aa_targets = TokenBuffer()
+
+    charge_buffer = None
+    adduct_buffer = None
+
+    # A mostly context free finite state machine unrolled
+    # by hand.
+    while i < n:
+        c = sequence[i]
+        i += 1
+        # Initial state prior to sequence content
+        if state == BEFORE:
+            if c == '[':
+                state = TAG_BEFORE
+                depth = 1
+            elif c == '{':
+                state = LABILE
+                depth = 1
+            elif c == '<':
+                state = FIXED
+            elif c in VALID_AA:
+                current_aa = c
+                state = SEQ
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # The body of the amino acid sequence.
+        elif state == SEQ or state == INTERVAL_INIT:
+            if state == INTERVAL_INIT:
+                state = SEQ
+                if c == '?':
+                    if current_interval is not None:
+                        current_interval.ambiguous = True
+                    continue
+            if c in VALID_AA:
+                if current_aa is not None:
+                    positions.append((current_aa, current_tag() if current_tag else None))
+                current_aa = c
+            elif c == '[':
+                state = TAG
+                if current_tag:
+                    current_tag.bound()
+                depth = 1
+            elif c == '(':
+                if current_interval is not None:
+                    raise ProFormaError(
+                        ("Error In State {state}, nested range found at index {i}. "
+                         "Nested ranges are not yet supported by ProForma.").format(
+                            **locals()), i, state)
+                current_interval = TaggedInterval(len(positions) + 1)
+                state = INTERVAL_INIT
+            elif c == ')':
+                positions.append(
+                    (current_aa, current_tag() if current_tag else None))
+                current_aa = None
+                if current_interval is None:
+                    raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+                else:
+                    current_interval.end = len(positions)
+                    if i < n and sequence[i] == '[':
+                        i += 1
+                        depth = 1
+                        state = INTERVAL_TAG
+                    else:
+                        intervals.append(current_interval)
+                        current_interval = None
+            elif c == '-':
+                if current_aa:
+                    positions.append((current_aa, current_tag() if current_tag else None))
+                    current_aa = None
+                state = TAG_AFTER
+                if i >= n or sequence[i] != '[':
+                    raise ProFormaError("Missing Closing Tag", i, state)
+                i += 1
+                depth = 1
+            elif c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+            else:
+                raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # Tag parsing states, which rely on `current_tag` to buffer tokens.
+        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL or state == INTERVAL_TAG:
+            if c == '[':
+                depth += 1
+                current_tag.append(c)
+            elif c == ']':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    if state == TAG:
+                        state = SEQ
+                    elif state == TAG_BEFORE:
+                        state = POST_TAG_BEFORE
+                    elif state == TAG_AFTER:
+                        c_term = current_tag()
+                        state = POST_TAG_AFTER
+                    elif state == GLOBAL:
+                        state = POST_GLOBAL
+                    elif state == INTERVAL_TAG:
+                        state = POST_INTERVAL_TAG
+                        depth = 0
+                else:
+                    current_tag.append(c)
+            else:
+                current_tag.append(c)
+        # Handle transition to fixed modifications or isotope labeling from opening signal.
+        elif state == FIXED:
+            if c == '[':
+                state = GLOBAL
+            else:
+                # Do validation here
+                state = ISOTOPE
+                current_tag.reset()
+                current_tag.append(c)
+        # Handle fixed isotope rules, which rely on `current_tag` to buffer tokens
+        elif state == ISOTOPE:
+            if c != '>':
+                current_tag.append(c)
+            else:
+                # Not technically a tag, but exploits the current buffer
+                isotopes.append(StableIsotope(''.join(current_tag)))
+                current_tag.reset()
+                state = BEFORE
+        # Handle labile modifications, which rely on `current_tag` to buffer tokens
+        elif state == LABILE:
+            if c == '{':
+                depth += 1
+            elif c == '}':
+                depth -= 1
+                if depth <= 0:
+                    depth = 0
+                    labile_modifications.append(current_tag()[0])
+                    state = BEFORE
+            else:
+                current_tag.append(c)
+        # The intermediate state between an interval tag and returning to sequence parsing.
+        # A new tag may start immediately, leading to it being appended to the interval
+        # instead of returning to the primary sequence. Because this state may also occur at the
+        # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags,
+        # charge states, and the like.
+        elif state == POST_INTERVAL_TAG:
+            if c == '[':
+                current_tag.bound()
+                state = INTERVAL_TAG
+            elif c in VALID_AA:
+                current_aa = c
+                current_interval.tags = current_tag()
+                intervals.append(current_interval)
+                current_interval = None
+                state = SEQ
+            elif c == '-':
+                state = TAG_AFTER
+                if i >= n or sequence[i] != '[':
+                    raise ProFormaError("Missing Closing Tag", i, state)
+                i += 1
+                depth = 1
+            elif c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        # An intermediate state for discriminating which type of tag-before-sequence
+        # construct we just finished parsing.
+        elif state == POST_TAG_BEFORE:
+            if c == '?':
+                unlocalized_modifications.append(current_tag()[0])
+                state = BEFORE
+            elif c == '-':
+                n_term = current_tag()
+                state = BEFORE
+            elif c == '^':
+                state = UNLOCALIZED_COUNT
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == UNLOCALIZED_COUNT:
+            if c.isdigit():
+                current_unlocalized_count.append(c)
+            elif c == '[':
+                state = TAG_BEFORE
+                depth = 1
+                tag = current_tag()[0]
+                multiplicity = current_unlocalized_count()
+                # do not loop over `i` here: it is the scan position in `sequence`
+                unlocalized_modifications.extend([tag] * multiplicity)
+            elif c == '?':
+                state = BEFORE
+                tag = current_tag()[0]
+                multiplicity = current_unlocalized_count()
+                unlocalized_modifications.extend([tag] * multiplicity)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == POST_GLOBAL:
+            if c == '@':
+                state = POST_GLOBAL_AA
+            else:
+                raise ProFormaError(
+                    ("Error In State {state}, fixed modification detected without "
+                     "target amino acids found at index {i}").format(**locals()), i, state)
+        elif state == POST_GLOBAL_AA:
+            if c in VALID_AA:
+                current_aa_targets.append(c)
+            elif c == ',':
+                # the next character should be another amino acid
+                pass
+            elif c == '>':
+                fixed_modifications.append(
+                    ModificationRule(current_tag()[0], current_aa_targets()))
+                state = BEFORE
+            else:
+                raise ProFormaError(
+                    ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state)
+        elif state == POST_TAG_AFTER:
+            if c == '/':
+                state = CHARGE_START
+                charge_buffer = NumberParser()
+            elif c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+        elif state == CHARGE_START:
+            if c in '+-':
+                charge_buffer.append(c)
+                state = CHARGE_NUMBER
+            elif c.isdigit():
+                charge_buffer.append(c)
+                state = CHARGE_NUMBER
+            elif c == '/':
+                state = ParserStateEnum.inter_chain_cross_link_start
+                raise ProFormaError("Inter-chain cross-linked peptides are not yet supported", i, state)
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == CHARGE_NUMBER:
+            if c.isdigit():
+                charge_buffer.append(c)
+            elif c == "[":
+                state = ADDUCT_START
+                adduct_buffer = StringParser()
+            else:
+                raise ProFormaError(
+                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+        elif state == ADDUCT_START:
+            if c.isdigit() or c in "+-" or c in element_symbols:
+                adduct_buffer.append(c)
+            elif c == ',':
+                adduct_buffer.bound()
+            elif c == ']':
+                state = ADDUCT_END
+        elif state == ADDUCT_END:
+            if c == '+':
+                raise ProFormaError(
+                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
+        else:
+            raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
+    if charge_buffer:
+        charge_number = charge_buffer()
+        if adduct_buffer:
+            adducts = adduct_buffer()
+        else:
+            adducts = None
+        charge_state = ChargeState(charge_number, adducts)
+    else:
+        charge_state = None
+    if current_aa:
+        positions.append((current_aa, current_tag() if current_tag else None))
+    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
+        raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state)
+    return positions, {
+        'n_term': n_term,
+        'c_term': c_term,
+        'unlocalized_modifications': unlocalized_modifications,
+        'labile_modifications': labile_modifications,
+        'fixed_modifications': fixed_modifications,
+        'intervals': intervals,
+        'isotopes': isotopes,
+        'group_ids': sorted(current_tag.group_ids),
+        'charge_state': charge_state,
+    }
+
+
+def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None,
+                labile_modifications=None, fixed_modifications=None, intervals=None,
+                isotopes=None, charge_state=None, group_ids=None):
+    '''Convert a sequence plus modifiers into formatted text following the
+    ProForma specification.
+
+    Parameters
+    ----------
+    sequence : list[tuple[str, TagBase]]
+        The primary sequence of the peptidoform/proteoform to render
+    n_term : Optional[TagBase]
+        The N-terminal modification, if any.
+    c_term : Optional[TagBase]
+        The C-terminal modification, if any.
+    unlocalized_modifications : Optional[list[TagBase]]
+        Any modifications which aren't assigned to a specific location.
+    labile_modifications : Optional[list[TagBase]]
+        Any labile modifications
+    fixed_modifications : Optional[list[ModificationRule]]
+        Any fixed modifications
+    intervals : Optional[list[TaggedInterval]]
+        A list of modified intervals, if any
+    isotopes : Optional[list[StableIsotope]]
+        Any global stable isotope labels applied
+    charge_state : Optional[ChargeState]
+        An optional charge state value
+    group_ids : Optional[list[str]]
+        Any group identifiers. This parameter is currently not used.
+
+    Returns
+    -------
+    str
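+
+    Examples
+    --------
+    A round-trip sketch through :py:func:`parse`:
+
+    >>> seq, props = parse('PEPTIDE')
+    >>> to_proforma(seq, **props)
+    'PEPTIDE'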
+    '''
+    primary = deque()
+    for aa, tags in sequence:
+        if not tags:
+            primary.append(str(aa))
+        else:
+            primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags]))
+    if intervals:
+        for iv in sorted(intervals, key=lambda x: x.start):
+            if iv.ambiguous:
+                primary[iv.start] = '(?' + primary[iv.start]
+            else:
+                primary[iv.start] = '(' + primary[iv.start]
+
+            terminator = '{0!s})'.format(primary[iv.end - 1])
+            if iv.tags:
+                terminator += ''.join('[{!s}]'.format(t) for t in iv.tags)
+            primary[iv.end - 1] = terminator
+    if n_term:
+        primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-')
+    if c_term:
+        primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term))
+    if charge_state:
+        primary.append("/{!s}".format(charge_state))
+    if labile_modifications:
+        primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications])
+    if unlocalized_modifications:
+        primary.appendleft("?")
+        primary.extendleft(['[{!s}]'.format(m) for m in unlocalized_modifications])
+    if isotopes:
+        primary.extendleft(['{!s}'.format(m) for m in isotopes])
+    if fixed_modifications:
+        primary.extendleft(['{!s}'.format(m) for m in fixed_modifications])
+    return ''.join(primary)
+
+
+class _ProFormaProperty(object):
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, cls):
+        return obj.properties[self.name]
+
+    def __set__(self, obj, value):
+        obj.properties[self.name] = value
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.name!r})"
+        return template.format(self=self)
+
+
+class ProForma(object):
+    '''Represent a parsed ProForma sequence.
+
+    The preferred way to instantiate this class is via the :meth:`parse`
+    method.
+
+    Attributes
+    ----------
+    sequence : list[tuple[str, List[TagBase]]]
+        The list of (amino acid, tag collection) pairs making up the primary sequence of the
+        peptide.
+    isotopes : list[StableIsotope]
+        A list of any stable isotope rules that apply to this peptide
+    charge_state : ChargeState, optional
+        An optional charge state that may have been provided
+    intervals : list[TaggedInterval]
+        Any annotated intervals that contain either sequence ambiguity or a
+        tag over that interval.
+    labile_modifications : list[ModificationBase]
+        Any modifications that were parsed as labile, and may not appear at
+        any location on the peptide primary sequence.
+    unlocalized_modifications : list[ModificationBase]
+        Any modifications that were not localized but may be attached to peptide
+        sequence evidence.
+    n_term : list[ModificationBase]
+        Any modifications on the N-terminus of the peptide
+    c_term : list[ModificationBase]
+        Any modifications on the C-terminus of the peptide
+    group_ids : set
+        The collection of all group identifiers on this sequence.
+    mass : float
+        The computed mass for the fully modified peptide, including labile
+        and unlocalized modifications. **Does not include stable isotopes at this time**
+    '''
+
+    def __init__(self, sequence, properties):
+        self.sequence = sequence
+        self.properties = properties
+
+    isotopes = _ProFormaProperty('isotopes')
+    charge_state = _ProFormaProperty('charge_state')
+
+    intervals = _ProFormaProperty('intervals')
+    fixed_modifications = _ProFormaProperty('fixed_modifications')
+    labile_modifications = _ProFormaProperty('labile_modifications')
+    unlocalized_modifications = _ProFormaProperty('unlocalized_modifications')
+
+    n_term = _ProFormaProperty('n_term')
+    c_term = _ProFormaProperty('c_term')
+
+    group_ids = _ProFormaProperty('group_ids')
+
+    def __str__(self):
+        return to_proforma(self.sequence, **self.properties)
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.sequence}, {self.properties})".format(self=self)
+
+    def __len__(self):
+        return len(self.sequence)
+
+    def __getitem__(self, i):
+        if isinstance(i, slice):
+            props = self.properties.copy()
+            ivs = []
+            for iv in props['intervals']:
+                iv = iv._update_coordinates_sliced(
+                    i.start, i.stop)
+                if iv is None:
+                    continue
+                ivs.append(iv)
+            props['intervals'] = ivs
+
+            if not (i.start is None or i.start == 0):
+                props['n_term'] = None
+            n = len(self)
+            if not (i.stop is None or i.stop >= n):
+                props['c_term'] = None
+
+            return self.__class__(self.sequence[i], props)
+        else:
+            return self.sequence[i]
+
+    def __eq__(self, other):
+        if isinstance(other, str):
+            return str(self) == other
+        elif other is None:
+            return False
+        else:
+            return self.sequence == other.sequence and self.properties == other.properties
+
+    def __ne__(self, other):
+        return not self == other
+
+    @classmethod
+    def parse(cls, string):
+        '''Parse a ProForma string.
+
+        Parameters
+        ----------
+        string : str
+            The string to parse
+
+        Returns
+        -------
+        ProForma
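+
+        Examples
+        --------
+        A minimal usage sketch:
+
+        >>> p = ProForma.parse("EM[Oxidation]EVEESPEK")
+        >>> len(p)
+        10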
+        '''
+        return cls(*parse(string))
+
+    @property
+    def mass(self):
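+        '''The monoisotopic mass of the fully modified peptide, summed from
+        ``std_aa_mass``, the terminal groups, and all resolvable tag masses.
+        Stable isotope labels are not applied.'''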
+        mass = 0.0
+
+        fixed_modifications = self.properties['fixed_modifications']
+        fixed_rules = {}
+        for rule in fixed_modifications:
+            for aa in rule.targets:
+                fixed_rules[aa] = rule.modification_tag.mass
+
+        for position in self.sequence:
+            aa = position[0]
+            try:
+                mass += std_aa_mass[aa]
+            except KeyError:
+                warnings.warn("%r does not have an exact mass" % (aa, ))
+            if aa in fixed_rules:
+                mass += fixed_rules[aa]
+            tags = position[1]
+            if tags:
+                for tag in tags:
+                    try:
+                        mass += tag.mass
+                    except (AttributeError, KeyError):
+                        continue
+        for mod in self.properties['labile_modifications']:
+            mass += mod.mass
+        for mod in self.properties['unlocalized_modifications']:
+            mass += mod.mass
+        if self.properties.get('n_term'):
+            for mod in self.properties['n_term']:
+                try:
+                    mass += mod.mass
+                except (AttributeError, KeyError):
+                    continue
+        mass += calculate_mass(formula="H")
+        if self.properties.get('c_term'):
+            for mod in self.properties['c_term']:
+                try:
+                    mass += mod.mass
+                except (AttributeError, KeyError):
+                    continue
+
+        mass += calculate_mass(formula="OH")
+        for iv in self.properties['intervals']:
+            # a TaggedInterval carries a list of tags, not a single tag
+            for tag in (iv.tags or ()):
+                try:
+                    mass += tag.mass
+                except (AttributeError, KeyError):
+                    continue
+        return mass
+
+    def fragments(self, ion_shift, charge=1, reverse=None, include_labile=True, include_unlocalized=True):
+        """
+        The function generates all possible fragments of the requested
+        series type.
+
+        Parameters
+        ----------
+        ion_shift : float or str
+            The mass shift of the ion series, or the name of the ion series
+        charge : int
+            The charge state of the theoretical fragment masses to generate.
+            Defaults to 1+. If 0 is passed, neutral masses will be returned.
+        reverse : bool, optional
+            Whether to fragment from the N-terminus (``False``) or C-terminus (``True``).
+            If ``ion_shift`` is a :class:`str`, the terminal will be inferred from
+            the series name. Otherwise, defaults to ``False``.
+        include_labile : bool, optional
+            Whether or not to include dissociated modification masses.
+            Defaults to ``True``
+        include_unlocalized : bool, optional
+            Whether or not to include unlocalized modification masses.
+            Defaults to ``True``
+
+        Returns
+        -------
+        np.ndarray
+
+        Examples
+        --------
+
+        >>> p = proforma.ProForma.parse("PEPTIDE")
+        >>> p.fragments('b', charge=1)
+        array([ 98.06004032, 227.1026334 , 324.15539725, 425.20307572,
+                538.2871397 , 653.31408272])
+        >>> p.fragments('y', charge=1)
+        array([148.06043424, 263.08737726, 376.17144124, 477.21911971,
+               574.27188356, 703.31447664])
+
+        """
+        if isinstance(ion_shift, str):
+            if ion_shift[0] in 'xyz':
+                reverse = True
+            ion_shift = std_ion_comp[ion_shift].mass(absolute=False)
+
+        n = len(self.sequence)
+        masses = _array('d')
+
+        mass = 0
+        mass += ion_shift
+
+        fixed_modifications = self.properties['fixed_modifications']
+        fixed_rules = {}
+        for rule in fixed_modifications:
+            for aa in rule.targets:
+                fixed_rules[aa] = rule.modification_tag.mass
+
+        intervals = self.intervals
+        if intervals:
+            intervals = sorted(intervals, key=lambda x: x.start)
+        intervals = deque(intervals)
+
+        if not include_labile:
+            for mod in self.properties['labile_modifications']:
+                mass += mod.mass
+
+        if not reverse:
+            if self.properties.get('n_term'):
+                for mod in self.properties['n_term']:
+                    try:
+                        mass += mod.mass
+                    except (AttributeError, KeyError):
+                        continue
+        else:
+            if self.properties.get('c_term'):
+                for mod in self.properties['c_term']:
+                    try:
+                        mass += mod.mass
+                    except (AttributeError, KeyError):
+                        continue
+
+        if include_unlocalized:
+            for mod in self.properties['unlocalized_modifications']:
+                mass += mod.mass
+
+        mass += _WATER_MASS
+
+        if not reverse:
+            iterator = iter(range(0, n - 1))
+        else:
+            iterator = reversed(range(1, n))
+
+        for i in iterator:
+            position = self.sequence[i]
+
+            aa = position[0]
+            try:
+                mass += std_aa_mass[aa]
+            except KeyError:
+                warnings.warn("%r does not have an exact mass" % (aa, ))
+
+            if aa in fixed_rules:
+                mass += fixed_rules[aa]
+
+            tags = position[1]
+            if tags:
+                for tag in tags:
+                    try:
+                        mass += tag.mass
+                    except (AttributeError, KeyError):
+                        continue
+
+            while intervals and intervals[0].contains(i):
+                iv = intervals.popleft()
+                # a TaggedInterval carries a list of tags, not a single tag
+                for tag in (iv.tags or ()):
+                    try:
+                        mass += tag.mass
+                    except (AttributeError, KeyError):
+                        continue
+
+            masses.append(mass)
+
+        if np is not None:
+            masses = np.asarray(masses)
+            if charge != 0:
+                return mass_charge_ratio(masses, charge)
+            return masses
+        if charge != 0:
+            for i, mass in enumerate(masses):
+                masses[i] = mass_charge_ratio(mass, charge)
+        return masses
+
+    def find_tags_by_id(self, tag_id, include_position=True):
+        '''Find all occurrences of a particular tag ID
+
+        Parameters
+        ----------
+        tag_id : str
+            The tag ID to search for
+        include_position : bool
+            Whether or not to return the locations for matched
+            tag positions
+
+        Returns
+        -------
+        list[tuple[Any, TagBase]] or list[TagBase]
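+
+        Examples
+        --------
+        A hedged sketch for a sequence carrying a ``#g1`` group (the exact
+        tag objects depend on the tag classes defined above):
+
+        >>> p = ProForma.parse('EM[Oxidation#g1]EVEESPEK')
+        >>> [i for i, tag in p.find_tags_by_id('g1')]  # doctest: +SKIP
+        [1]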
+        '''
+        if not tag_id.startswith("#"):
+            tag_id = "#" + tag_id
+        matches = []
+        for i, (_token, tags) in enumerate(self.sequence):
+            if tags:
+                for tag in tags:
+                    if tag.group_id == tag_id:
+                        if include_position:
+                            matches.append((i, tag))
+                        else:
+                            matches.append(tag)
+        for iv in self.properties['intervals']:
+            # a TaggedInterval carries a list of tags, not a single tag
+            for tag in (iv.tags or ()):
+                if tag.group_id == tag_id:
+                    matches.append((iv, tag) if include_position else tag)
+        for ulmod in self.properties['unlocalized_modifications']:
+            if ulmod.group_id == tag_id:
+                matches.append(('unlocalized_modifications', ulmod)
+                               if include_position else ulmod)
+        for lamod in self.properties['labile_modifications']:
+            if lamod.group_id == tag_id:
+                matches.append(('labile_modifications', lamod)
+                               if include_position else lamod)
+        return matches
+
+    @property
+    def tags(self):
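+        '''All tags attached to specific positions along the primary sequence.'''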
+        return [tag for _, tags_at in self.sequence if tags_at for tag in tags_at]
diff --git a/pyteomics/protxml.py b/pyteomics/protxml.py
new file mode 100644
index 0000000000000000000000000000000000000000..51dea034a6c61a414e04b3847841e4e2c4a8d1c2
--- /dev/null
+++ b/pyteomics/protxml.py
@@ -0,0 +1,309 @@
+"""
+protxml - parsing of ProteinProphet output files
+================================================
+
+Summary
+-------
+
+**protXML** is the output format of the `ProteinProphet software <http://proteinprophet.sourceforge.net/>`_.
+It contains information about identified proteins and their statistical significance.
+
+This module provides minimalistic infrastructure for access to data stored in
+protXML files. The central class is :py:class:`ProtXML`, which
+reads protein entries and related information and saves them into
+Python dicts.
+
+Data access
+-----------
+
+  :py:class:`ProtXML` - a class representing a single protXML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through protein groups in a protXML
+  file. Calling the function is synonymous to instantiating the :py:class:`ProtXML` class.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`DataFrame` - read protXML files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+  :py:func:`filter` - filter protein groups from a chain of protXML files to a specific FDR
+  using TDA.
+
+  :py:func:`filter.chain` - chain a series of filters applied independently to
+  several files.
+
+  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+  independently to an iterable of files.
+
+  :py:func:`filter_df` - filter protXML files and return a :py:class:`pandas.DataFrame`.
+
+  :py:func:`fdr` - estimate the false discovery rate of a set of protein groups using the
+  target-decoy approach.
+
+  :py:func:`qvalues` - get an array of scores and *q* values for protein groups using the target-decoy approach.
+
+  :py:func:`is_decoy` - determine whether a protein group is decoy or not. This function may not suit your use case.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+--------------------------------------------------------------------------------
+"""
+
+#   Copyright 2018 Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+from . import xml, auxiliary as aux, _schema_defaults
+import operator as op
+
+class ProtXML(xml.MultiProcessingXML):
+    """Parser class for protXML files."""
+    file_format = 'protXML'
+    _root_element = 'protein_summary'
+    _default_schema = _schema_defaults._protxml_schema_defaults
+    # _default_version = None
+    _default_iter_tag = 'protein_group'
+    _indexed_tag_keys = {'protein_group': 'group_number'}
+    _default_id_attr = 'group_number'
+    _indexed_tags = {'protein_group'}
+    _structures_to_flatten = {'annotation'}
+    # attributes which contain unconverted values
+    _convert_items = {'float':  {'pct_spectrum_ids'},
+        'int': {'group_number', 'prot_length'},
+        'bool': {'is_contributing_evidence', 'is_nondegenerate_evidence'}
+        }.items()
+
+    def _get_info_smart(self, element, **kwargs):
+        """Extract the info in a smart way depending on the element type"""
+        try:
+            name = kwargs.pop('ename')
+        except KeyError:
+            name = xml._local_name(element)
+        rec = kwargs.pop('recursive', None)
+        if name == self._root_element:
+            info = self._get_info(element, ename=name,
+                    recursive=(rec if rec is not None else False),
+                    **kwargs)
+        else:
+            info = self._get_info(element, ename=name,
+                    recursive=(rec if rec is not None else True),
+                    **kwargs)
+
+        converters = {'float': float, 'int': int,
+                'bool': lambda x: x.lower() in {'1', 'true', 'y'}}
+        for k, v in dict(info).items():
+            for t, s in self._convert_items:
+                if k in s:
+                    del info[k]
+                    info[k] = converters[t](v)
+        p = info.get('parameter')
+        if isinstance(p, list) and len(p) == 1 and isinstance(p[0], dict):
+            info.update(info.pop('parameter')[0])
+
+        if 'modification_info' in info:
+            # this is a list with one element
+            info.update(info.pop('modification_info')[0])
+
+        if 'unique_stripped_peptides' in info:
+            info['unique_stripped_peptides'] = info['unique_stripped_peptides'].split('+')
+        return info
+
+def read(source, read_schema=False, iterative=True, **kwargs):
+    """Parse `source` and iterate through protein groups.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target protXML file or the file object itself.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the protXML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you want to avoid the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    Returns
+    -------
+    out : ProtXML
+       An iterator over dicts with protein group properties.
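+
+    Examples
+    --------
+    A usage sketch; ``'interact.prot.xml'`` is a placeholder file name:
+
+    >>> with read('interact.prot.xml') as groups:  # doctest: +SKIP
+    ...     for group in groups:
+    ...         print(group['group_number'], group['probability'])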
+    """
+
+    return ProtXML(source, read_schema=read_schema, iterative=iterative, **kwargs)
+
+
+# chain = aux._make_chain(read, 'read')
+chain = aux.ChainBase._make_chain(ProtXML)
+
+
+def _is_decoy_prefix(pg, prefix='DECOY_'):
+    """Determine if a protein group should be considered decoy.
+
+    This function checks that all protein names in a group start with `prefix`.
+    You may need to provide your own function for correct filtering and FDR estimation.
+
+    Parameters
+    ----------
+
+    pg : dict
+        A protein group dict produced by the :py:class:`ProtXML` parser.
+    prefix : str, optional
+        A prefix used to mark decoy proteins. Default is `'DECOY_'`.
+
+    Returns
+    -------
+
+    out : bool
+    """
+    return all(p['protein_name'].startswith(prefix) for p in pg['protein'])
+
+def _is_decoy_suffix(pg, suffix='_DECOY'):
+    """Determine if a protein group should be considered decoy.
+
+    This function checks that all protein names in a group end with `suffix`.
+    You may need to provide your own function for correct filtering and FDR estimation.
+
+    Parameters
+    ----------
+
+    pg : dict
+        A protein group dict produced by the :py:class:`ProtXML` parser.
+    suffix : str, optional
+        A suffix used to mark decoy proteins. Default is `'_DECOY'`.
+
+    Returns
+    -------
+
+    out : bool
+    """
+    return all(p['protein_name'].endswith(suffix) for p in pg['protein'])
+
+is_decoy = _is_decoy_prefix
+
+fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix)
+_key = op.itemgetter('probability')
+qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, _key)
+filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, _key, qvalues)
+filter.chain = aux._make_chain(filter, 'filter', True)
+
+def DataFrame(*args, **kwargs):
+    """Read protXML output files into a :py:class:`pandas.DataFrame`.
+
+    .. note :: Rows in the DataFrame correspond to individual proteins, not protein groups.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    sep : str or None, keyword only, optional
+        Some values related to protein groups are variable-length lists.
+        If `sep` is a :py:class:`str`, they will be packed into single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+
+    pd_kwargs : dict, optional
+        Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor.
+
+    *args
+        Passed to :py:func:`chain`.
+
+    **kwargs
+        Passed to :py:func:`chain`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
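+
+    Examples
+    --------
+    A sketch; ``'interact.prot.xml'`` is a placeholder file name:
+
+    >>> df = DataFrame('interact.prot.xml', sep=';')  # doctest: +SKIP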
+    """
+    import pandas as pd
+    kwargs = kwargs.copy()
+    sep = kwargs.pop('sep', None)
+    pd_kwargs = kwargs.pop('pd_kwargs', {})
+    def gen_items():
+        with chain(*args, **kwargs) as f:
+            for item in f:
+                info = {}
+                for k, v in item.items():
+                    if isinstance(v, (str, int, float)):
+                        info[k] = v
+                if 'protein' in item:
+                    for prot in item['protein']:
+                        out = dict(info)
+                        out.update(prot)
+                        if 'unique_stripped_peptides' in out:
+                            if sep is not None:
+                                out['unique_stripped_peptides'] = sep.join(out['unique_stripped_peptides'])
+                        if 'indistinguishable_protein' in out:
+                            if sep is None:
+                                out['indistinguishable_protein'] = [p['protein_name'] for p in out['indistinguishable_protein']]
+                            else:
+                                out['indistinguishable_protein'] = sep.join(p['protein_name'] for p in out['indistinguishable_protein'])
+                        yield out
+    return pd.DataFrame(gen_items(), **pd_kwargs)
+
+
+def filter_df(*args, **kwargs):
+    """Read protXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
+    Positional arguments can be protXML files or DataFrames.
+
+    .. note :: Rows in the DataFrame correspond to individual proteins, not protein groups.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    key : str / iterable / callable, keyword only, optional
+        Default is 'probability'.
+    is_decoy : str / iterable / callable, keyword only, optional
+        Default is to check that "protein_name" starts with `'DECOY_'`.
+    reverse : bool, keyword only, optional
+        Should be :py:const:`True` if higher score is better.
+        Default is :py:const:`True` (because the default key is 'probability').
+    *args
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+    **kwargs
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
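+
+    Examples
+    --------
+    Filtering to 1% FDR by protein probability (the file name is a placeholder):
+
+    >>> flt = filter_df('interact.prot.xml', fdr=0.01)  # doctest: +SKIP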
+    """
+    import pandas as pd
+    kwargs.setdefault('key', 'probability')
+    kwargs.setdefault('reverse', True)
+    if all(isinstance(arg, pd.DataFrame) for arg in args):
+        if len(args) > 1:
+            df = pd.concat(args)
+        else:
+            df = args[0]
+    else:
+        read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs}
+        df = DataFrame(*args, **read_kw)
+    if 'is_decoy' not in kwargs:
+        if 'decoy_suffix' in kwargs:
+            kwargs['is_decoy'] = df['protein_name'].str.endswith(kwargs['decoy_suffix'])
+        else:
+            kwargs['is_decoy'] = df['protein_name'].str.startswith(kwargs.get('decoy_prefix', 'DECOY_'))
+    return aux.filter(df, **kwargs)
diff --git a/pyteomics/pylab_aux.py b/pyteomics/pylab_aux.py
new file mode 100644
index 0000000000000000000000000000000000000000..c52e17e904ee3827d44b9e6ca2436818b6b122f2
--- /dev/null
+++ b/pyteomics/pylab_aux.py
@@ -0,0 +1,831 @@
+"""
+pylab_aux - auxiliary functions for plotting with pylab
+=======================================================
+
+This module serves as a collection of useful routines for data plotting with
+matplotlib.
+
+Generic plotting
+----------------
+
+  :py:func:`plot_line` - plot a line.
+
+  :py:func:`scatter_trend` - plot a scatter plot with a regression line.
+
+  :py:func:`plot_function_3d` - plot a 3D graph of a function of two variables.
+
+  :py:func:`plot_function_contour` - plot a contour graph of a function of
+  two variables.
+
+Spectrum visualization
+----------------------
+
+  :py:func:`plot_spectrum` - plot a single spectrum (m/z vs intensity).
+
+  :py:func:`annotate_spectrum` - plot and annotate peaks in MS/MS spectrum.
+
+  :py:func:`mirror` - create a mirror plot of two spectra (using :py:mod:`spectrum_utils`).
+
+FDR control
+-----------
+
+  :py:func:`plot_qvalue_curve` - plot the dependence of q-value on the amount of PSMs
+  (similar to a ROC curve).
+
+See also
+--------
+
+  - `Matplotlib cookbook <http://www.scipy.org/Cookbook/Matplotlib/>`_
+  - `Matplotlib tutorial
+    <http://matplotlib.sourceforge.net/mpl_toolkits/mplot3d/tutorial.html>`_
+
+Dependencies
+------------
+
+This module requires :py:mod:`matplotlib`. Optional dependencies: :py:mod:`adjustText`, :py:mod:`spectrum_utils`.
+
+-------------------------------------------------------------------------------
+
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import pylab
+import numpy as np
+from .auxiliary import linear_regression, PyteomicsError
+from .version import VersionInfo
+from . import parser, mass, mgf, proforma
+
+try:
+    import spectrum_utils
+    if VersionInfo(spectrum_utils.__version__) < VersionInfo('0.4'):
+        raise ImportError("Supported spectrum_utils version is 0.4.0 or newer.")
+    import spectrum_utils.spectrum as sus
+    import spectrum_utils.plot as sup
+except ImportError:
+    sus = sup = None
+
+
+def plot_line(a, b, xlim=None, *args, **kwargs):
+    """Plot a line y = a * x + b.
+
+    Parameters
+    ----------
+    a : float
+        The slope of the line.
+    b : float
+        The intercept of the line.
+    xlim : tuple, optional
+        Minimal and maximal values of `x`. If not given, :py:func:`pylab.xlim` will be called.
+    *args
+        Passed to :py:func:`pylab.plot` after `x` and `y` values.
+    **kwargs
+        Passed to :py:func:`pylab.plot`.
+
+    Returns
+    -------
+    out : matplotlib.lines.Line2D
+        The line object.
+    """
+    if xlim is None:
+        xlim = pylab.xlim()
+    return pylab.plot([xlim[0], xlim[1]], [a * xlim[0] + b, a * xlim[1] + b], *args, **kwargs)
+
+
+def scatter_trend(x, y=None, **kwargs):
+    """Make a scatter plot with a linear regression.
+
+    Parameters
+    ----------
+    x : array_like of float
+        1-D array of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
+    y : array_like of float, optional
+        1-D array of floats. If `y` is omitted or :py:const:`None`, `x` must be a 2-D array of shape (N, 2).
+    plot_trend : bool, optional
+        If :py:const:`True` then plot a trendline (default).
+    plot_sigmas : bool, optional
+        If :py:const:`True` then plot confidence intervals of the linear fit.
+        :py:const:`False` by default.
+    show_legend : bool, optional
+        If :py:const:`True`, a legend will be shown with linear fit equation,
+        correlation coefficient, and standard deviation from the fit. Default is
+        :py:const:`True`.
+    title : str, optional
+        The title. Empty by default.
+    xlabel, ylabel : str, optional
+        The axes labels. Empty by default.
+    alpha_legend : float, optional
+        Legend box transparency. 1.0 by default
+    scatter_kwargs : dict, optional
+        Keyword arguments for :py:func:`pylab.scatter`.
+        Empty by default.
+    plot_kwargs : dict, optional
+        Keyword arguments for :py:func:`plot_line`.
+        By default, sets `xlim` and `label`.
+    legend_kwargs : dict, optional
+        Keyword arguments for :py:func:`pylab.legend`.
+        Default is :py:const:`{'loc': 'upper left'}`.
+    sigma_kwargs : dict, optional
+        Keyword arguments for :py:func:`pylab.plot` used for sigma lines.
+        Default is :py:const:`{'color': 'red', 'linestyle': 'dashed'}`.
+    sigma_values : iterable, optional
+        Each value will be multiplied with standard error of the fit, and the line
+        shifted by the resulting value will be plotted. Default is :py:const:`range(-3, 4)`.
+    regression : callable, optional
+        Function to perform linear regression. Will be given ``x`` and ``y`` as arguments.
+        Must return a 4-tuple: (a, b, r, stderr).
+        Default is :py:func:`pyteomics.auxiliary.linear_regression`.
+
+    Returns
+    -------
+    out : tuple
+        A (scatter_plot, trend_line, sigma_lines, legend) tuple.
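+
+    Examples
+    --------
+    A minimal sketch with synthetic data:
+
+    >>> import numpy as np
+    >>> x = np.linspace(0, 1, 50)
+    >>> y = 2 * x + 1 + np.random.normal(scale=0.1, size=x.size)
+    >>> sc, line, sigmas, legend = scatter_trend(x, y)  # doctest: +SKIP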
+    """
+    regression = kwargs.get('regression', linear_regression)
+    a, b, r, stderr = regression(x, y)
+    pylab.title(kwargs.get('title', ''))
+    pylab.xlabel(kwargs.get('xlabel', ''))
+    pylab.ylabel(kwargs.get('ylabel', ''))
+
+    # raw strings keep the LaTeX escape sequences literal
+    equation = (
+        r'$y\,=\,{:.3f}x\,{}\,{:.3f}$, '
+        r'$R^2=\,{:.3f}$ ' '\n' r'$\sigma\,=\,{:.3f}$'.format(
+            a, '-' if b < 0 else '+', abs(b), r * r, stderr))
+
+    if y is None:
+        x = np.array(x, copy=False)
+        y = x[:, 1]
+        x = x[:, 0]
+    else:
+        x = np.array(x)
+        y = np.array(y)
+    sc = pylab.scatter(x, y, **kwargs.get('scatter_kwargs', {}))
+    xlim = (x.min(), x.max())
+    plkw = kwargs.get('plot_kwargs', {}).copy()
+    plkw.setdefault('xlim', xlim)
+    plkw.setdefault('label', equation)
+    if kwargs.get('plot_trend', True):
+        line = plot_line(a, b, **plkw)
+    else:
+        line = None
+
+    if kwargs.get('plot_sigmas', False):
+        s_lines = []
+        sigma_kwargs = kwargs.get('sigma_kwargs', {'color': 'red', 'linestyle': 'dashed'})
+        for i in kwargs.get('sigma_values', range(-3, 4)):
+            s_lines.append(plot_line(a, b + i * stderr, xlim, **sigma_kwargs))
+    else:
+        s_lines = None
+
+    if kwargs.get('show_legend', True):
+        legend = pylab.legend(**kwargs.get('legend_kwargs', {'loc': 'upper left'}))
+        legend_frame = legend.get_frame()
+        legend_frame.set_alpha(kwargs.get('alpha_legend', 1.0))
+    else:
+        legend = None
+    return sc, line, s_lines, legend
+
+
+def plot_function_3d(x, y, function, **kwargs):
+    """Plot values of a function of two variables in 3D.
+
+    More on 3D plotting in pylab:
+
+    http://www.scipy.org/Cookbook/Matplotlib/mplot3D
+
+    Parameters
+    ----------
+    x : array_like of float
+        The plotting range on X axis.
+    y : array_like of float
+        The plotting range on Y axis.
+    function : function
+        The function to plot.
+    plot_type : {'surface', 'wireframe', 'scatter', 'contour', 'contourf'}, keyword only, optional
+        The type of a plot, see
+        `scipy cookbook <http://www.scipy.org/Cookbook/Matplotlib/mplot3D>`_
+        for examples. The default value is 'surface'.
+    num_contours : int
+        The number of contours to plot, 50 by default.
+    xlabel : str, keyword only, optional
+        The X axis label. Empty by default.
+    ylabel : str, keyword only, optional
+        The Y axis label. Empty by default.
+    zlabel : str, keyword only, optional
+        The Z axis label. Empty by default.
+    title : str, keyword only, optional
+        The title. Empty by default.
+    **kwargs
+        Passed to the respective plotting function.
+    """
+    import mpl_toolkits.mplot3d.axes3d as pylab3d
+    ax = pylab3d.Axes3D(pylab.gcf())
+    ax.set_xlabel(kwargs.pop('xlabel', ''))
+    ax.set_ylabel(kwargs.pop('ylabel', ''))
+    ax.set_zlabel(kwargs.pop('zlabel', ''))
+    ax.set_title(kwargs.pop('title', ''))
+    X, Y = np.meshgrid(x, y)
+    Z = []
+    for y_value in y:
+        Z.append([])
+        for x_value in x:
+            Z[-1].append(function(x_value, y_value))
+    Z = np.array(Z)
+    plot_type = kwargs.pop('plot_type', 'surface')
+    if plot_type == 'surface':
+        ax.plot_surface(X, Y, Z,
+                rstride=kwargs.pop('rstride', 1),
+                cstride=kwargs.pop('cstride', 1),
+                cmap=kwargs.pop('cmap', pylab.cm.jet),
+                **kwargs)
+    elif plot_type == 'wireframe':
+        ax.plot_wireframe(X, Y, Z,
+                cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs)
+    elif plot_type == 'scatter':
+        ax.scatter3D(np.ravel(X), np.ravel(Y), np.ravel(Z), **kwargs)
+    elif plot_type == 'contour':
+        num_contours = kwargs.pop('num_contours', 50)
+        ax.contour3D(X, Y, Z, num_contours,
+                cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs)
+    elif plot_type == 'contourf':
+        num_contours = kwargs.pop('num_contours', 50)
+        ax.contourf3D(X, Y, Z, num_contours,
+                cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs)
+    else:
+        raise PyteomicsError('Unknown plot type: {}'.format(plot_type))
+
+
+def plot_function_contour(x, y, function, **kwargs):
+    """Make a contour plot of a function of two variables.
+
+    Parameters
+    ----------
+    x, y : array_like of float
+        The positions of the nodes of a plotting grid.
+    function : function
+        The function to plot.
+    filling : bool
+        Fill contours if True (default).
+    num_contours : int
+        The number of contours to plot, 50 by default.
+    xlabel, ylabel : str, optional
+        The axes labels. Empty by default.
+    title : str, optional
+        The title. Empty by default.
+    **kwargs
+        Passed to :py:func:`pylab.contour` or :py:func:`pylab.contourf`.
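+
+    Examples
+    --------
+    A minimal usage sketch; the grid and the plotted function here are purely
+    illustrative::
+
+        >>> import numpy as np
+        >>> x = y = np.linspace(-1., 1., 100)
+        >>> plot_function_contour(x, y, lambda a, b: a * a + b * b, filling=False)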
+    """
+    pylab.xlabel(kwargs.pop('xlabel', ''))
+    pylab.ylabel(kwargs.pop('ylabel', ''))
+    pylab.title(kwargs.pop('title', ''))
+    X, Y = np.meshgrid(x, y)
+    Z = []
+    for y_value in y:
+        Z.append([])
+        for x_value in x:
+            Z[-1].append(function(x_value, y_value))
+    Z = np.array(Z)
+    num_contours = kwargs.pop('num_contours', 50)
+    if kwargs.pop('filling', True):
+        pylab.contourf(X, Y, Z, num_contours,
+                cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs)
+    else:
+        pylab.contour(X, Y, Z, num_contours,
+                cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs)
+
+
+def plot_qvalue_curve(qvalues, *args, **kwargs):
+    """
+    Plot a curve with q-values on the X axis and corresponding PSM number
+    (starting with ``1``) on the Y axis.
+
+    Parameters
+    ----------
+    qvalues : array-like
+        An array of q-values for sorted PSMs.
+    xlabel : str, keyword only, optional
+        Label for the X axis. Default is "q-value".
+    ylabel : str, keyword only, optional
+        Label for the Y axis. Default is "# of PSMs".
+    title : str, keyword only, optional
+        The title. Empty by default.
+    *args
+        Given to :py:func:`pylab.plot` after `x` and `y`.
+    **kwargs
+        Given to :py:func:`pylab.plot`.
+
+    Returns
+    -------
+    out : matplotlib.lines.Line2D
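+
+    Examples
+    --------
+    A sketch with synthetic q-values; in practice the array would come from a
+    function such as :py:func:`pyteomics.tandem.qvalues`::
+
+        >>> import numpy as np
+        >>> qvals = np.sort(np.random.uniform(0., 0.1, 1000))
+        >>> plot_qvalue_curve(qvals, title='q-value curve')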
+    """
+    pylab.xlabel(kwargs.pop('xlabel', 'q-value'))
+    pylab.ylabel(kwargs.pop('ylabel', '# of PSMs'))
+    pylab.title(kwargs.pop('title', ''))
+    return pylab.plot(qvalues, 1 + np.arange(qvalues.size), *args, **kwargs)
+
+
+def _default_plot_spectrum(spectrum, *args, **kwargs):
+    ax = kwargs.pop('ax', None) or pylab.gca()
+    if kwargs.pop('centroided', True):
+        kwargs.setdefault('align', 'center')
+        kwargs.setdefault('width', 0)
+        kwargs.setdefault('linewidth', 1)
+        kwargs.setdefault('edgecolor', 'k')
+        ax.bar(spectrum['m/z array'], spectrum['intensity array'], *args, **kwargs)
+    else:
+        ax.plot(spectrum['m/z array'], spectrum['intensity array'], *args, **kwargs)
+    return ax
+
+
+def _spectrum_utils_plot(spectrum, *args, **kwargs):
+
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        spectrum = _spectrum_utils_create_spectrum(spectrum, None, *args, **kwargs)
+        return sup.spectrum(spectrum)
+
+
+def _spectrum_utils_iplot(spectrum, *args, **kwargs):
+    import spectrum_utils.iplot as supi
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        spectrum = _spectrum_utils_create_spectrum(spectrum, None, *args, **kwargs)
+        return supi.spectrum(spectrum)
+
+
+_plot_backends = {
+    'default': _default_plot_spectrum,
+    'spectrum_utils': _spectrum_utils_plot,
+    'spectrum_utils.iplot': _spectrum_utils_iplot,
+}
+
+
+def plot_spectrum(spectrum, *args, **kwargs):
+    """
+    Plot a spectrum, assuming it is a dictionary containing "m/z array" and "intensity array".
+
+    Parameters
+    ----------
+    spectrum : dict
+        A dictionary, as returned by pyteomics MS data parsers.
+        Must contain "m/z array" and "intensity array" keys with decoded arrays.
+    backend : str, keyword only, optional
+        One of `{'default', 'spectrum_utils', 'spectrum_utils.iplot'}`.
+        The `spectrum_utils` backend requires installing :py:mod:`spectrum_utils`.
+        The `spectrum_utils.iplot` backend requires installing :py:mod:`spectrum_utils[iplot]`.
+    xlabel : str, keyword only, optional
+        Label for the X axis. Default is "m/z".
+    ylabel : str, keyword only, optional
+        Label for the Y axis. Default is "intensity".
+    title : str, keyword only, optional
+        The title. Empty by default.
+
+    centroided : bool, keyword only, optional
+        Works only for the `default` backend.
+        If :py:const:`True` (default), peaks of the spectrum are plotted using :py:func:`pylab.bar`.
+        If :py:const:`False`, the arrays are simply plotted using :py:func:`pylab.plot`.
+    *args
+        When using `default` backend: given to :py:func:`pylab.plot` or :py:func:`pylab.bar` (depending on `centroided`).
+    **kwargs
+        When using `default` backend: given to :py:func:`pylab.plot` or :py:func:`pylab.bar` (depending on `centroided`).
+
+    min_intensity : float, keyword only, optional
+        Remove low-intensity peaks; this is a factor of maximum peak intensity. Default is 0 (no filtering).
+        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    max_num_peaks : int or None, keyword only, optional
+        Remove low-intensity peaks; this is the number of peaks to keep. Default is :py:const:`None` (no filtering).
+        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    scaling : one of `{'root', 'log', 'rank'}` or None, keyword only, optional
+        Scaling to apply to peak intensities. Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    max_intensity : float or None, keyword only, optional
+        Intensity of the most intense peak relative to which the peaks will be scaled
+        (the default is :py:const:`None`, which means that no scaling
+        relative to the most intense peak will be performed).
+        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+
+    Returns
+    -------
+    out : matplotlib.pyplot.Axes
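+
+    Examples
+    --------
+    A sketch using the default backend; the MGF file name is hypothetical::
+
+        >>> from pyteomics import mgf
+        >>> with mgf.read('spectra.mgf') as reader:
+        ...     spectrum = next(reader)
+        >>> plot_spectrum(spectrum, title='MS/MS spectrum')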
+    """
+    bname = kwargs.pop('backend', 'default')
+    backend = _plot_backends.get(bname)
+    if backend is None:
+        raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format(
+            bname, '; '.join(_plot_backends)))
+
+    pylab.xlabel(kwargs.pop('xlabel', 'm/z'))
+    pylab.ylabel(kwargs.pop('ylabel', 'intensity'))
+    if 'title' in kwargs:
+        pylab.title(kwargs.pop('title'))
+    return backend(spectrum, *args, **kwargs)
+
+
+def _default_annotate_spectrum(spectrum, peptide, *args, **kwargs):
+
+    # common kwargs
+    types = kwargs.pop('ion_types', ('b', 'y'))
+    aa_mass = kwargs.pop('aa_mass', mass.std_aa_mass)
+    mass_data = kwargs.pop('mass_data', mass.nist_mass)
+    ion_comp = kwargs.pop('ion_comp', mass.std_ion_comp)
+    colors = {
+        'a': '#388E3C',
+        'b': '#1976D2',
+        'c': '#00796B',
+        'x': '#7B1FA2',
+        'y': '#D32F2F',
+        'z': '#F57C00',
+    }
+    colors.update(kwargs.pop('colors', {}))
+    ftol = kwargs.pop('ftol', None)
+    if ftol is None:
+        rtol = kwargs.pop('rtol', 1e-5)
+    text_kw = kwargs.pop('text_kw', dict(ha='center', clip_on=True, backgroundcolor='#ffffff99'))
+    precursor_charge = kwargs.pop('precursor_charge', None)
+    if precursor_charge is None:
+        precursor_charge = _get_precursor_charge(spectrum)
+    if precursor_charge is None:
+        raise PyteomicsError('Could not extract precursor charge from spectrum. Please specify `precursor_charge` kwarg.')
+    maxcharge = kwargs.pop('maxcharge', max(1, precursor_charge - 1))
+    ax = kwargs.get('ax', None)
+    # end of common kwargs
+
+    # backend-specific kwargs
+    centroided = kwargs.pop('centroided', True)
+    adjust = kwargs.pop('adjust_text', None)
+    if adjust or adjust is None:
+        try:
+            from adjustText import adjust_text
+            adjust_kw = kwargs.pop('adjust_kw', dict(
+                only_move={'text': 'y', 'points': 'y', 'objects': 'y'}, autoalign=False, force_text=(1, 1)))
+        except ImportError:
+            if adjust:
+                raise PyteomicsError('Install adjustText for text adjustment')
+            adjust = False
+        else:
+            if adjust is None:
+                adjust = True
+    # end of backend-specific kwargs
+
+    parsed = parser.parse(peptide, True, labels=list(aa_mass) + [parser.std_cterm, parser.std_nterm])
+    n = len(parsed)
+    maxpeak = spectrum['intensity array'].max()
+    mz, names = {}, {}
+    for ion in types:
+        for charge in range(1, maxcharge + 1):
+            if ion[0] in 'abc':
+                for i in range(2, n):
+                    mz.setdefault(ion, []).append(mass.fast_mass2(parsed[:i] + [parser.std_cterm],
+                        aa_mass=aa_mass, charge=charge, ion_type=ion, mass_data=mass_data, ion_comp=ion_comp))
+                    names.setdefault(ion, []).append(ion[0] + str(i - 1) + ion[1:])
+            else:
+                for i in range(1, n - 1):
+                    mz.setdefault(ion, []).append(mass.fast_mass2([parser.std_nterm] + parsed[n - (i + 1):],
+                        aa_mass=aa_mass, charge=charge, ion_type=ion, mass_data=mass_data, ion_comp=ion_comp))
+                    names.setdefault(ion, []).append(ion[0] + str(i) + ion[1:])
+    texts = []
+    for ion in types:
+        c = colors.get(ion, colors.get(ion[0], 'blue'))
+        matrix = np.abs(spectrum['m/z array'] - np.array(mz[ion]).reshape(-1, 1))
+        if ftol is not None:
+            match = np.where(matrix < ftol)
+        else:
+            match = np.where(matrix / spectrum['m/z array'] < rtol)
+        pseudo_spec = {'m/z array': spectrum['m/z array'][match[1]], 'intensity array': spectrum['intensity array'][match[1]]}
+        plot_spectrum(pseudo_spec, centroided=True, edgecolor=c, ax=ax)
+        for j, i in zip(*match):
+            x = spectrum['m/z array'][i]
+            y = spectrum['intensity array'][i] + maxpeak * 0.02
+            name = names[ion][j]
+            texts.append(pylab.text(x, y, name, color=c, **text_kw))
+    if adjust:
+        adjust_text(texts, **adjust_kw)
+    kwargs.setdefault('zorder', -1)
+    return plot_spectrum(spectrum, *args, centroided=centroided, **kwargs)
+
+
+def _get_precursor_charge(spectrum):
+    try:
+        return mgf.MGFBase.parse_precursor_charge(spectrum['params']['charge'], list_only=True)[0]
+    except (PyteomicsError, KeyError):
+        pass
+    try:
+        return int(spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state'])
+    except KeyError:
+        pass
+    return None
+
+
+def _get_precursor_mz(spectrum):
+    try:
+        return spectrum['params']['pepmass'][0]
+    except KeyError:
+        pass
+    try:
+        return spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']
+    except KeyError:
+        pass
+    if 'attributes' in spectrum:
+        for attr in spectrum['attributes']:
+            if attr in {"MS:1000827", "MS:1000744", "MS:1002234"}:
+                return spectrum['attributes'][attr]
+    return None
+
+
+def _spectrum_utils_create_spectrum(spectrum, *args, **kwargs):
+    if sus is None:
+        raise PyteomicsError('This backend requires `spectrum_utils>=0.4`.')
+
+    # backend-specific parameters
+    mz_range = kwargs.pop('mz_range', None)
+
+    min_intensity = kwargs.pop('min_intensity', 0.0)
+    max_num_peaks = kwargs.pop('max_num_peaks', None)
+    scaling = kwargs.pop('scaling', None)
+    max_intensity = kwargs.pop('max_intensity', None)
+    spectrum = sus.MsmsSpectrum(
+        'None', kwargs.pop('precursor_mz', None), kwargs.pop('precursor_charge', None),
+        spectrum['m/z array'], spectrum['intensity array'])
+    if mz_range:
+        spectrum = spectrum.set_mz_range(*mz_range)
+
+    spectrum = spectrum.filter_intensity(min_intensity=min_intensity, max_num_peaks=max_num_peaks
+        ).scale_intensity(scaling, max_intensity)
+    return spectrum
+
+
+def _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs):
+
+    # common kwargs
+    aa_mass = kwargs.pop('aa_mass', mass.std_aa_mass)
+    types = kwargs.pop('ion_types', ('b', 'y'))
+    tol = kwargs.pop('ftol', None)
+    if tol is None:
+        tol = kwargs.pop('rtol', 1e-5) * 1e6
+        tol_mode = 'ppm'
+    else:
+        tol_mode = 'Da'
+
+    # kwargs.pop('text_kw', None)  # not used
+
+    precursor_charge = kwargs.pop('precursor_charge', None)
+    if precursor_charge is None:
+        precursor_charge = _get_precursor_charge(spectrum)
+    if precursor_charge is None:
+        raise PyteomicsError('Could not extract precursor charge from spectrum. '
+            'Please specify `precursor_charge` keyword argument.')
+
+    maxcharge = kwargs.pop('maxcharge', max(1, precursor_charge - 1))
+    # end of common kwargs
+
+    # backend-specific parameters
+    remove_precursor_peak = kwargs.pop('remove_precursor_peak', False)
+
+    # peptide can be modX or proforma. spectrum_utils supports proforma only
+    aa_comp = kwargs.get('aa_comp')
+    mod_names = kwargs.get('mod_names')
+    prefix = kwargs.get('prefix')
+
+    try:
+        parsed_proforma = proforma.ProForma.parse(peptide)
+        peptide_pro = peptide
+    except Exception:
+        parsed_proforma = None
+        try:
+            peptide_pro = parser.to_proforma(peptide, aa_mass=aa_mass, aa_comp=aa_comp, mod_names=mod_names, prefix=prefix)
+        except Exception:
+            raise PyteomicsError("Cannot parse {} as ProForma or convert from modX".format(peptide))
+
+    precursor_mz = kwargs.pop('precursor_mz', None)
+    if precursor_mz is None:
+        precursor_mz = _get_precursor_mz(spectrum)
+    if precursor_mz is None:
+        try:
+            if aa_comp:
+                precursor_mz = mass.calculate_mass(peptide, aa_comp=aa_comp, charge=precursor_charge)
+            elif not parsed_proforma:
+                precursor_mz = mass.fast_mass2(peptide, aa_mass=aa_mass, charge=precursor_charge)
+            else:
+                precursor_mz = mass.mass_charge_ratio(parsed_proforma.mass, precursor_charge)
+        except PyteomicsError:
+            raise PyteomicsError('Cannot obtain precursor m/z, please specify `precursor_mz` argument.')
+
+    spectrum = _spectrum_utils_create_spectrum(spectrum, *args,
+        precursor_mz=precursor_mz, precursor_charge=precursor_charge, **kwargs)
+    if remove_precursor_peak:
+        spectrum = spectrum.remove_precursor_peak(tol, tol_mode)
+    spectrum = spectrum.annotate_proforma(peptide_pro, tol, tol_mode, types, maxcharge)
+
+    return spectrum
+
+
+class SpectrumUtilsColorScheme:
+    """Context manager that temporarily changes `spectrum_utils.plot.colors`."""
+    def __init__(self, colors):
+        self.colors = colors
+        self.previous_colors = sup.colors.copy()
+
+    def __enter__(self):
+        if self.colors:
+            sup.colors.update(self.colors)
+
+    def __exit__(self, *args, **kwargs):
+        sup.colors = self.previous_colors
+
+
+def _spectrum_utils_annotate_plot(spectrum, peptide, *args, **kwargs):
+
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        spectrum = _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs)
+        return sup.spectrum(spectrum, annot_kws=kwargs.pop('text_kw', None), ax=kwargs.pop('ax', None))
+
+
+def _spectrum_utils_annotate_iplot(spectrum, peptide, *args, **kwargs):
+    import spectrum_utils.iplot as supi
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        spectrum = _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs)
+        return supi.spectrum(spectrum, annot_kws=kwargs.pop('text_kw', None))
+
+
+_annotation_backends = {
+    'default': _default_annotate_spectrum,
+    'spectrum_utils': _spectrum_utils_annotate_plot,
+    'spectrum_utils.iplot': _spectrum_utils_annotate_iplot,
+}
+
+
+def annotate_spectrum(spectrum, peptide, *args, **kwargs):
+    """Plot a spectrum and annotate matching fragment peaks.
+
+    Parameters
+    ----------
+    spectrum : dict
+        A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys.
+    peptide : str
+        A modX sequence.
+    backend : str, keyword only, optional
+        One of `{'default', 'spectrum_utils', 'spectrum_utils.iplot'}`.
+        The `spectrum_utils` backend requires installing :py:mod:`spectrum_utils`.
+        The `spectrum_utils.iplot` backend requires installing :py:mod:`spectrum_utils[iplot]`.
+    ion_types : Container, keyword only, optional
+        Ion types to be considered for annotation. Default is `('b', 'y')`.
+    precursor_charge : int, keyword only, optional
+        If not specified, an attempt is made to extract it from `spectrum`.
+    maxcharge : int, keyword only, optional
+        Maximum charge state for fragment ions to be considered. Default is `max(1, precursor_charge - 1)`.
+    colors : dict, keyword only, optional
+        Keys are ion types, values are colors to plot the annotated peaks with. Default depends on backend.
+    ftol : float, keyword only, optional
+        A fixed m/z tolerance value for peak matching. Alternative to `rtol`.
+    rtol : float, keyword only, optional
+        A relative m/z error for peak matching. Default is 10 ppm.
+    aa_mass : dict, keyword only, optional
+        A dictionary of amino acid residue masses.
+    text_kw : dict, keyword only, optional
+        Keyword arguments for :py:func:`pylab.text`.
+    xlabel : str, keyword only, optional
+        Label for the X axis. Default is "m/z". Does not work with `spectrum_utils.iplot` backend.
+    ylabel : str, keyword only, optional
+        Label for the Y axis. Default is "intensity". Does not work with `spectrum_utils.iplot` backend.
+    title : str, keyword only, optional
+        The title. Empty by default. Does not work with `spectrum_utils.iplot` backend.
+    ax : matplotlib.pyplot.Axes, keyword only, optional
+        Axes to draw the spectrum. Does not work with `spectrum_utils.iplot` backend.
+
+    *args
+        Passed to the plotting backend.
+    **kwargs
+        Passed to the plotting backend.
+
+    centroided : bool, keyword only, optional
+        Passed to :py:func:`plot_spectrum`. Only works with `default` backend.
+    ion_comp : dict, keyword only, optional
+        A dictionary defining ion compositions to override :py:const:`pyteomics.mass.std_ion_comp`.
+        Only works with `default` backend.
+    mass_data : dict, keyword only, optional
+        A dictionary of element masses to override :py:const:`pyteomics.mass.nist_mass`.
+        Only works with `default` backend.
+
+    adjust_text : bool, keyword only, optional
+        Adjust the overlapping text annotations using :py:mod:`adjustText`. Only works with `default` backend.
+    adjust_kw : dict, keyword only, optional
+        Keyword arguments for :py:func:`adjust_text`. Only works with `default` backend.
+
+    remove_precursor_peak : bool, keyword only, optional
+        Remove precursor peak from spectrum before annotation. Default is :py:const:`False`.
+        Only works with `spectrum_utils` backend.
+    min_intensity : float, keyword only, optional
+        Remove low-intensity peaks; this is a factor of maximum peak intensity. Default is 0 (no filtering).
+        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    max_num_peaks : int or None, keyword only, optional
+        Remove low-intensity peaks; this is the number of peaks to keep. Default is :py:const:`None` (no filtering).
+        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    scaling : one of `{'root', 'log', 'rank'}` or None, keyword only, optional
+        Scaling to apply to peak intensities. Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    max_intensity : float or None, keyword only, optional
+        Intensity of the most intense peak relative to which the peaks will be scaled
+        (the default is :py:const:`None`, which means that no scaling
+        relative to the most intense peak will be performed).
+        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
+    aa_comp : dict, keyword only, optional
+        Amino acid compositions, including modified ones. If given, will be used for conversion from *modX* to ProForma.
+    mod_names : dict or callable, keyword only, optional
+        If given, will be used for conversion from *modX* to ProForma.
+    prefix : str, keyword only, optional
+        If given, will be used for conversion from *modX* to ProForma.
+
+    Returns
+    -------
+    out : matplotlib.pyplot.Axes
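+
+    Examples
+    --------
+    A sketch using the default backend; `spectrum` and the modX sequence here
+    are hypothetical::
+
+        >>> annotate_spectrum(spectrum, 'PEPTIDE', precursor_charge=2,
+        ...     ion_types=('b', 'y'), ftol=0.02, title='Annotated spectrum')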
+    """
+    bname = kwargs.pop('backend', 'default')
+    backend = _annotation_backends.get(bname)
+    if backend is None:
+        raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format(
+            bname, '; '.join(_annotation_backends)))
+
+    pylab.xlabel(kwargs.pop('xlabel', 'm/z'))
+    pylab.ylabel(kwargs.pop('ylabel', 'intensity'))
+    pylab.title(kwargs.pop('title', ''))
+    return backend(spectrum, peptide, *args, **kwargs)
+
+
+def _spectrum_utils_mirror(spec_top, spec_bottom, spectrum_kws=None, ax=None, **kwargs):
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        ax = sup.mirror(spec_top, spec_bottom, spectrum_kws=spectrum_kws, ax=ax)
+        ax.set_xlabel(kwargs.pop('xlabel', 'm/z'))
+        ax.set_ylabel(kwargs.pop('ylabel', 'intensity'))
+        ax.set_title(kwargs.pop('title', ''))
+        return ax
+
+
+def _spectrum_utils_iplot_mirror(spec_top, spec_bottom, spectrum_kws=None, **kwargs):
+    import spectrum_utils.iplot as supi
+    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
+        return supi.mirror(spec_top, spec_bottom, spectrum_kws=spectrum_kws)
+
+
+_mirror_backends = {
+    'spectrum_utils': _spectrum_utils_mirror,
+    'spectrum_utils.iplot': _spectrum_utils_iplot_mirror,
+}
+
+
+def mirror(spec_top, spec_bottom, peptide=None, spectrum_kws=None, ax=None, **kwargs):
+    """Create a mirror plot of two (possible annotated) spectra using `spectrum_utils`.
+
+    Parameters
+    ----------
+    spec_top : dict
+        A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys.
+    spec_bottom : dict
+        A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys.
+    peptide : str or None, optional
+        A modX sequence or ProForma. If provided, the peaks will be annotated as peptide fragments.
+    spectrum_kws : dict or None, optional
+        Passed to :py:func:`spectrum_utils.plot.mirror`.
+    backend : str, keyword only, optional
+        One of {'spectrum_utils', 'spectrum_utils.iplot'}. Default is 'spectrum_utils'.
+
+        .. note ::
+            Requires :py:mod:`spectrum_utils` or :py:mod:`spectrum_utils[iplot]`, respectively.
+
+    ax : matplotlib.pyplot.Axes or None, optional
+        Passed to :py:func:`spectrum_utils.plot.mirror`. Works only for the 'spectrum_utils' backend.
+    xlabel : str, keyword only, optional
+        Label for the X axis. Default is "m/z". Works only for the 'spectrum_utils' backend.
+    ylabel : str, keyword only, optional
+        Label for the Y axis. Default is "intensity". Works only for the 'spectrum_utils' backend.
+    title : str, keyword only, optional
+        The title. Empty by default. Works only for the 'spectrum_utils' backend.
+
+    **kwargs : same as for :py:func:`annotate_spectrum` for `spectrum_utils` backends.
+
+    Returns
+    -------
+    out : matplotlib.pyplot.Axes
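+
+    Examples
+    --------
+    A sketch; `spec_top` and `spec_bottom` here are hypothetical spectrum
+    dictionaries annotated with the same peptide::
+
+        >>> mirror(spec_top, spec_bottom, peptide='PEPTIDE', precursor_charge=2)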
+    """
+
+    spec_gen = _spectrum_utils_create_spectrum if peptide is None else _spectrum_utils_annotate_spectrum
+    spec_top = spec_gen(spec_top, peptide, **kwargs)
+    spec_bottom = spec_gen(spec_bottom, peptide, **kwargs)
+
+    bname = kwargs.pop('backend', 'spectrum_utils')
+    backend = _mirror_backends.get(bname)
+    if backend is None:
+        raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format(
+            bname, '; '.join(_mirror_backends)))
+    backend_kw = {'spectrum_kws': spectrum_kws}
+    if bname == 'spectrum_utils':
+        backend_kw['ax'] = ax
+    backend_kw.update(kwargs)
+    return backend(spec_top, spec_bottom, **backend_kw)
diff --git a/pyteomics/tandem.py b/pyteomics/tandem.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba08b43f84799fe4ef04735966e26246c0cd34d7
--- /dev/null
+++ b/pyteomics/tandem.py
@@ -0,0 +1,384 @@
+"""
+tandem - X!Tandem output file reader
+====================================
+
+Summary
+-------
+
+`X!Tandem <http://thegpm.org/tandem/>`_ is an open-source proteomic search
+engine with a very simple yet sophisticated application programming interface
+(API): it takes an XML file of instructions on its command line and writes
+the results to an XML file specified in that input file. The output format is
+described `here (PDF) <http://www.thegpm.org/docs/X_series_output_form.pdf>`_.
+
+This module provides a minimalistic way to extract information from X!Tandem
+output files. You can use the old functional interface (:py:func:`read`) or the
+new object-oriented interface (:py:class:`TandemXML`) to iterate over entries in
+`<group>` elements, i.e. identifications for a certain spectrum.
+
+Data access
+-----------
+
+  :py:class:`TandemXML` - a class representing a single X!Tandem output file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through peptide-spectrum matches in an X!Tandem
+  output file. Data from a single PSM are converted to a human-readable dict.
+
+  :py:func:`chain` - read multiple files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+  :py:func:`DataFrame` - read X!Tandem output files into a :py:class:`pandas.DataFrame`.
+
+Target-decoy approach
+---------------------
+
+  :py:func:`filter` - iterate through peptide-spectrum matches in a chain of
+  X!Tandem output files, yielding only top PSMs and keeping false discovery rate
+  (FDR) at the desired level. The FDR is estimated using the target-decoy
+  approach (TDA).
+
+  :py:func:`filter.chain` - chain a series of filters applied independently to
+  several files.
+
+  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
+  independently to an iterable of files.
+
+  :py:func:`filter_df` - filter X!Tandem output files and return a :py:class:`pandas.DataFrame`.
+
+
+  :py:func:`is_decoy` - determine if a PSM is from the decoy database.
+
+  :py:func:`fdr` - estimate the FDR in a data set using TDA.
+
+  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
+  set using the target-decoy approach.
+
+Deprecated functions
+--------------------
+
+  :py:func:`iterfind` - iterate over elements in an X!Tandem file.
+  You can just call the corresponding method of the :py:class:`TandemXML`
+  object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml` and :py:mod:`numpy`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import operator
+from . import xml, auxiliary as aux, _schema_defaults
+
+
+class TandemXML(xml.XML):
+    """Parser class for TandemXML files."""
+    file_format = "TandemXML"
+    _root_element = "bioml"
+    _default_schema = _schema_defaults._tandem_schema_defaults
+    _default_iter_path = 'group[@type="model"]'
+    _structures_to_flatten = {'domain'}
+
+    def __init__(self, *args, **kwargs):
+        if 'recursive' not in kwargs:
+            super(TandemXML, self).__init__(*args, recursive=True, **kwargs)
+        else:
+            super(TandemXML, self).__init__(*args, **kwargs)
+
+    __init__.__doc__ = xml.XML.__init__.__doc__
+
+    def _get_info_smart(self, element, **kw):
+        info = self._get_info(element, **kw)
+        # handy simplifications below
+        if isinstance(info.get('note'), list) and len(info['note']) == 1 and set(info['note'][0]) == {'label', 'note'}:
+            info['note'] = info['note'][0]['note']
+        if 'protein' in info and 'label' in info:
+            del info['label']
+        if 'group' in info:
+            for g in info['group']:
+                label = g.pop('label')
+                type_ = g.pop('type')
+                info.setdefault(type_, {})[label] = g
+            del info['group']
+        if 'trace' in info:
+            for t in info['trace']:
+                info[t.pop('type')] = t
+            del info['trace']
+        if isinstance(info.get('values'), dict):
+            info['values'] = info['values']['values']
+        if isinstance(info.get('attribute'), list):
+            for a in info.pop('attribute'):
+                info[a['type']] = float(a['attribute'])
+        if 'support' in info:
+            for d in info['support'].get('supporting data', {}).values():
+                for label in ['Xdata', 'Ydata']:
+                    d[label]['values'] = d[label]['values'].astype(int)
+                    del d[label]['label']
+            if 'fragment ion mass spectrum' in info['support']:
+                fims = info['support']['fragment ion mass spectrum']
+                fims.update(fims.pop('tandem mass spectrum'))
+                for label in ['Xdata', 'Ydata']:
+                    del info['support']['fragment ion mass spectrum'][label]['label']
+        if 'charge' in info:
+            info['charge'] = int(info['charge'])
+        if info.get('rt') == '':
+            info['rt'] = None
+
+        return info
+
+    def _get_schema_info(self, read_schema):
+        return self._default_schema
+
+    def __next__(self):
+        n = super(TandemXML, self).__next__()
+        del n['type']
+        return n
+
+    next = __next__
+
+
+def read(source, iterative=True, **kwargs):
+    """Parse `source` and iterate through peptide-spectrum matches.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target X!Tandem output file or the file object itself.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+       An iterator over dicts with PSM properties.
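+
+    Examples
+    --------
+    A sketch; the output file name is hypothetical::
+
+        >>> with read('results.t.xml') as reader:
+        ...     for psm in reader:
+        ...         print(psm['expect'])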
+    """
+    kwargs.setdefault('read_schema', False)
+    kwargs.setdefault('recursive', True)
+    return TandemXML(source, iterative=iterative, **kwargs)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create a :py:class:`TandemXML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return TandemXML(source, **kwargs).iterfind(path, **kwargs)
+
+
+# chain = aux._make_chain(read, 'read')
+chain = aux.ChainBase._make_chain(TandemXML)
+
+
+def _is_decoy_prefix(psm, prefix='DECOY_'):
+    """Given a PSM dict, return :py:const:`True` if all protein names for
+    the PSM start with `prefix`, and :py:const:`False` otherwise.
+
+    Parameters
+    ----------
+    psm : dict
+        A dict, as yielded by :py:func:`read`.
+    prefix : str, optional
+        A prefix used to mark decoy proteins. Default is `'DECOY_'`.
+
+    Returns
+    -------
+    out : bool
+    """
+    return all(prot['label'].startswith(prefix) for prot in psm['protein'])
+
+
+def _is_decoy_suffix(psm, suffix='_DECOY'):
+    """Given a PSM dict, return :py:const:`True` if all protein names for
+    the PSM end with `suffix`, and :py:const:`False` otherwise.
+
+    Parameters
+    ----------
+    psm : dict
+        A dict, as yielded by :py:func:`read`.
+    suffix : str, optional
+        A suffix used to mark decoy proteins. Default is `'_DECOY'`.
+
+    Returns
+    -------
+    out : bool
+    """
+    return all(prot['label'].endswith(suffix) for prot in psm['protein'])
+
+
+is_decoy = _is_decoy_prefix
+qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, operator.itemgetter('expect'))
+filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, operator.itemgetter('expect'), qvalues)
+fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix)
+filter.chain = aux._make_chain(filter, 'filter', True)
+
+
+def DataFrame(*args, **kwargs):
+    """Read X!Tandem output files into a :py:class:`pandas.DataFrame`.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+
+    sep : str or None, optional
+        Some values related to PSMs (such as protein information) are variable-length
+        lists. If `sep` is a :py:class:`str`, they will be packed into a single string using
+        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
+        :py:const:`None`.
+
+    pd_kwargs : dict, optional
+        Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor.
+
+    *args
+        Passed to :py:func:`chain`.
+
+    **kwargs
+        Passed to :py:func:`chain`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
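+
+    Examples
+    --------
+    A sketch; the output file name is hypothetical::
+
+        >>> df = DataFrame('results.t.xml', sep=';')
+        >>> df[['peptide_id', 'protein_label', 'expect']].head()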
+    """
+    import pandas as pd
+    data = []
+    prot_keys = ['id', 'uid', 'label', 'expect']
+    pep_keys = ['id', 'pre', 'post', 'start', 'end']
+    sep = kwargs.pop('sep', None)
+    pd_kwargs = kwargs.pop('pd_kwargs', {})
+    with chain(*args, **kwargs) as f:
+        for item in f:
+            info = {}
+            for k, v in item.items():
+                if isinstance(v, (str, int, float)):
+                    info[k] = v
+            protein = item['protein'][0]
+
+            for key in prot_keys:
+                vals = [prot.get(key) for prot in item['protein']]
+                if sep is not None:
+                    vals = sep.join(str(val) if val is not None else '' for val in vals)
+                info['protein_' + key] = vals
+            for key in pep_keys:
+                vals = [prot['peptide'].get(key) for prot in item['protein']]
+                if sep is not None:
+                    vals = sep.join(str(val) if val is not None else '' for val in vals)
+                info['peptide_' + key] = vals
+            aa = protein['peptide'].pop('aa', [])
+            info['modifications'] = ','.join('{0[modified]:.3f}@{0[type]}'.format(x) for x in aa)
+            for k in prot_keys:
+                protein.pop(k, None)
+            for k in pep_keys:
+                protein['peptide'].pop(k, None)
+            info.update(protein['peptide'])
+            fims = item['support']['fragment ion mass spectrum']
+            try:
+                info['scan'] = fims['note']
+            except KeyError:
+                info['scan'] = fims['id']
+            data.append(info)
+    return pd.DataFrame(data, **pd_kwargs)
+
+
+def filter_df(*args, **kwargs):
+    """Read X!Tandem output files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
+    Positional arguments can be X!Tandem output files or DataFrames.
+
+    Requires :py:mod:`pandas`.
+
+    Parameters
+    ----------
+    key : str / iterable / callable, optional
+        Default is 'expect'.
+    is_decoy : str / iterable / callable, optional
+        Default is to check whether all strings in the "protein_label" column start with `'DECOY_'`.
+    *args
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+    **kwargs
+        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
+
+    Returns
+    -------
+    out : pandas.DataFrame
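+
+    Examples
+    --------
+    A sketch filtering to 1% FDR; the output file name is hypothetical::
+
+        >>> flt = filter_df('results.t.xml', fdr=0.01)
+        >>> flt.shape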
+    """
+    import pandas as pd
+    sep = kwargs.get('sep')
+    kwargs.setdefault('key', 'expect')
+    if all(isinstance(arg, pd.DataFrame) for arg in args):
+        if len(args) > 1:
+            df = pd.concat(args)
+        else:
+            df = args[0]
+    else:
+        read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs}
+        df = DataFrame(*args, **read_kw)
+
+    if 'is_decoy' not in kwargs:
+        if sep is not None:
+            if 'decoy_suffix' in kwargs:
+                kwargs['is_decoy'] = df['protein_label'].str.split(sep).apply(
+                    lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s))
+            else:
+                kwargs['is_decoy'] = df['protein_label'].str.split(sep).apply(
+                    lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s))
+        else:
+            if 'decoy_suffix' in kwargs:
+                kwargs['is_decoy'] = df['protein_label'].apply(
+                    lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s))
+            else:
+                kwargs['is_decoy'] = df['protein_label'].apply(
+                    lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s))
+
+    return aux.filter(df, **kwargs)
diff --git a/pyteomics/traml.py b/pyteomics/traml.py
new file mode 100644
index 0000000000000000000000000000000000000000..66ed27e675eb0cdc8afe873eebc35b9c1efb2ad5
--- /dev/null
+++ b/pyteomics/traml.py
@@ -0,0 +1,235 @@
+"""
+traml - targeted MS transition data in TraML format
+===================================================
+
+Summary
+-------
+
+TraML is a standard rich XML-format for targeted mass spectrometry method definitions.
+Please refer to `psidev.info <http://www.psidev.info/traml>`_
+for the detailed specification of the format and structure of TraML files.
+
+This module provides a minimalistic way to extract information from TraML
+files. You can use the object-oriented interface (:class:`TraML` instances) to
+access target definitions and transitions. :class:`TraML` objects also support
+indexing with entity IDs directly.
+
+Data access
+-----------
+
+  :py:class:`TraML` - a class representing a single TraML file.
+  Other data access functions use this class internally.
+
+  :py:func:`read` - iterate through transitions in TraML format.
+
+  :py:func:`chain` - read multiple TraML files at once.
+
+  :py:func:`chain.from_iterable` - read multiple files at once, using an
+  iterable of files.
+
+Controlled Vocabularies
+~~~~~~~~~~~~~~~~~~~~~~~
+TraML relies on controlled vocabularies to describe its contents extensibly. See
+`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_
+for more details on how they are used.
+
+Handling Time Units and Other Qualified Quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TraML contains information which may be described as using a variety of different time units.
+See `Unit Handling <../data.html#unit-handling>`_ for more information.
+
+Deprecated functions
+--------------------
+
+  :py:func:`version_info` - get version information about the TraML file.
+  You can just read the corresponding attribute of the :py:class:`TraML` object.
+
+  :py:func:`iterfind` - iterate over elements in a TraML file.
+  You can just call the corresponding method of the :py:class:`TraML` object.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml`.
+
+-------------------------------------------------------------------------------
+"""
+
+#   Copyright 2018 Joshua Klein, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+
+import warnings
+from . import xml, _schema_defaults, auxiliary as aux
+
+
+class TraML(xml.MultiProcessingXML, xml.IndexSavingXML):
+    """Parser class for TraML files."""
+    file_format = 'TraML'
+    _root_element = 'TraML'
+    _default_schema = _schema_defaults._traml_schema_defaults
+    _default_version = '1.0.0'
+
+    _default_iter_tag = 'Transition'
+    _indexed_tags = {
+        'Transition',
+        'Peptide',
+        'Compound',
+        'Target',
+        'Protein',
+    }
+
+    _element_handlers = xml.XML._element_handlers.copy()
+    _element_handlers.update({
+        'Modification': xml.XML._promote_empty_parameter_to_name,
+        'Interpretation': xml.XML._promote_empty_parameter_to_name,
+        'Software': xml.XML._promote_empty_parameter_to_name,
+    })
+
+    def __init__(self, *args, **kwargs):
+        kwargs.setdefault('retrieve_refs', True)
+        super(TraML, self).__init__(*args, **kwargs)
+
+    def _get_info_smart(self, element, **kw):
+        kwargs = dict(kw)
+        rec = kwargs.pop('recursive', None)
+        info = self._get_info(
+            element,
+            recursive=(rec if rec is not None else True),
+            **kwargs)
+        return info
+
+    def _retrieve_refs(self, info, **kwargs):
+        """Retrieves and embeds the data for each attribute in `info` that
+        ends in `Ref`. Removes the id attribute from `info`"""
+        for k, v in dict(info).items():
+            if k[-3:] in {'Ref', 'ref'}:
+                if isinstance(v, str):
+                    key = v
+                elif isinstance(v, dict):
+                    key = v['ref']
+                else:
+                    if k != 'ref':
+                        info[k[:-3]] = info.pop(k)
+                    continue
+                try:
+                    by_id = self.get_by_id(key, retrieve_refs=True)
+                except KeyError:
+                    warnings.warn('Ignoring unresolved reference: ' + key)
+                else:
+                    if k == 'ref':
+                        info.update(by_id)
+                    else:
+                        # by_id.pop('id', None)
+                        info[k[:-3]] = by_id
+                        del info[k]
+
+
+def read(source, retrieve_refs=True, read_schema=False, iterative=True, use_index=False, huge_tree=False):
+    """Parse `source` and iterate through transitions.
+
+    Parameters
+    ----------
+    source : str or file
+        A path to a target TraML file or the file object itself.
+
+    retrieve_refs : bool, optional
+        If :py:const:`True`, additional information from references will be
+        automatically added to the results. The file processing time will
+        increase. Default is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the TraML header. Otherwise, use default parameters.
+        Not recommended without an Internet connection or
+        if you want to avoid the related warnings.
+
+    iterative : bool, optional
+        Defines whether iterative parsing should be used. It helps reduce
+        memory usage at almost the same parsing speed. Default is
+        :py:const:`True`.
+
+    use_index : bool, optional
+        Defines whether an index of byte offsets needs to be created for
+        spectrum elements. Default is :py:const:`False`.
+
+    huge_tree : bool, optional
+        This option is passed to the `lxml` parser and defines whether
+        security checks for XML tree depth and node size should be disabled.
+        Default is :py:const:`False`.
+        Enable this option for trusted files to avoid XMLSyntaxError exceptions
+        (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`).
+
+    Returns
+    -------
+    out : TraML
+       A :py:class:`TraML` object, suitable for iteration and possibly random access.
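+
+    Examples
+    --------
+    A sketch; the TraML file name is hypothetical::
+
+        >>> with read('transitions.traML') as reader:
+        ...     for transition in reader:
+        ...         print(transition['id'])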
+    """
+
+    return TraML(source, retrieve_refs=retrieve_refs, read_schema=read_schema, iterative=iterative,
+                 use_index=use_index, huge_tree=huge_tree)
+
+
+def iterfind(source, path, **kwargs):
+    """Parse `source` and yield info on elements with specified local
+    name or by specified "XPath".
+
+    .. note:: This function is provided for backward compatibility only.
+        If you do multiple :py:func:`iterfind` calls on one file, you should
+        create a :py:class:`TraML` object and use its
+        :py:meth:`!iterfind` method.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    path : str
+        Element name or XPath-like expression. Only local names separated
+        with slashes are accepted. An asterisk (`*`) means any element.
+        You can specify a single condition in the end, such as:
+        ``"/path/to/element[some_value>1.5]"``
+        Note: you can do much more powerful filtering using plain Python.
+        The path can be absolute or "free". Please don't specify
+        namespaces.
+
+    recursive : bool, optional
+        If :py:const:`False`, subelements will not be processed when
+        extracting info from elements. Default is :py:const:`True`.
+
+    iterative : bool, optional
+        Specifies whether iterative XML parsing should be used. Iterative
+        parsing significantly reduces memory usage and may be just a little
+        slower. When `retrieve_refs` is :py:const:`True`, however, it is
+        highly recommended to disable iterative parsing if possible.
+        Default value is :py:const:`True`.
+
+    read_schema : bool, optional
+        If :py:const:`True`, attempt to extract information from the XML schema
+        mentioned in the TraML header. Otherwise, use default
+        parameters. Not recommended without an Internet connection or
+        if you want to avoid the related warnings.
+
+    Returns
+    -------
+    out : iterator
+    """
+    return TraML(source, **kwargs).iterfind(path, **kwargs)
+
+
+version_info = xml._make_version_info(TraML)
+
+chain = aux.ChainBase._make_chain(TraML)
diff --git a/pyteomics/usi.py b/pyteomics/usi.py
new file mode 100644
index 0000000000000000000000000000000000000000..57a265b09f1d665f063d1144c979d5c38004995e
--- /dev/null
+++ b/pyteomics/usi.py
@@ -0,0 +1,527 @@
+"""
+usi - Universal Spectrum Identifier (USI) parser and minimal PROXI client
+=========================================================================
+
+Summary
+-------
+`USI <http://www.psidev.info/usi>`_ is a standardized method of referencing a specific
+spectrum in a dataset, possibly attached to an interpretation. This module includes a
+:class:`USI` type which can represent these constructs, :meth:`~USI.parse` them and
+reconstruct them.
+
+One use-case for USI is to request spectrum information from a `PROXI <http://www.psidev.info/proxi>`_
+service host. PROXI services are available from several of the major national proteomics data hosts,
+including MassIVE, PeptideAtlas, PRIDE, and jPOST.
+
+.. seealso::
+   LeDuc, Richard D., Eric W. Deutsch, Pierre-Alain Binz, Ryan T. Fellers, Anthony J. Cesnik,
+   Joshua A. Klein, Tim Van Den Bossche, et al.
+   "Proteomics Standards Initiative's ProForma 2.0: Unifying the Encoding of Proteoforms and Peptidoforms."
+   ArXiv:2109.11352 [q-Bio], September 23, 2021. http://arxiv.org/abs/2109.11352.
+
+Data access
+-----------
+
+  :py:class:`USI` for representing Universal Spectrum Identifiers. Call :meth:`USI.parse` to parse a USI
+  string.
+
+  :py:func:`proxi` to request the spectrum referenced by a USI from a remote service. Provides access to the PeptideAtlas, MassIVE,
+  PRIDE and jPOST hosts.
+
+"""
+import json
+import warnings
+import threading
+import multiprocessing
+
+from collections import namedtuple, defaultdict
+
+try:
+    from multiprocessing.dummy import Pool as ThreadPool
+except ImportError:
+    ThreadPool = None
+
+try:
+    from urllib2 import Request, urlopen
+except ImportError:
+    from urllib.request import Request, urlopen
+
+try:
+    import numpy as np
+
+    def coerce_array(array_data):
+        return np.array([float(v) for v in array_data])
+
+except ImportError:
+
+    def coerce_array(array_data):
+        return [float(v) for v in array_data]
+
+from .auxiliary import PyteomicsError
+
+
+class USI(namedtuple("USI", ['protocol', 'dataset', 'datafile', 'scan_identifier_type', 'scan_identifier', 'interpretation'])):
+    '''Represent a Universal Spectrum Identifier (USI).
+
+    .. note::
+        This implementation will capture the interpretation component but will not interpret it at this time.
+
+    Attributes
+    ----------
+    protocol: str
+        The protocol to use to access the data (usually mzspec)
+    dataset: str
+        The name or accession number for the dataset the spectrum residues in
+    datafile: str
+        The basename of the data file from :attr:`dataset` to retrieve the spectrum from
+    scan_identifier_type: str
+        The format of the scan identifier, one of (scan, index, nativeId, trace)
+    scan_identifier: str
+        A value, usually numerical but potentially comma-separated, encoded as a string, which uniquely
+        identifies the spectrum to be recovered from :attr:`datafile` in :attr:`dataset`.
+    interpretation: str
+        The trailing material of the USI, such as the ProForma peptide sequence and charge
+    '''
+    def __str__(self):
+        return ':'.join(filter(lambda x: x is not None, self))
+
+    @classmethod
+    def parse(cls, usi):
+        '''Parse a USI string into a :class:`USI` object.
+
+        Parameters
+        ----------
+        usi: str
+            The USI string to parse
+
+        Returns
+        -------
+        USI
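+
+        Examples
+        --------
+        Parsing the example USI from the specification::
+
+            >>> usi = USI.parse('mzspec:PXD000561:Adult_Frontalcortex_bRP_Elite_85_f09:scan:17555')
+            >>> usi.dataset
+            'PXD000561'
+            >>> usi.scan_identifier
+            '17555'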
+        '''
+        return cls(*_usi_parser(str(usi)))
+
+
+def cast_numeric(value):
+    try:
+        return int(value)
+    except ValueError:
+        pass
+    try:
+        return float(value)
+    except ValueError:
+        return value
+
+
+def _usi_parser(usi):
+    tokens = usi.split(":", 5)
+    protocol = tokens[0]
+    dataset = tokens[1]
+    datafile = tokens[2]
+    scan_identifier_type = tokens[3]
+    scan_identifier = tokens[4]
+    try:
+        interpretation = tokens[5]
+    except IndexError:
+        interpretation = None
+    return (protocol, dataset, datafile, scan_identifier_type, scan_identifier, interpretation)
+
+
+class _PROXIBackend(object):
+    '''A base class for all PROXI backends to implement the gory details of HTTP requests
+    and protocol parsing.
+
+    If special processing needs to be done to interpret the spectrum returned from the service
+    provider, override the :meth:`_coerce` method.
+
+    If extra information needs to be provided to the service provider for them to fulfill the
+    request not passed through the URL, override the :meth:`_request` method.
+
+    Attributes
+    ----------
+    name: str
+        The name of the backend service
+    url_template: str
+        The URL with {} fields to populate with the USI and any other relevant options, like protocol version
+        or the like.
+    options: dict
+        Additional options to be used when preparing the request URL.
+    '''
+    def __init__(self, name, url_template, **kwargs):
+        kwargs.setdefault('version', '0.1')
+        self.name = name
+        self.url_template = url_template
+        self.options = kwargs
+
+    def __repr__(self):
+        return "{self.__class__.__name__}({self.options})".format(self=self)
+
+    def _request(self, usi):
+        url = self.url_template.format(usi=usi, **self.options)
+        req = Request(url)
+        response = urlopen(req)
+        if response.getcode() != 200:
+            raise ValueError("PROXI Service Response Code %r" % (response.getcode()))
+        data = response.read().decode("utf-8")
+        data = json.loads(data)
+        return data
+
+    def get(self, usi):
+        '''Retrieve a ``USI`` from the host PROXI service over the network.
+
+        Parameters
+        ----------
+        usi : str or :class:`USI`
+            The universal spectrum identifier to retrieve.
+
+        Returns
+        -------
+        dict:
+            The spectrum as represented by the requested PROXI host.
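+
+        Examples
+        --------
+        A sketch using one of the concrete backends; this performs a real
+        network request, and the USI is the example from the specification::
+
+            >>> spectrum = PRIDEBackend().get(
+            ...     'mzspec:PXD000561:Adult_Frontalcortex_bRP_Elite_85_f09:scan:17555')
+            >>> spectrum['m/z array'][:3]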
+        '''
+        data = self._request(usi)
+        result = self._coerce(data)
+        return result
+
+    def _coerce(self, data):
+        '''Override and extend this method to change how the spectrum information is refined.
+
+        This implementation just deals with properly formatting the peak arrays and doing minor
+        cosmetic name normalization.
+
+        Parameters
+        ----------
+        data: dict
+            The raw mzSpecML representation parsed from JSON
+
+        Returns
+        -------
+        dict:
+            The coerced spectrum data of appropriate types
+        '''
+        if isinstance(data, list):
+            data_collection = data
+            data = data_collection[0]
+        result = {}
+        result['attributes'] = data.pop('attributes', [])
+        for attrib in result['attributes']:
+            # the [:1] slice avoids an IndexError for empty value strings
+            if 'value' in attrib and isinstance(attrib['value'], str) and attrib['value'][:1].isdigit():
+                try:
+                    attrib['value'] = cast_numeric(attrib['value'])
+                except TypeError:
+                    continue
+        result['m/z array'] = coerce_array(data.pop('mzs', []))
+        result['intensity array'] = coerce_array(data.pop('intensities', []))
+        for key, value in data.items():
+            if key in result:
+                raise ValueError(
+                    "Attempting to set explicit value for {key!r}".format(key=key))
+            result[key] = value
+        return result
+
+    def __call__(self, usi):
+        return self.get(usi)
+
+
+class PeptideAtlasBackend(_PROXIBackend):
+    _url_template = "http://www.peptideatlas.org/api/proxi/v{version}/spectra?resultType=full&usi={usi!s}"
+
+    def __init__(self, **kwargs):
+
+        super(PeptideAtlasBackend, self).__init__(
+            'PeptideAtlas', self._url_template, **kwargs)
+
+
+class MassIVEBackend(_PROXIBackend):
+
+    _url_template = "http://massive.ucsd.edu/ProteoSAFe/proxi/v{version}/spectra?resultType=full&usi={usi}"
+
+    def __init__(self, **kwargs):
+        super(MassIVEBackend, self).__init__(
+            'MassIVE', self._url_template, **kwargs)
+
+
+class PRIDEBackend(_PROXIBackend):
+    _url_template = "http://wwwdev.ebi.ac.uk/pride/proxi/archive/v{version}/spectra?resultType=full&usi={usi}"
+
+    def __init__(self, **kwargs):
+        super(PRIDEBackend, self).__init__(
+            'PRIDE', self._url_template, **kwargs)
+
+
+class JPOSTBackend(_PROXIBackend):
+    _url_template = 'https://repository.jpostdb.org/proxi/spectra?resultType=full&usi={usi}'
+
+    def __init__(self, **kwargs):
+        # the jPOST URL template has no {version} field
+        kwargs.pop("version", None)
+        super(JPOSTBackend, self).__init__('jPOST', self._url_template, **kwargs)
+
+
+class ProteomeExchangeBackend(_PROXIBackend):
+    _url_template = 'http://proteomecentral.proteomexchange.org/api/proxi/v{version}/spectra?resultType=full&usi={usi!s}'
+
+    def __init__(self, **kwargs):
+
+        super(ProteomeExchangeBackend, self).__init__(
+            'ProteomeExchange', self._url_template, **kwargs)
+
+
+class PROXIAggregator(object):
+    '''Aggregate requests across multiple PROXI servers.
+
+    Will attempt to coalesce responses from responding servers into a single spectrum
+    representation.
+
+    Attributes
+    ----------
+    backends : :class:`dict` mapping :class:`str` to :class:`_PROXIBackend`
+        The backend servers to query. Defaults to the set of all available backends.
+    n_threads : int
+        The number of threads to run concurrently while making requests. Defaults
+        to the number of servers to query.
+    timeout : float
+        The number of seconds to wait for each response. Defaults to 15.
+    ephemeral_pool : bool
+        Whether or not to tear down the thread pool between requests.
+    '''
+
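+    # Usage sketch (illustrative; the USI string below is hypothetical):
+    #     agg = PROXIAggregator(timeout=30)
+    #     merged = agg.get('mzspec:PXD000000:run_name:scan:1')
+    #     merged['responders']  # names of the hosts that answered
+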
+    _coalesce_resolution_methods = ("first", )
+
+    def __init__(self, backends=None, n_threads=None, timeout=15, merge=True, ephemeral_pool=True, **kwargs):
+        if backends is None:
+            backends = {k: v() for k, v in _proxies.items()}
+        if n_threads is None:
+            n_threads = len(backends)
+
+        self.lock = threading.RLock()
+
+        self.timeout = timeout
+        self.backends = backends
+        self.n_threads = n_threads
+        self.ephemeral_pool = ephemeral_pool
+        self.pool = None
+        self.merge = merge
+
+    def _init_pool(self):
+        if ThreadPool is None:
+            return False
+        if self.pool is not None:
+            return True
+        with self.lock:
+            if self.pool is None:
+                self.pool = ThreadPool(self.n_threads)
+        return True
+
+    def _clean_up_pool(self):
+        if self.pool:
+            self.pool.close()
+            self.pool.terminate()
+            self.pool = None
+
+    def _fetch_usi(self, usi):
+        use_pool = self._init_pool()
+        agg = []
+        if use_pool:
+            with self.lock:
+                for backend in self.backends.values():
+                    result = self.pool.apply_async(backend.get, (usi, ))
+                    agg.append((backend, result))
+                tmp = []
+                for backend, res in agg:
+                    try:
+                        res = res.get(self.timeout)
+                        tmp.append((backend, res))
+                    except Exception as err:  # including multiprocessing.TimeoutError
+                        tmp.append((backend, err))
+                agg = tmp
+                if self.ephemeral_pool:
+                    self._clean_up_pool()
+        else:
+            for backend in self.backends.values():
+                try:
+                    agg.append((backend, backend.get(usi)))
+                except Exception as err:
+                    agg.append((backend, err))
+                    continue
+        return agg
+
+    def coalesce(self, responses, method='first'):
+        '''Merge responses from disparate servers into a single spectrum representation.
+
+        The merging process will use the first of every array encountered, and all unique
+        attributes.
+
+        Parameters
+        ----------
+        responses : list
+            A list of (:class:`_PROXIBackend`, response) pairs, where each response
+            is either a :class:`dict` or an :class:`Exception`.
+        method : str
+            The name of the coalescence technique to use. Currently only "first" is
+            supported.
+
+        Returns
+        -------
+        result : :class:`dict`
+            The coalesced spectrum
+        '''
+        if method not in self._coalesce_resolution_methods:
+            raise ValueError("Coalescence method %r not recognized" % (method, ))
+
+        def collapse_attribute(values):
+            try:
+                acc = list(set(v['value'] for v in values))
+            except TypeError:
+                acc = []
+                for v in values:
+                    if v['value'] not in acc:
+                        acc.append(v['value'])
+
+            result = []
+            template = values[0].copy()
+            for v in acc:
+                t = template.copy()
+                t['value'] = v
+                result.append(t)
+            return result
+
+        arrays = {}
+        attributes = defaultdict(list)
+
+        found = []
+        error = []
+
+        for backend, response in responses:
+            if isinstance(response, Exception):
+                error.append((backend.name, response))
+                continue
+            else:
+                found.append(backend.name)
+            for array_name in ('m/z array', 'intensity array'):
+                if array_name not in arrays:
+                    arrays[array_name] = response[array_name]
+                else:
+                    array = response[array_name]
+                    if len(array) != len(arrays[array_name]):
+                        warnings.warn("Length mismatch from %s for %s" %
+                            (backend.name, array_name))
+                        arrays[array_name] = max((array, arrays[array_name]), key=len)
+                    elif not np.allclose(array, arrays[array_name]):
+                        warnings.warn("Value mismatch from %s for %s" %
+                            (backend.name, array_name))
+            for attr in response['attributes']:
+                attributes[attr.get('accession', attr.get('name'))].append(attr)
+
+        finalized_attributes = []
+        for k, v in attributes.items():
+            finalized_attributes.extend(collapse_attribute(v))
+
+        result = {"responders": found, 'errors': error, 'attributes': finalized_attributes}
+        result.update(arrays)
+        if 'm/z array' not in result:
+            raise ValueError("No valid responses found")
+        return result
+
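+    # Shape of the coalesced result (illustrative):
+    #     {'responders': [...], 'errors': [...], 'attributes': [...],
+    #      'm/z array': ..., 'intensity array': ...}
+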
+    def tag_with_source(self, responses):
+        '''Mark each response with its source.
+
+        Parameters
+        ----------
+        responses : list
+            A list of (:class:`_PROXIBackend`, response) pairs, where each response
+            is either a :class:`dict` or an :class:`Exception`.
+
+        Returns
+        -------
+        result : list[dict]
+            The tagged :class:`dict` for each response.
+        '''
+        output = []
+        for backend, response in responses:
+            if isinstance(response, dict):
+                response['source'] = backend
+            else:
+                response = {
+                    "source": backend,
+                    "error": response
+                }
+            output.append(response)
+        return output
+
+    def get(self, usi):
+        '''Retrieve a ``USI`` from each PROXI service over the network.
+
+        Parameters
+        ----------
+        usi : str or :class:`USI`
+            The universal spectrum identifier to retrieve.
+
+        Returns
+        -------
+        result : dict or list[dict]
+            The spectrum coalesced from all responding PROXI hosts if :attr:`merge` is :const:`True`,
+            or a list of responses marked by host.
+        '''
+        agg = self._fetch_usi(usi)
+        if self.merge:
+            return self.coalesce(agg)
+        else:
+            return self.tag_with_source(agg)
+
+    def __call__(self, usi):
+        return self.get(usi)
+
+    def __del__(self):
+        self._clean_up_pool()
+
+_proxies = {
+    "peptide_atlas": PeptideAtlasBackend,
+    "massive": MassIVEBackend,
+    "pride": PRIDEBackend,
+    "jpost": JPOSTBackend,
+    'proteome_exchange': ProteomeExchangeBackend,
+}
+
+default_backend = 'peptide_atlas'
+
+AGGREGATOR_KEY = "aggregator"
+AGGREGATOR = PROXIAggregator()
+
+
+def proxi(usi, backend=default_backend, **kwargs):
+    '''Retrieve a ``USI`` from a `PROXI <http://www.psidev.info/proxi>`_ service.
+
+    Parameters
+    ----------
+    usi : str or :class:`USI`
+        The universal spectrum identifier to request.
+    backend : str or :class:`Callable`
+        Either the name of a PROXI host (peptide_atlas, massive, pride, jpost, proteome_exchange, or aggregator),
+        or a callable object (which :class:`_PROXIBackend` instances are) which will be used
+        to resolve the USI. The "aggregator" backend will use a :class:`PROXIAggregator` instance
+        which will request the same USI from all the registered servers and attempt to merge their
+        responses into a single whole. See :meth:`PROXIAggregator.coalesce` for more details on the
+        merging process.
+    **kwargs:
+        extra arguments passed when constructing the backend by name.
+
+    Returns
+    -------
+    dict :
+        The spectrum as represented by the requested PROXI host.
+    '''
+    if isinstance(backend, str):
+        if backend == AGGREGATOR_KEY:
+            backend = AGGREGATOR
+        elif backend in _proxies:
+            backend = _proxies[backend](**kwargs)
+        else:
+            raise PyteomicsError("Unknown PROXI backend name: {}.".format(backend))
+    elif isinstance(backend, type) and issubclass(backend, (_PROXIBackend, PROXIAggregator)):
+        backend = backend(**kwargs)
+    elif callable(backend):
+        pass  # already a backend-like callable; use as-is
+    else:
+        raise TypeError("Unrecognized backend type: {0.__name__}".format(type(backend)))
+    return backend(usi)
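+
+
+# Usage sketch for proxi() (illustrative; the USI below is hypothetical):
+#     spectrum = proxi('mzspec:PXD000000:run_name:scan:1', backend='massive')
+#     spectrum['m/z array'], spectrum['intensity array']
+# Passing backend='aggregator' queries all registered hosts and merges the
+# responses via PROXIAggregator.coalesce().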
diff --git a/pyteomics/version.py b/pyteomics/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..66aca50842f95c0f102e86e70cfab06152ca0d2d
--- /dev/null
+++ b/pyteomics/version.py
@@ -0,0 +1,66 @@
+"""
+version - Pyteomics version information
+=======================================
+
+This module is provided for convenience and captures information about the current version number of Pyteomics.
+
+Classes
+-------
+
+  :py:class:`VersionInfo` - a namedtuple for version numbers that supports comparisons and can be initialized
+    from a version string.
+
+Constants
+---------
+
+  :py:const:`version` - a string with the current version.
+
+  :py:const:`version_info` - a tuple with structured information about the current version.
+
+"""
+
+__version__ = '4.6.4b3'
+
+from collections import namedtuple
+import re
+
+
+class VersionInfo(namedtuple('VersionInfo', ('major', 'minor', 'micro', 'releaselevel', 'serial'))):
+    """Tuple mimicking :py:const:`sys.version_info`"""
+    def __new__(cls, version_str):
+        if isinstance(version_str, str):
+            groups = re.match(r'(\d+)\.(\d+)(?:\.)?(\d+)?([a-zA-Z]+)?(\d+)?', version_str).groups()
+            inst = super(VersionInfo, cls).__new__(cls, *groups)
+        else:
+            inst = super(VersionInfo, cls).__new__(cls, *(str(x) if x is not None else x for x in version_str))
+        inst._version_str = version_str
+        inst._version_ints = tuple(int(x) if isinstance(x, str) and x.isdigit() else 0 for x in inst)
+        return inst
+
+    def __str__(self):
+        return 'Version {}'.format(self._version_str)
+
+    def __lt__(self, other):
+        if not isinstance(other, VersionInfo):
+            other = VersionInfo(other)
+        return self._version_ints < other._version_ints
+
+    def __gt__(self, other):
+        if not isinstance(other, VersionInfo):
+            other = VersionInfo(other)
+        return self._version_ints > other._version_ints
+
+    def __le__(self, other):
+        return self == other or self < other
+
+    def __ge__(self, other):
+        return self == other or self > other
+
+    def __eq__(self, other):
+        if not isinstance(other, VersionInfo):
+            other = VersionInfo(other)
+        return super(VersionInfo, self).__eq__(other)
+
+    # defining __eq__ would otherwise set __hash__ to None in Python 3
+    __hash__ = tuple.__hash__
+
+
+version_info = VersionInfo(__version__)
+version = __version__
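+
+# Usage sketch (illustrative): comparisons accept plain version strings.
+#     >>> version_info > '4.5'
+#     True
+#     >>> VersionInfo('4.6.4b3').releaselevel
+#     'b'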
diff --git a/pyteomics/xml.py b/pyteomics/xml.py
new file mode 100644
index 0000000000000000000000000000000000000000..db960c7eb3b5fe4375971408bc6b2f23e65e21b2
--- /dev/null
+++ b/pyteomics/xml.py
@@ -0,0 +1,1335 @@
+"""
+xml - utilities for XML parsing
+===============================
+
+This module is not intended for end users. It implements the abstract classes
+for all XML parsers, :py:class:`XML` and :py:class:`IndexedXML`, and some utility functions.
+
+Dependencies
+------------
+
+This module requires :py:mod:`lxml` and :py:mod:`numpy`.
+
+--------------------------------------------------------------------------------
+"""
+
+#   Copyright 2012 Anton Goloborodko, Lev Levitsky
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+import re
+import socket
+from traceback import format_exc
+import warnings
+from collections import OrderedDict, namedtuple
+from itertools import islice
+from lxml import etree
+import numpy as np
+
+from .auxiliary import FileReader, PyteomicsError, basestring, _file_obj, HierarchicalOffsetIndex
+from .auxiliary import unitint, unitfloat, unitstr, cvstr
+from .auxiliary import _keepstate_method as _keepstate
+from .auxiliary import BinaryDataArrayTransformer
+from .auxiliary import TaskMappingMixin, IndexedReaderMixin, IndexSavingMixin
+
+try:  # Python 2.7
+    from urllib2 import urlopen, URLError
+except ImportError:  # Python 3.x
+    from urllib.request import urlopen, URLError
+
+
+def _local_name(element):
+    """Strip namespace from the XML element's name"""
+    tag = element.tag
+    if tag and tag[0] == '{':
+        return tag.rpartition('}')[2]
+    return tag
+
+
+def xsd_parser(schema_url):
+    """Parse an XSD file from the specified URL into a schema dictionary
+    that can be used by :class:`XML` parsers to automatically cast data to
+    the appropriate type.
+
+    Parameters
+    ----------
+    schema_url : str
+        The URL to retrieve the schema from
+
+    Returns
+    -------
+    dict
+    """
+    ret = {}
+    if not (schema_url.startswith('http://') or
+            schema_url.startswith('https://') or
+            schema_url.startswith('file://')):
+        schema_url = 'file://' + schema_url
+    schema_file = urlopen(schema_url)
+    p = etree.XMLParser(remove_comments=True)
+    schema_tree = etree.parse(schema_file, parser=p)
+    types = {'ints': {'int', 'long', 'nonNegativeInteger', 'positiveInt',
+                      'integer', 'unsignedInt'},
+             'floats': {'float', 'double'},
+             'bools': {'boolean'},
+             'intlists': {'listOfIntegers'},
+             'floatlists': {'listOfFloats'},
+             'charlists': {'listOfChars', 'listOfCharsOrAny'}}
+    for k, val in types.items():
+        tuples = set()
+        for elem in schema_tree.iter():
+            if _local_name(elem) == 'attribute' and elem.attrib.get(
+                    'type', '').split(':')[-1] in val:
+                anc = elem.getparent()
+                anc_name = _local_name(anc)
+                while not (
+                        (anc_name == 'complexType' and 'name' in anc.attrib) or anc_name == 'element'):
+                    anc = anc.getparent()
+                    anc_name = _local_name(anc)
+                    if anc is None:
+                        break
+                else:
+                    if anc_name == 'complexType':
+                        elnames = [x.attrib['name'] for x in
+                                   schema_tree.iter()
+                                   if x.attrib.get('type', '').split(':')[-1] == anc.attrib['name']]
+                    else:
+                        elnames = (anc.attrib['name'],)
+                    for elname in elnames:
+                        tuples.add(
+                            (elname, elem.attrib['name']))
+        ret[k] = tuples
+    ret['lists'] = set(elem.attrib['name'] for elem in schema_tree.xpath(
+        '//*[local-name()="element"]') if 'name' in elem.attrib and
+        elem.attrib.get('maxOccurs', '1') != '1')
+    return ret
+
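+# Usage sketch (illustrative; the schema location is hypothetical):
+#     schema = xsd_parser('/path/to/mzML1.1.0.xsd')  # local paths get a file:// prefix
+#     ('spectrum', 'defaultArrayLength') in schema['ints']
+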
+
+class XMLValueConverter(object):
+    # Adapted from http://stackoverflow.com/questions/2764269/parsing-an-xsduration-datatype-into-a-python-datetime-timedelta-object
+    _duration_parser = re.compile(
+        (r'(?P<sign>-?)P(?:(?P<years>\d+\.?\d*)Y)?(?:(?P<months>\d+\.?\d*)M)?(?:(?P<days>\d+\.?\d*)D)?(?:T(?:(?P<hours>\d+\.?\d*)H)?(?:(?P<minutes>\d+\.?\d*)M)?(?:(?P<seconds>\d+\.?\d*)S)?)?'))
+
+    @classmethod
+    def duration_str_to_float(cls, s):
+        # Not a duration, so pass along
+        if not s.startswith('P'):
+            try:
+                return unitfloat(s, 'duration')
+            except ValueError:
+                return unitstr(s, 'duration')
+        match = cls._duration_parser.search(s)
+        if match:
+            matchdict = match.groupdict()
+            hours = float(matchdict.get('hours', 0) or 0)
+            minutes = float(matchdict.get('minutes', 0) or 0)
+            seconds = float(matchdict.get('seconds', 0) or 0)
+            minutes += hours * 60.
+            minutes += (seconds / 60.)
+            return unitfloat(minutes, 'minute')
+        else:
+            return unitstr(s, 'duration')
+
+    @classmethod
+    def str_to_bool(cls, s):
+        if s.lower() in {'true', '1', 'y'}:
+            return True
+        if s.lower() in {'false', '0', 'n'}:
+            return False
+        raise PyteomicsError('Cannot convert string to bool: ' + s)
+
+    @classmethod
+    def str_to_num(cls, s, numtype):
+        return numtype(s) if s else None
+
+    @classmethod
+    def to(cls, t):
+        def convert_from(s):
+            return cls.str_to_num(s, t)
+        return convert_from
+
+    @classmethod
+    def converters(cls):
+        return {
+            'ints': cls.to(unitint), 'floats': cls.to(unitfloat), 'bools': cls.str_to_bool,
+            'intlists': lambda x: np.fromstring(x.replace('\n', ' '), dtype=int, sep=' '),
+            'floatlists': lambda x: np.fromstring(x.replace('\n', ' '), sep=' '),
+            'charlists': list,
+            'duration': cls.duration_str_to_float
+        }
+
+
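+# Conversion sketch (illustrative): xsd:duration values are normalized to minutes.
+#     XMLValueConverter.duration_str_to_float('PT1H30M')  # -> unitfloat(90.0, 'minute')
+#     XMLValueConverter.duration_str_to_float('42.0')     # -> unitfloat(42.0, 'duration')
+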
+class _XMLParam(namedtuple("XMLParam", ("name", "value", "type"))):
+    '''A holder for semantic parameters used in several common XML formats
+
+    Attributes
+    ----------
+    name: :class:`~.cvstr`
+        The name of the attribute, carrying the accession and unit information
+    value: :class:`~.unitfloat`, :class:`~.unitint` or :class:`~.unitstr`
+        The value of the parameter
+    type: :class:`str`
+        The parameter's local XML tag name.
+    '''
+    __slots__ = ()
+
+    def is_empty(self):
+        value = self.value
+        return value == "" or value is None
+
+
+class XML(FileReader):
+    """Base class for all format-specific XML parsers. The instances can be used
+    as context managers and as iterators.
+    """
+    # Configurable data
+    file_format = 'XML'
+    _root_element = None
+    _default_schema = {}
+    _read_schema = False
+    _default_version = 0
+    _default_iter_tag = None
+    _default_iter_path = None
+    _structures_to_flatten = []
+    _schema_location_param = 'schemaLocation'
+    _default_id_attr = 'id'
+    _huge_tree = False
+    _retrieve_refs_enabled = None  # only some subclasses implement this
+    _iterative = True
+
+    # Configurable plugin logic
+    _converters = XMLValueConverter.converters()
+    _element_handlers = {}
+
+    # Must be implemented by subclasses
+    def _get_info_smart(self, element, **kwargs):
+        raise NotImplementedError
+
+    def __init__(self, source, read_schema=None, iterative=None, build_id_cache=False, **kwargs):
+        """Create an XML parser object.
+
+        Parameters
+        ----------
+        source : str or file
+            File name or file-like object corresponding to an XML file.
+        read_schema : bool, optional
+            Defines whether schema file referenced in the file header
+            should be used to extract information about value conversion.
+            Default is :py:const:`False`.
+        iterative : bool, optional
+            Defines whether an :py:class:`ElementTree` object should be
+            constructed and stored on the instance or if iterative parsing
+            should be used instead. Iterative parsing keeps the memory usage
+            low for large XML files. Default is :py:const:`True`.
+        build_id_cache : bool, optional
+            Defines whether a dictionary mapping IDs to XML tree elements
+            should be built and stored on the instance. It is used in
+            :py:meth:`XML.get_by_id`, e.g. when using
+            :py:class:`pyteomics.mzid.MzIdentML` with ``retrieve_refs=True``.
+        huge_tree : bool, optional
+            This option is passed to the `lxml` parser and defines whether
+            security checks for XML tree depth and node size should be disabled.
+            Default is :py:const:`False`.
+            Enable this option for trusted files to avoid XMLSyntaxError exceptions
+            (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`).
+        """
+
+        super(XML, self).__init__(source, mode='rb', parser_func=self.iterfind, pass_file=False,
+                args=(self._default_iter_path or self._default_iter_tag,), kwargs=kwargs)
+        if iterative is None:
+            iterative = self._iterative
+        if iterative:
+            self._tree = None
+        else:
+            self.build_tree()
+        if build_id_cache:
+            self.build_id_cache()
+        else:
+            self._id_dict = None
+
+        self.version_info = self._get_version_info()
+        if read_schema is not None:
+            self._read_schema = read_schema
+        self.schema_info = self._get_schema_info(self._read_schema)
+
+        self._converters_items = self._converters.items()
+        self._huge_tree = kwargs.get('huge_tree', self._huge_tree)
+        self._retrieve_refs_enabled = kwargs.get('retrieve_refs')
+
+    def __reduce_ex__(self, protocol):
+        return self.__class__, (
+            self._source_init, self._read_schema, self._tree is None,
+            False,
+        ), self.__getstate__()
+
+    def __getstate__(self):
+        state = super(XML, self).__getstate__()
+        state['_huge_tree'] = self._huge_tree
+        state['_retrieve_refs_enabled'] = self._retrieve_refs_enabled
+        state['_id_dict'] = self._id_dict
+        return state
+
+    def __setstate__(self, state):
+        super(XML, self).__setstate__(state)
+        self._huge_tree = state['_huge_tree']
+        self._retrieve_refs_enabled = state['_retrieve_refs_enabled']
+        self._id_dict = state['_id_dict']
+
+    @_keepstate
+    def _get_version_info(self):
+        """
+        Provide version information about the XML file.
+
+        Returns
+        -------
+        out : tuple
+            A (version, schema URL) tuple; both elements are strings or None.
+        """
+        for _, elem in etree.iterparse(
+                self._source, events=('start',), remove_comments=True, huge_tree=self._huge_tree):
+            if _local_name(elem) == self._root_element:
+                return (elem.attrib.get('version'),
+                        elem.attrib.get(('{{{}}}'.format(elem.nsmap['xsi'])
+                            if 'xsi' in elem.nsmap else '') + self._schema_location_param))
+
+    @_keepstate
+    def _get_schema_info(self, read_schema=True):
+        """Stores defaults for the schema, tries to retrieve the schema for
+        other versions. Keys are: 'floats', 'ints', 'bools', 'lists',
+        'intlists', 'floatlists', 'charlists'."""
+        if not read_schema:
+            return self._default_schema
+
+        version, schema = self.version_info
+        if version == self._default_version:
+            return self._default_schema
+
+        ret = {}
+        try:
+            if not schema:
+                schema_url = ''
+                raise PyteomicsError(
+                        'Schema information not found in {}.'.format(self.name))
+            schema_url = schema.split()[-1]
+            ret = xsd_parser(schema_url)
+        except Exception as e:
+            if isinstance(e, (URLError, socket.error, socket.timeout)):
+                warnings.warn("Can't get the {0.file_format} schema for version "
+                "`{1}` from <{2}> at the moment.\n"
+                "Using defaults for {0._default_version}.\n"
+                "You can disable reading the schema by specifying "
+                "`read_schema=False`.".format(self, version, schema_url))
+            else:
+                warnings.warn("Unknown {0.file_format} version `{1}`.\n"
+                    "Attempt to use schema "
+                    "information from <{2}> failed.\n"
+                    "Exception information:\n{3}\n"
+                    "Falling back to defaults for {0._default_version}\n"
+                    "NOTE: This is just a warning, probably from a badly-"
+                    "generated XML file.\nYou will still most probably get "
+                    "decent results.\nLook here for suppressing warnings:\n"
+                    "http://docs.python.org/library/warnings.html#"
+                    "temporarily-suppressing-warnings\n"
+                    "You can also disable reading the schema by specifying "
+                    "`read_schema=False`.\n"
+                    "If you think this shouldn't have happened, please "
+                    "report this to\n"
+                    "http://github.com/levitsky/pyteomics/issues\n"
+                    "".format(self, version, schema_url, format_exc()))
+            ret = self._default_schema
+        return ret
+
+    def _handle_param(self, element, **kwargs):
+        """Unpacks cvParam and userParam tags into key-value pairs"""
+        types = {'int': unitint, 'float': unitfloat, 'string': unitstr}
+        attribs = element.attrib
+        unit_info = None
+        unit_accession = None
+        if 'unitCvRef' in attribs or 'unitName' in attribs:
+            unit_accession = attribs.get('unitAccession')
+            unit_name = attribs.get('unitName', unit_accession)
+            unit_info = unit_name
+        accession = attribs.get('accession')
+        value = attribs.get('value', '')
+        try:
+            if attribs.get('type') in types:
+                value = types[attribs['type']](value, unit_info)
+            else:
+                value = unitfloat(value, unit_info)
+        except ValueError:
+            value = unitstr(value, unit_info)
+
+        return _XMLParam(cvstr(attribs['name'], accession, unit_accession), value, _local_name(element))
+
+    def _handle_referenceable_param_group(self, param_group_ref, **kwargs):
+        # subclasses that support referenceableParamGroupRef should return a list of _XMLParam
+        raise NotImplementedError()
+
+    def _find_immediate_params(self, element, **kwargs):
+        return element.xpath(
+            './*[local-name()="cvParam" or local-name()="userParam" or local-name()="UserParam" or local-name()="referenceableParamGroupRef"]')
+
+    def _insert_param(self, info_dict, param):
+        key = param.name
+        if key in info_dict:
+            if isinstance(info_dict[key], list):
+                info_dict[key].append(param.value)
+            else:
+                info_dict[key] = [info_dict[key], param.value]
+        else:
+            info_dict[key] = param.value
+
+    def _promote_empty_parameter_to_name(self, info, params):
+        empty_values = []
+        not_empty_values = []
+        for param in params:
+            if param.is_empty():
+                empty_values.append(param)
+            else:
+                not_empty_values.append(param)
+
+        if len(empty_values) == 1 and 'name' not in info:
+            info['name'] = empty_values[0].name
+            return info, not_empty_values
+        return info, params
+
+    def _get_info(self, element, **kwargs):
+        """Extract info from element's attributes, possibly recursive.
+        <cvParam> and <userParam> elements are treated in a special way."""
+        try:
+            name = kwargs.pop('ename')
+        except KeyError:
+            name = _local_name(element)
+        schema_info = self.schema_info
+        if name in {'cvParam', 'userParam', 'UserParam'}:
+            return self._handle_param(element, **kwargs)
+        elif name == "referenceableParamGroupRef":
+            return self._handle_referenceable_param_group(element, **kwargs)
+
+        info = dict(element.attrib)
+        # process subelements
+        params = []
+        if kwargs.get('recursive'):
+            for child in element.iterchildren():
+                cname = _local_name(child)
+                if cname in {'cvParam', 'userParam', 'UserParam'}:
+                    newinfo = self._handle_param(child, **kwargs)
+                    params.append(newinfo)
+                elif cname == "referenceableParamGroupRef":
+                    params.extend(self._handle_referenceable_param_group(child, **kwargs))
+                else:
+                    if cname not in schema_info['lists']:
+                        info[cname] = self._get_info_smart(child, ename=cname, **kwargs)
+                    else:
+                        info.setdefault(cname, []).append(
+                            self._get_info_smart(child, ename=cname, **kwargs))
+        else:
+            # handle the case where we do not want to unpack all children, but
+            # *Param tags are considered part of the current entity, semantically
+            for child in self._find_immediate_params(element, **kwargs):
+                param_or_group = self._handle_param(child, **kwargs)
+                if isinstance(param_or_group, list):
+                    params.extend(param_or_group)
+                else:
+                    params.append(param_or_group)
+
+        handler = self._element_handlers.get(name)
+        if handler is not None:
+            info, params = handler(self, info, params)
+
+        for param in params:
+            self._insert_param(info, param)
+
+        # process element text
+        if element.text:
+            stext = element.text.strip()
+            if stext:
+                if info:
+                    info[name] = stext
+                else:
+                    return stext
+
+        # convert types
+        try:
+            for k, v in info.items():
+                for t, a in self._converters_items:
+                    if t in schema_info and (name, k) in schema_info[t]:
+                        info[k] = a(v)
+        except ValueError as e:
+            message = 'Error when converting types: {}'.format(e.args)
+            if not self._read_schema:
+                message += '\nTry reading the file with read_schema=True'
+            raise PyteomicsError(message)
+
+        # resolve refs
+        if kwargs.get('retrieve_refs', self._retrieve_refs_enabled):
+            self._retrieve_refs(info, **kwargs)
+
+        # flatten the excessive nesting
+        for k, v in dict(info).items():
+            if k in self._structures_to_flatten:
+                if isinstance(v, list):
+                    for vi in v:
+                        info.update(vi)
+                else:
+                    info.update(v)
+                del info[k]
+
+        # another simplification
+        for k, v in dict(info).items():
+            if isinstance(v, dict) and 'name' in v and len(v) == 1:
+                info[k] = v['name']
+        if len(info) == 2 and 'name' in info and (
+                'value' in info or 'values' in info):
+            name = info.pop('name')
+            info = {name: info.popitem()[1]}
+        return info
+
+    @_keepstate
+    def build_tree(self):
+        """Build and store the :py:class:`ElementTree` instance
+        for the underlying file"""
+        p = etree.XMLParser(remove_comments=True, huge_tree=True)
+        self._tree = etree.parse(self._source, parser=p)
+
+    def clear_tree(self):
+        """Remove the saved :py:class:`ElementTree`."""
+        self._tree = None
+
+    def _retrieve_refs(self, info, **kwargs):
+        """Retrieves and embeds the data for each attribute in `info` that
+        ends in _ref. Removes the id attribute from `info`.
+
+        This implementation is a stub and must be implemented for each specific
+        subclass. It is only called if `retrieve_refs` is enabled."""
+        raise NotImplementedError(
+            ("_retrieve_refs is not implemented for {}. "
+             "Do not use `retrieve_refs=True`.").format(
+                self.__class__.__name__))
+
+    def iterfind(self, path, **kwargs):
+        """Parse the XML and yield info on elements with specified local
+        name or by specified "XPath".
+
+        Parameters
+        ----------
+        path : str
+            Element name or XPath-like expression. The path is very close to
+            full XPath syntax, but local names should be used for all elements in the path.
+            They will be substituted with local-name() checks, up to the (first) predicate.
+            The path can be absolute or "free". Please don't specify namespaces.
+        **kwargs : passed to :py:meth:`self._get_info_smart`.
+
+        Returns
+        -------
+        out : iterator
+        """
+        return Iterfind(self, path, **kwargs)
+
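+    # Path syntax sketch (illustrative; element names depend on the concrete format):
+    #     reader.iterfind('spectrum')                 # all <spectrum> elements anywhere
+    #     reader.iterfind('//scanList/scan')          # "free" multi-step path
+    #     reader.iterfind('spectrum[@id="scan=1"]')   # with an XPath predicate
+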
+    @_keepstate
+    def _iterfind_impl(self, path, **kwargs):
+        """Parse the XML and yield info on elements with specified local
+        name or by specified "XPath".
+
+        Parameters
+        ----------
+        path : str
+            Element name or XPath-like expression. The path is very close to
+            full XPath syntax, but local names should be used for all elements in the path.
+            They will be substituted with local-name() checks, up to the (first) predicate.
+            The path can be absolute or "free". Please don't specify namespaces.
+        **kwargs : passed to :py:meth:`self._get_info_smart`.
+
+        Returns
+        -------
+        out : iterator
+        """
+        try:
+            path, tail = re.match(pattern_path, path).groups()
+        except AttributeError:
+            raise PyteomicsError('Invalid path: ' + path)
+        if path[:2] == '//' or path[0] != '/':
+            absolute = False
+            if path[:2] == '//':
+                path = path[2:]
+                if path[0] == '/' or '//' in path:
+                    raise PyteomicsError("Too many /'s in a row.")
+        else:
+            absolute = True
+            path = path[1:]
+        nodes = path.rstrip('/').split('/')
+        if not nodes:
+            raise PyteomicsError('Invalid path: ' + path)
+
+        if not self._tree:
+            if tail:
+                if tail[0] == '[':
+                    tail = '(.)' + tail
+                else:
+                    raise PyteomicsError('Cannot parse path tail: ' + tail)
+                xpath = etree.XPath(tail)
+            localname = nodes[0]
+            found = 0
+            for ev, elem in etree.iterparse(self, events=('start', 'end'), remove_comments=True, huge_tree=self._huge_tree):
+                name_lc = _local_name(elem)
+                if ev == 'start':
+                    if name_lc == localname or localname == '*':
+                        found += 1
+                else:
+                    if name_lc == localname or localname == '*':
+                        if (absolute and elem.getparent() is None) or not absolute:
+                            for child in get_rel_path(elem, nodes[1:]):
+                                if tail:
+                                    for elem in xpath(child):
+                                        info = self._get_info_smart(elem, **kwargs)
+                                        yield info
+                                else:
+                                    info = self._get_info_smart(child, **kwargs)
+                                    yield info
+                        if not localname == '*':
+                            found -= 1
+                    if not found:
+                        elem.clear()
+        else:
+            xpath = ('/' if absolute else '//') + '/'.join(
+                    '*[local-name()="{}"]'.format(node) if node != '*' else '*' for node in nodes) + tail
+            for elem in self._tree.xpath(xpath):
+                info = self._get_info_smart(elem, **kwargs)
+                yield info
+
+    @_keepstate
+    def build_id_cache(self):
+        """Construct a cache for each element in the document, indexed by id
+        attribute"""
+        stack = 0
+        id_dict = {}
+        for event, elem in etree.iterparse(self._source, events=('start', 'end'),
+                remove_comments=True, huge_tree=self._huge_tree):
+            if event == 'start':
+                if 'id' in elem.attrib:
+                    stack += 1
+            else:
+                if 'id' in elem.attrib:
+                    stack -= 1
+                    id_dict[elem.attrib['id']] = elem
+                elif stack == 0:
+                    elem.clear()
+        self._id_dict = id_dict
+
+    def clear_id_cache(self):
+        """Clear the element ID cache"""
+        self._id_dict = {}
+
+    def _find_by_id_no_reset(self, elem_id, id_key=None):
+        """
+        An almost exact copy of :meth:`get_by_id` with the difference that it does
+        not reset the file reader's position before iterative parsing.
+
+        Parameters
+        ----------
+        elem_id : str
+            The element id to query for
+
+        Returns
+        -------
+        lxml.Element
+        """
+        found = False
+        if id_key is None:
+            id_key = self._default_id_attr
+        for event, elem in etree.iterparse(
+                self._source, events=('start', 'end'), remove_comments=True, huge_tree=self._huge_tree):
+            if event == 'start':
+                if elem.attrib.get(id_key) == elem_id:
+                    found = True
+            else:
+                if elem.attrib.get(id_key) == elem_id:
+                    return elem
+                if not found:
+                    elem.clear()
+        raise KeyError(elem_id)
+
+    @_keepstate
+    def get_by_id(self, elem_id, **kwargs):
+        """Parse the file and return the element with `id` attribute equal
+        to `elem_id`. Returns :py:const:`None` if no such element is found.
+
+        Parameters
+        ----------
+        elem_id : str
+            The value of the `id` attribute to match.
+
+        Returns
+        -------
+        out : :py:class:`dict` or :py:const:`None`
+        """
+        if not self._id_dict:
+            elem = self._find_by_id_no_reset(elem_id)
+        else:
+            elem = self._id_dict[elem_id]
+        return self._get_info_smart(elem, **kwargs)
+
+
+# XPath emulator tools
+pattern_path = re.compile(r'([\w/*]*)(.*)')
+
+
+def get_rel_path(element, names):
+    if not names:
+        yield element
+    else:
+        for child in element.iterchildren():
+            if names[0] == '*' or _local_name(child) == names[0]:
+                if len(names) == 1:
+                    yield child
+                else:
+                    for gchild in get_rel_path(child, names[1:]):
+                        yield gchild
+
+
+def xpath(tree, path, ns=None):
+    """Return the results of XPath query with added namespaces.
+    Assumes the ns declaration is on the root element or absent.
+
+    Parameters
+    ----------
+
+    tree : ElementTree
+    path : str
+    ns   : str or None, optional
+    """
+    if hasattr(tree, 'getroot'):
+        root = tree.getroot()
+    else:
+        root = tree
+        while root.getparent() is not None:
+            root = root.getparent()
+    ns = root.nsmap.get(ns)
+
+    def repl(m):
+        s = m.group(1)
+        if not ns:
+            return s
+        if not s:
+            return 'd:'
+        return '/d:'
+    new_path = re.sub(r'(\/|^)(?![\*\/])', repl, path)
+    n_s = ({'d': ns} if ns else None)
+    return tree.xpath(new_path, namespaces=n_s)
+
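+# Usage sketch (illustrative; the file name is hypothetical): query a tree whose
+# default namespace is declared on the root element.
+#     tree = etree.parse('tiny.mzML')
+#     spectra = xpath(tree, '//spectrum')
+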
+
+def _make_version_info(cls):
+    def version_info(source):
+        return cls(source).version_info
+    version_info.__doc__ = """
+    Provide version information about the {0.file_format} file.
+
+    .. note:: This function is provided for backward compatibility only.
+        It simply creates an :py:class:`{0.__name__}` instance
+        and returns its :py:data:`!version_info` attribute.
+
+    Parameters
+    ----------
+    source : str or file
+        File name or file-like object.
+
+    Returns
+    -------
+    out : tuple
+        A (version, schema URL) tuple; both elements are strings or None.
+    """.format(cls)
+    return version_info
+
+
+class ByteCountingXMLScanner(_file_obj):
+    """
+    Carry out the construction of a byte offset index for `source` XML file
+    for each type of tag in :attr:`indexed_tags`.
+
+    Inherits from :py:class:`pyteomics.auxiliary._file_obj` to support the object-oriented
+    :py:func:`_keepstate` interface.
+    """
+    entities = {
+        'quot': '"',
+        'amp': '&',
+        'apos': "'",
+        'lt': '<',
+        'gt': '>',
+    }
+
+    xml_entity_pattern = re.compile(r"&({});".format('|'.join(entities.keys())))
+
+    def __init__(self, source, indexed_tags, block_size=1000000):
+        """
+        Parameters
+        ----------
+        indexed_tags : iterable of bytes
+            The XML tags (without namespaces) to build indices for.
+        block_size : int, optional
+            The size of each chunk or "block" of the file to hold in memory as a
+            partitioned string at any given time. Defaults to `1000000`.
+        """
+        super(ByteCountingXMLScanner, self).__init__(source, 'rb')
+        self.indexed_tags = ensure_bytes(indexed_tags)
+        self.block_size = block_size
+
+    def _chunk_iterator(self):
+        """
+        Read a file in large blocks and chunk up each block into parts
+        resembling XML tags, yielding each chunk.
+
+        Assumes the file is opened in binary mode.
+        """
+        f = self.file
+        read_size = self.block_size
+        delim = b'<'
+        buff = f.read(read_size)
+        started_with_delim = buff.startswith(delim)
+        parts = buff.split(delim)
+        tail = parts[-1]
+        front = parts[:-1]
+        i = 0
+        for part in front:
+            i += 1
+            if part == b"":
+                continue
+            if i == 1:
+                if started_with_delim:
+                    yield delim + part
+                else:
+                    yield part
+            else:
+                yield delim + part
+        running = True
+        while running:
+            buff = f.read(read_size)
+            if not buff:
+                running = False
+                buff = tail
+            else:
+                buff = tail + buff
+            parts = buff.split(delim)
+            tail = parts[-1]
+            front = parts[:-1]
+            for part in front:
+                yield delim + part
+
+    def _generate_offsets(self):
+        """
+        Iterate over the chunks of an XML file produced by :meth:`_chunk_iterator`,
+        tracking the byte offset of each chunk. When a chunk opens a tag whose name matches
+        a name in :attr:`indexed_tags`, yield the byte offset, the tag type, and its attributes.
+
+        Yields
+        ------
+        offset : int
+            The byte offset of a matched tag's opening line
+        tag_type : bytes
+            The type of tag matched
+        attr_dict : dict
+            The attributes on the matched tag
+        """
+        i = 0
+        packed = b"|".join(self.indexed_tags)
+        pattern = re.compile((r"^\s*<(%s)\s" % packed.decode()).encode())
+        attrs = re.compile(br"(\S+)=[\"']([^\"']*)[\"']")
+        for line in self._chunk_iterator():
+            match = pattern.match(line)
+            if match:
+                yield i, match.group(1), dict(attrs.findall(line))
+            i += len(line)
+
+    def _entity_sub_cb(self, match):
+        ent = match.group(1)
+        return self.entities[ent]
+
+    def replace_entities(self, key):
+        '''Replace XML entities in a string with their character representation
+
+        Uses the minimal mapping of XML entities pre-defined for all XML documents and
+        does not attempt to deal with external DTD defined entities. This mapping is found
+        in :attr:`entities`.
+
+        Parameters
+        ----------
+        key : str
+            The string to substitute
+
+        Returns
+        -------
+        str
+        '''
+        return self.xml_entity_pattern.sub(self._entity_sub_cb, key)
+
+    @_keepstate
+    def build_byte_index(self, lookup_id_key_mapping=None):
+        """
+        Builds a byte offset index for one or more types of tags.
+
+        Parameters
+        ----------
+        lookup_id_key_mapping : Mapping, optional
+            A mapping from tag name to the attribute to look up the identity
+            for each entity of that type to be extracted. Defaults to 'id' for
+            each type of tag.
+
+        Returns
+        -------
+        defaultdict(dict)
+            Mapping from tag type to dict from identifier to byte offset
+        """
+        if lookup_id_key_mapping is None:
+            lookup_id_key_mapping = {}
+        lookup_id_key_mapping = {ensure_bytes_single(key): ensure_bytes_single(value)
+            for key, value in lookup_id_key_mapping.items()}
+
+        for name in self.indexed_tags:
+            bname = ensure_bytes_single(name)
+            lookup_id_key_mapping.setdefault(bname, 'id')
+            lookup_id_key_mapping[bname] = ensure_bytes_single(lookup_id_key_mapping[bname])
+
+        indices = HierarchicalOffsetIndex()
+        g = self._generate_offsets()
+        for offset, offset_type, attrs in g:
+            k = attrs[lookup_id_key_mapping[offset_type]].decode('utf-8')
+            if '&' in k:
+                k = self.replace_entities(k)
+            indices[offset_type.decode('utf-8')][k] = offset
+        return indices
+
+    @classmethod
+    def scan(cls, source, indexed_tags):
+        inst = cls(source, indexed_tags)
+        return inst.build_byte_index()
+
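+# Usage sketch (illustrative; file and tag names are hypothetical):
+#     index = ByteCountingXMLScanner.scan('tiny.mzML', ['spectrum'])
+#     index['spectrum']  # maps each spectrum id to its byte offset
+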
+
+class TagSpecificXMLByteIndex(object):
+    """
+    Encapsulates the construction and querying of a byte offset index
+    for a set of XML tags.
+
+    This type mimics an immutable Mapping.
+
+    Attributes
+    ----------
+    indexed_tags : iterable of bytes
+        The tag names to index, not including a namespace
+    offsets : defaultdict(OrderedDict(str, int))
+        The hierarchy of byte offsets organized ``{"tag_type": {"id": byte_offset}}``
+    indexed_tag_keys: dict(str, str)
+        A mapping from tag name to unique identifier attribute
+
+    Parameters
+    ----------
+    indexed_tags : iterable of bytes
+        The tag names to include in the index
+
+    """
+    _default_indexed_tags = []
+    _default_keys = {}
+    _scanner_class = ByteCountingXMLScanner
+
+    def __init__(self, source, indexed_tags=None, keys=None):
+        if keys is None:
+            keys = self._default_keys.copy()
+        if indexed_tags is None:
+            indexed_tags = self._default_indexed_tags
+        self.indexed_tags = indexed_tags
+        self.indexed_tag_keys = keys
+        self.source = source
+        self.offsets = HierarchicalOffsetIndex()
+        self.build_index()
+
+    def __getstate__(self):
+        state = {}
+        state['indexed_tags'] = self.indexed_tags
+        state['indexed_tag_keys'] = self.indexed_tag_keys
+        state['offsets'] = self.offsets
+        return state
+
+    def __setstate__(self, state):
+        self.indexed_tags = state['indexed_tags']
+        self.indexed_tag_keys = state['indexed_tag_keys']
+        self.offsets = state['offsets']
+
+    def __getitem__(self, key):
+        return self.offsets[key]
+
+    def build_index(self):
+        """
+        Perform the byte offset index building for :py:attr:`source`.
+
+        Returns
+        -------
+        offsets : :py:class:`HierarchicalOffsetIndex`
+            The hierarchical offset index, also stored in :py:attr:`offsets`
+        """
+        scanner = self._scanner_class(self.source, self.indexed_tags)
+        self.offsets = scanner.build_byte_index(self.indexed_tag_keys)
+        return self.offsets
+
+    def items(self):
+        return self.offsets.items()
+
+    def keys(self):
+        return self.offsets.keys()
+
+    def __iter__(self):
+        return iter(self.keys())
+
+    def __len__(self):
+        return sum(len(group) for key, group in self.items())
+
+    @classmethod
+    def build(cls, source, indexed_tags=None, keys=None):
+        indexer = cls(source, indexed_tags, keys)
+        return indexer.offsets
+
+
+def ensure_bytes_single(string):
+    if isinstance(string, bytes):
+        return string
+    try:
+        return string.encode('utf-8')
+    except (AttributeError, UnicodeEncodeError):
+        raise PyteomicsError('{!r} could not be encoded'.format(string))
+
+
+def ensure_bytes(strings):
+    if isinstance(strings, basestring):
+        strings = [strings]
+    return [ensure_bytes_single(string) for string in strings]
+
+
+def _flatten_map(hierarchical_map):
+    all_records = []
+    for key, records in hierarchical_map.items():
+        all_records.extend(records.items())
+
+    all_records.sort(key=lambda x: x[1])
+    return OrderedDict(all_records)
+
+
+class IndexedXML(IndexedReaderMixin, XML):
+    """Subclass of :py:class:`XML` which uses an index of byte offsets for some
+    elements for quick random access.
+    """
+    _indexed_tags = set()
+    _indexed_tag_keys = {}
+    _use_index = True
+
+    def __init__(self, source, read_schema=False, iterative=True, build_id_cache=False,
+                 use_index=None, *args, **kwargs):
+        """Create an indexed XML parser object.
+
+        Parameters
+        ----------
+        source : str or file
+            File name or file-like object corresponding to an XML file.
+        read_schema : bool, optional
+            Defines whether schema file referenced in the file header
+            should be used to extract information about value conversion.
+            Default is :py:const:`False`.
+        iterative : bool, optional
+            Defines whether an :py:class:`ElementTree` object should be
+            constructed and stored on the instance or if iterative parsing
+            should be used instead. Iterative parsing keeps the memory usage
+            low for large XML files. Default is :py:const:`True`.
+        use_index : bool, optional
+            Defines whether an index of byte offsets needs to be created for
+            elements listed in `indexed_tags`.
+            This is useful for random access to spectra in mzML or elements of mzIdentML files,
+            or for iterative parsing of mzIdentML with ``retrieve_refs=True``.
+            If :py:const:`True`, `build_id_cache` is ignored.
+            If :py:const:`False`, the object acts exactly like :py:class:`XML`.
+            Default is :py:const:`True`.
+        indexed_tags : container of bytes, optional
+            If `use_index` is :py:const:`True`, elements listed in this parameter
+            will be indexed. Empty set by default.
+        """
+        tags = kwargs.get('indexed_tags')
+        tag_index_keys = kwargs.get('indexed_tag_keys')
+
+        if tags is not None:
+            self._indexed_tags = tags
+        if tag_index_keys is not None:
+            self._indexed_tag_keys = tag_index_keys
+
+        if use_index is not None:
+            self._use_index = use_index
+
+        if self._use_index:
+            build_id_cache = False
+            if self._default_iter_path and self._default_iter_path != self._default_iter_tag:
+                warnings.warn('_default_iter_path differs from _default_iter_tag and index is enabled. '
+                    '_default_iter_tag will be used in the index, mind the consequences.')
+        super(IndexedXML, self).__init__(source, read_schema, iterative, build_id_cache, *args, **kwargs)
+
+        self._offset_index = None
+        self._build_index()
+
+    @property
+    def default_index(self):
+        return self._offset_index[self._default_iter_tag]
+
+    def __reduce_ex__(self, protocol):
+        reconstructor, args, state = XML.__reduce_ex__(self, protocol)
+        args = args + (False, )
+        return reconstructor, args, state
+
+    def __getstate__(self):
+        state = super(IndexedXML, self).__getstate__()
+        state['_indexed_tags'] = self._indexed_tags
+        state['_indexed_tag_keys'] = self._indexed_tag_keys
+        state['_use_index'] = self._use_index
+        state['_offset_index'] = self._offset_index
+        return state
+
+    def __setstate__(self, state):
+        super(IndexedXML, self).__setstate__(state)
+        self._indexed_tags = state['_indexed_tags']
+        self._indexed_tag_keys = state['_indexed_tag_keys']
+        self._use_index = state['_use_index']
+        self._offset_index = state['_offset_index']
+
+    @_keepstate
+    def _build_index(self):
+        """
+        Build up a `dict` of `dict` of offsets for elements. Calls :meth:`TagSpecificXMLByteIndex.build`
+        on :attr:`_source` and assigns the return value to :attr:`_offset_index`
+        """
+        if not self._indexed_tags or not self._use_index:
+            return
+        self._offset_index = TagSpecificXMLByteIndex.build(
+            self._source, self._indexed_tags, self._indexed_tag_keys)
+
+    @_keepstate
+    def _find_by_id_reset(self, elem_id, id_key=None):
+        return self._find_by_id_no_reset(elem_id, id_key=id_key)
+
+    @_keepstate
+    def get_by_id(self, elem_id, id_key=None, element_type=None, **kwargs):
+        """
+        Retrieve the requested entity by its id. If the entity
+        is a spectrum described in the offset index, it will be retrieved
+        by immediately seeking to the starting position of the entry, otherwise
+        falling back to parsing from the start of the file.
+
+        Parameters
+        ----------
+        elem_id : str
+            The id value of the entity to retrieve.
+        id_key : str, optional
+            The name of the XML attribute to use for lookup.
+            Defaults to :py:attr:`self._default_id_attr`.
+
+        Returns
+        -------
+        dict
+        """
+        try:
+            index = self._offset_index
+            if element_type is None:
+                offset, element_type = index.find_no_type(elem_id)
+            else:
+                offset = index.find(elem_id, element_type)
+            self._source.seek(offset)
+            if id_key is None:
+                id_key = self._indexed_tag_keys.get(element_type)
+            elem = self._find_by_id_no_reset(elem_id, id_key=id_key)
+        except (KeyError, AttributeError, etree.LxmlError):
+            elem = self._find_by_id_reset(elem_id, id_key=id_key)
+        data = self._get_info_smart(elem, **kwargs)
+        return data
+
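+    # Lookup sketch (illustrative; the id value is hypothetical):
+    #     reader.get_by_id('controllerType=0 controllerNumber=1 scan=1')
+    # With an offset index available, this seeks straight to the element
+    # instead of re-parsing the file from the start.
+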
+    def __contains__(self, key):
+        return key in self._offset_index[self._default_iter_tag]
+
+    def __len__(self):
+        return len(self._offset_index[self._default_iter_tag])
+
+    def iterfind(self, path, **kwargs):
+        """Parse the XML and yield info on elements with specified local
+        name or by specified "XPath".
+
+        Parameters
+        ----------
+        path : str
+            Element name or XPath-like expression. The path is very close to
+            full XPath syntax, but local names should be used for all elements in the path.
+            They will be substituted with local-name() checks, up to the (first) predicate.
+            The path can be absolute or "free". Please don't specify namespaces.
+        **kwargs : passed to :py:meth:`self._get_info_smart`.
+
+        Returns
+        -------
+        out : iterator
+        """
+        if path in self._indexed_tags and self._use_index:
+            return IndexedIterfind(self, path, **kwargs)
+        return Iterfind(self, path, **kwargs)
+
+
+class MultiProcessingXML(IndexedXML, TaskMappingMixin):
+    """XML reader that feeds indexes to external processes
+    for parallel parsing and analysis of XML entries."""
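+
+    A hedged usage sketch; the reader subclass and worker function are
+    hypothetical, and :meth:`map` is assumed to be provided by
+    :class:`TaskMappingMixin`::
+
+        with SomeReader('data.xml') as reader:
+            for result in reader.map(work_function):
+                handle(result)
+    """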
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iteratable` to use when dealing work items onto the input IPC
+        queue used by :meth:`map`
+
+        Returns
+        -------
+        :class:`Iteratable`
+        """
+        return iter(self._offset_index[self._default_iter_tag])
+
+
+class IndexSavingXML(IndexSavingMixin, IndexedXML):
+    """An extension to the IndexedXML type which
+    adds facilities to read and write the byte offset
+    index externally.
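+
+    A hedged sketch; ``SomeReader`` is a hypothetical subclass, and the
+    persistence helpers are assumed to be provided by
+    :class:`IndexSavingMixin`::
+
+        SomeReader.prebuild_byte_offset_file('data.xml')  # write the index
+        reader = SomeReader('data.xml')  # may load the saved index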
+    """
+    _index_class = HierarchicalOffsetIndex
+
+    def _read_byte_offsets(self):
+        """Read the byte offset index JSON file at :attr:`_byte_offset_filename`
+        and populate :attr:`_offset_index`.
+        """
+        with open(self._byte_offset_filename, 'r') as f:
+            index = self._index_class.load(f)
+            if index.schema_version is None:
+                raise TypeError("Legacy Offset Index!")
+            self._offset_index = index
+
+
+class Iterfind(object):
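+    """Lazy, restartable iterator over entries matching a tag name,
+    as returned by :meth:`IndexedXML.iterfind`.
+
+    A hedged sketch of random access (``reader`` and the tag name are
+    hypothetical); without an index, each lookup re-parses from the
+    start of the file::
+
+        it = reader.iterfind('some_tag')
+        first = it[0]
+        chunk = it[2:5]
+    """
+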
+    def __init__(self, parser, tag_name, **kwargs):
+        self.parser = parser
+        self.tag_name = tag_name
+        self.config = kwargs
+        self._iterator = None
+
+    def __repr__(self):
+        template = "{self.__class__.__name__}({self.tag_name!r}{config})"
+        if self.config:
+            config = ", " + repr(self.config)
+        else:
+            config = ''
+        return template.format(self=self, config=config)
+
+    def __iter__(self):
+        return self
+
+    def _make_iterator(self):
+        return self.parser._iterfind_impl(self.tag_name, **self.config)
+
+    def __next__(self):
+        if self._iterator is None:
+            self._iterator = self._make_iterator()
+        return next(self._iterator)
+
+    def next(self):
+        return self.__next__()
+
+    @property
+    def is_indexed(self):
+        return False
+
+    def reset(self):
+        self._iterator = None
+        self.parser.reset()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.reset()
+
+    def map(self, *args, **kwargs):
+        raise NotImplementedError(
+            "This query is not indexed; it cannot be mapped with multiprocessing")
+
+    def _get_by_index(self, idx):
+        # Without an index, restart parsing and skip ahead to the
+        # requested position.
+        self.reset()
+        value = next(islice(self, idx, idx + 1))
+        return value
+
+    def _get_by_slice(self, slc):
+        # Likewise, realize the slice by re-parsing from the beginning.
+        self.reset()
+        value = list(islice(self, slc.start, slc.stop, slc.step))
+        return value
+
+    def __getitem__(self, i):
+        if isinstance(i, slice):
+            return self._get_by_slice(i)
+        return self._get_by_index(i)
+
+
+class IndexedIterfind(TaskMappingMixin, Iterfind):
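+    """An :class:`Iterfind` specialization that retrieves entries through
+    the byte offset index, providing fast random access and parallel
+    retrieval via :class:`TaskMappingMixin`.
+
+    A hedged sketch (``reader`` and the tag name are hypothetical);
+    indexed lookups seek directly to the stored byte offsets::
+
+        it = reader.iterfind('some_indexed_tag')
+        n = len(it)      # number of entries in the index for this tag
+        entry = it[0]    # resolved through get_by_id
+    """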
+
+    def __init__(self, parser, tag_name, **kwargs):
+        TaskMappingMixin.__init__(self, **kwargs)
+        Iterfind.__init__(self, parser, tag_name, **kwargs)
+
+    def _task_map_iterator(self):
+        """Returns the :class:`Iteratable` to use when dealing work items onto the input IPC
+        queue used by :meth:`map`
+
+        Returns
+        -------
+        :class:`Iteratable`
+        """
+        return iter(self._index)
+
+    @property
+    def _offset_index(self):
+        return self._index
+
+    @property
+    def _index(self):
+        return self.parser.index[self.tag_name]
+
+    def _get_reader_for_worker_spec(self):
+        return self.parser
+
+    def _yield_from_index(self):
+        for key in self._task_map_iterator():
+            yield self.parser.get_by_id(key, **self.config)
+
+    def _make_iterator(self):
+        if self.is_indexed:
+            return self._yield_from_index()
+        warnings.warn("Non-indexed iterator created from %r" % (self,))
+        return super(IndexedIterfind, self)._make_iterator()
+
+    @property
+    def is_indexed(self):
+        index = getattr(self.parser, 'index', None)
+        if isinstance(index, HierarchicalOffsetIndex):
+            return bool(self.tag_name in index and index[self.tag_name])
+        return False
+
+    def _get_by_index(self, idx):
+        index = self._index
+        key = index.from_index(idx)
+        return self.parser.get_by_id(key)
+
+    def _get_by_slice(self, slc):
+        index = self._index
+        keys = index.from_slice(slc)
+        return self.parser.get_by_ids(keys)
+
+    def __len__(self):
+        index = self._index
+        return len(index)