diff --git a/dlomix/__init__.py b/dlomix/__init__.py deleted file mode 100644 index 4f3165727c670bb8dbf089e6a78f00c7e5d25b05..0000000000000000000000000000000000000000 --- a/dlomix/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -__version__ = "0.0.6" - -META_DATA = { - "author": "Omar Shouman", - "author_email": "o.shouman@tum.de", - "description": "Deep Learning for Proteomics", - "package_name": "DLOmix", - "copyright_text": "2023, Wilhelm Lab, TU Munich.", - "github_url": "https://github.com/wilhelm-lab/dlomix", -} diff --git a/dlomix/constants.py b/dlomix/constants.py deleted file mode 100644 index e771b3c7f0ad71563c36e9c19b3e024b76807f0a..0000000000000000000000000000000000000000 --- a/dlomix/constants.py +++ /dev/null @@ -1,100 +0,0 @@ -DEFAULT_PARQUET_ENGINE = "pyarrow" - -retention_time_pipeline_parameters = { - "model_params": {"seq_length": 30}, - "data_params": { - "seq_length": 30, - }, - "trained_model_path": "../pretrained_models/retention_time/example_rtmodel/", - "trained_model_zipfile_name": "rtmodel.zip", - "trained_model_stats": [0.0, 1.0], -} - -retention_time_pipeline_parameters.update( - { - "trained_model_url": "https://raw.githubusercontent.com/wilhelm-lab/dlomix/develop" - + retention_time_pipeline_parameters["trained_model_path"].strip("..") - + retention_time_pipeline_parameters["trained_model_zipfile_name"] - } -) - -ALPHABET_UNMOD = { - "A": 1, - "C": 2, - "D": 3, - "E": 4, - "F": 5, - "G": 6, - "H": 7, - "I": 8, - "K": 9, - "L": 10, - "M": 11, - "N": 12, - "P": 13, - "Q": 14, - "R": 15, - "S": 16, - "T": 17, - "V": 18, - "W": 19, - "Y": 20, -} - -# relevant for feature extraction for PTMs, only for reference -ALPHABET_PTMS = { - "A": 1, - "C": 2, - "D": 3, - "E": 4, - "F": 5, - "G": 6, - "H": 7, - "I": 8, - "K": 9, - "L": 10, - "M": 11, # amino acids - "N": 12, - "P": 13, - "Q": 14, - "R": 15, - "S": 16, - "T": 17, - "V": 18, - "W": 19, - "Y": 20, - "[]-": 21, - "-[]": 22, # termini - "M[UNIMOD:35]": 23, - "S[UNIMOD:21]": 24, - "T[UNIMOD:21]": 25, - "Y[UNIMOD:21]": 26, - "R[UNIMOD:7]": 27, - "K[UNIMOD:1]": 28, - "K[UNIMOD:121]": 29, - "Q(gl)": 30, - "R[UNIMOD:34]": 31, - "K[UNIMOD:34]": 32, - "T(ga)": 33, - "S(ga)": 34, - "T(gl)": 35, - "S(gl)": 36, - "C[UNIMOD:4]": 37, - "E(gl)": 39, - "[ac]-": 38, - "K[UNIMOD:36]": 40, - "K[UNIMOD:37]": 41, - "K[UNIMOD:122]": 42, - "K[UNIMOD:58]": 43, - "K[UNIMOD:1289]": 44, - "K[UNIMOD:747]": 45, - "K[UNIMOD:64]": 46, - "K[UNIMOD:1848]": 47, - "K[UNIMOD:1363]": 48, - "K[UNIMOD:1849]": 49, - "K[UNIMOD:3]": 50, - "R[UNIMOD:36]": 51, - "R[UNIMOD:36a]": 52, - "P[UNIMOD:35]": 53, - "Y[UNIMOD:354]": 54, -} diff --git a/dlomix/data/AbstractDataset.py b/dlomix/data/AbstractDataset.py deleted file mode 100644 index 1117ce71a5e508db12fbed42989b4b0012fb6266..0000000000000000000000000000000000000000 --- a/dlomix/data/AbstractDataset.py +++ /dev/null @@ -1,398 +0,0 @@ -import abc -from os.path import abspath, dirname - -import numpy as np -import pandas as pd -import tensorflow as tf - -from ..constants import DEFAULT_PARQUET_ENGINE -from ..utils import lower_and_trim_strings -from .parsers import ProformaParser -from .reader_utils import read_json_file, read_parquet_file_pandas - -# what characterizes a datasets --> -# 1. reading mode (string, CSV, json, parquet, in-memory, etc..) -# 2. inputs (define sequence column name and additional existing feature names) -# 3. features to extract --> abstracted out in featureextractors list -# 4. outputs --> targets to use (names of column or key name in a dict) - -# 1. 
identify reading mode
-# and call a static reader class that takes a data source and returns a DataFrame (later consider other data structures)
-# 2. pick inputs from the data after the reader has finished, maintain the inputs dict
-# 3. pick targets from the data after the reader has finished, maintain the targets dict
-# 4. run feature extractors based on input sequences, maintain features dict
-# 5. build TF Datasets accordingly
-
-# Consider collecting member variables related to the sequences in a named tuple (sequence, mod, n_term, c_term, etc.)
-
-# consider making the dataset object iterable --> iterate over the main split tf dataset
-
-
-class AbstractDataset(abc.ABC):
-    r"""Base class for datasets.
-
-    Parameters
-    -----------
-    data_source : str, tuple of two numpy.ndarray, numpy.ndarray, optional
-        source can be a tuple of two arrays (sequences, targets), a single array (sequences), useful for test data, or a str with a file path to a csv file. Defaults to None.
-    sep : str, optional
-        separator to be used if the data source is a CSV file. Defaults to ",".
-    sequence_col : str, optional
-        name of the column containing the sequences in the provided CSV. Defaults to "sequence".
-    target_col : str, optional
-        name of the column containing the targets (indexed retention time). Defaults to "irt".
-    feature_cols : list, optional
-        a list of columns containing other features that can be used later as inputs to a model. Defaults to None.
-    seq_length : int, optional
-        the sequence length to be used, where all sequences will be padded to this length; longer sequences will be removed, not truncated. Defaults to 0.
-    parser : str, optional
-        name of the parser to use. Available parsers are in `dlomix.data.parsers.py`. Defaults to None; no parsing is done on the sequence (works for unmodified sequences).
-    features_to_extract : list(dlomix.data.feature_extractors.SequenceFeatureExtractor), optional
-        a list of feature extractor objects. Defaults to None; no features to extract.
-    batch_size : int, optional
-        the batch size to be used for consuming the dataset in training a model. Defaults to 32.
-    val_ratio : float, optional
-        a fraction to determine the size of the validation data (0.2 = 20%). Defaults to 0.
-    seed : int, optional
-        a seed to use for splitting the data to allow for a reproducible split. Defaults to 21.
-    test : bool, optional
-        a boolean indicating whether the dataset is a test dataset or not. Defaults to False.
-    sample_run : bool, optional
-        a boolean to limit the number of examples to a small number, SAMPLE_RUN_N, for testing and debugging purposes. Defaults to False.
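These constructor parameters are exercised by the concrete subclasses further down in this diff. A minimal, hypothetical sketch of how they were used (the CSV path and its column layout are assumptions for illustration, not part of the removed code):

```python
from dlomix.data import RetentionTimeDataset

# "train.csv" is a hypothetical file with "sequence" and "irt" columns.
rtdata = RetentionTimeDataset(
    data_source="train.csv",
    seq_length=30,   # sequences longer than 30 are dropped, not truncated
    batch_size=64,
    val_ratio=0.2,   # 20% of the examples go to the validation split
    seed=21,         # reproducible split
)
train_ds = rtdata.train_data  # batched, prefetched tf.data.Dataset
val_ds = rtdata.val_data
```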
- """ - - ATOM_TABLE = None - SPLIT_NAMES = ["train", "val", "test"] - BATCHES_TO_PREFETCH = tf.data.AUTOTUNE - - SAMPLE_RUN_N = 100 - METADATA_KEY = "metadata" - PARAMS_KEY = "parameters" - ANNOTATIONS_KEY = "annotations" - TARGET_NAME_KEY = "target_column_key" - SEQUENCE_COLUMN_KEY = "sequence_column_key" - - def __init__( - self, - data_source, - sep, - sequence_col, - target_col, - feature_cols=None, - seq_length=0, - parser=None, - features_to_extract=None, - batch_size=32, - val_ratio=0, - path_aminoacid_atomcounts=None, - seed=21, - test=False, - sample_run=False, - ): - super(AbstractDataset, self).__init__() - np.random.seed(seed) - - self.seed = seed - np.random.seed(self.seed) - - self.data_source = data_source - self.sep = sep - self.sequence_col = sequence_col.lower() - self.target_col = target_col.lower() - - if feature_cols: - self.feature_cols = lower_and_trim_strings(feature_cols) - else: - self.feature_cols = [] - - self.sample_run = sample_run - - # if seq_length is 0 (default) -> no padding - self.seq_length = seq_length - self.parser = parser - self.features_to_extract = features_to_extract - - self._data_mean = 0 - self._data_std = 1 - - self.batch_size = batch_size - self.val_ratio = val_ratio - self.testing_mode = test - - # main split is "train" if not in testing mode, otherwise "test" - self.main_split = ( - AbstractDataset.SPLIT_NAMES[0] - if not self.testing_mode - else AbstractDataset.SPLIT_NAMES[2] - ) - - # initialize TF Datasets dict - self.tf_dataset = ( - {self.main_split: None, AbstractDataset.SPLIT_NAMES[1]: None} - if val_ratio != 0 - else {self.main_split: None} - ) - - self.indicies_dict = ( - {self.main_split: None, AbstractDataset.SPLIT_NAMES[1]: None} - if val_ratio != 0 - else {self.main_split: None} - ) - - # if path to counts lookup table is provided, include count features, otherwise not - self.include_count_features = True if path_aminoacid_atomcounts else False - - if self.include_count_features: - self.aminoacid_atom_counts_csv_path = ( - path_aminoacid_atomcounts # "../lookups/aa_comp_rel.csv" - ) - self._init_atom_table() - - self._resolve_parser() - - self.sequences = None - self.unmodified_sequences = None - self.modifications = None - self.n_term_modifications = None - self.c_term_modifications = None - - self.sequence_features = None - self.sequence_features_names = None - - def _resolve_parser(self): - if self.parser is None: - return - elif self.parser == "proforma": - self.parser = ProformaParser() - else: - raise ValueError( - f"Invalid parser provided {self.parser}. 
For a list of available parsers, check dlomix.data.parsers.py"
-            )
-
-    def _parse_sequences(self):
-        (
-            self.sequences,
-            self.modifications,
-            self.n_term_modifications,
-            self.c_term_modifications,
-        ) = self.parser.parse_sequences(self.sequences)
-
-    def _resolve_string_data_path(self):
-        is_json_file = self.data_source.endswith(".json")
-
-        if is_json_file:
-            json_file_base_dir = dirname(abspath(self.data_source))
-            self.data_source = read_json_file(self.data_source)
-            self._update_data_loading_for_json_format(json_file_base_dir)
-
-        is_parquet_url = ".parquet" in self.data_source and self.data_source.startswith(
-            "http"
-        )
-        is_parquet_file = self.data_source.endswith(".parquet")
-        is_csv_file = self.data_source.endswith(".csv")
-
-        if is_parquet_url or is_parquet_file:
-            df = read_parquet_file_pandas(self.data_source, DEFAULT_PARQUET_ENGINE)
-            return df
-        elif is_csv_file:
-            df = pd.read_csv(self.data_source)
-            return df
-        else:
-            raise ValueError(
-                "Invalid data source provided as a string, please provide a path to a csv, parquet, "
-                "or a json file."
-            )
-
-    def _extract_features(self):
-        if self.features_to_extract:
-            self.sequence_features = []
-            self.sequence_features_names = []
-            for feature_class in self.features_to_extract:
-                print("Extracting feature: ", feature_class)
-                extractor_class = feature_class
-                feature_array = np.array(
-                    extractor_class.extract_all(
-                        self.sequences,
-                        self.modifications,
-                        self.seq_length if extractor_class.pad_to_seq_length else 0,
-                    ),
-                    dtype=np.float32,
-                )
-                # ensure an extra (1) dimension is added for later concatenation
-                # this could also be done later in TensorFlow inside the model
-                # possible feature shapes: (BATCH X SEQ_LENGTH X 6), (BATCH X SEQ_LENGTH X 1)
-                if (
-                    feature_array.ndim < 3
-                    and feature_array.shape[-1] == self.seq_length
-                ):
-                    feature_array = np.expand_dims(feature_array, axis=-1)
-                self.sequence_features.append(feature_array)
-                self.sequence_features_names.append(
-                    extractor_class.__class__.__name__.lower()
-                )
-
-    def _reshape_sequence_feature_arrays(self):
-        pass
-
-    def get_examples_at_indices(self, examples, split):
-        if isinstance(examples, np.ndarray):
-            return examples[self.indicies_dict[split]]
-        # to handle features
-        if isinstance(examples, list):
-            return [
-                examples_single[self.indicies_dict[split]]
-                for examples_single in examples
-            ]
-        raise ValueError(
-            f"Provided data structure to subset for examples at split indices is neither a list nor a numpy array, but rather a {type(examples)}."
-        )
-
-    def _init_atom_table(self):
-        atom_counts = pd.read_csv(self.aminoacid_atom_counts_csv_path)
-        atom_counts = atom_counts.astype(str)
-
-        keys_tensor = tf.constant(atom_counts["aa"].values)
-        values_tensor = tf.constant(
-            ["_".join(c) for c in list(atom_counts.iloc[:, 1:].values)]
-        )
-        init = tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor)
-        AbstractDataset.ATOM_TABLE = tf.lookup.StaticHashTable(
-            init, default_value="0_0_0_0_0"
-        )
-
-    @abc.abstractmethod
-    def load_data(self, data):
-        """Load data from the source and populate the numpy arrays used for the tf.Dataset.
-
-        Args:
-            data (str, tuple, dict): Path to a csv or parquet file, a tuple of numpy arrays, or a dict with the keys
-            `AbstractDataset.METADATA_KEY`, `AbstractDataset.PARAMS_KEY`,
-            `AbstractDataset.TARGET_NAME_KEY`, `AbstractDataset.SEQUENCE_COLUMN_KEY`.
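For the dict variant accepted here, the smoke test at the bottom of `RetentionTimeDataset.py` (further down in this diff) shows the expected layout:

```python
# Layout of an in-memory dict data source; the top-level keys correspond to
# AbstractDataset.METADATA_KEY, ANNOTATIONS_KEY, and PARAMS_KEY.
data_dict = {
    "metadata": {
        "linear rt": [1, 2, 3],
        "modified_sequence": ["ABC", "ABC", "ABC"],
    },
    "annotations": {},
    "parameters": {"target_column_key": "linear rt"},
}
```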
- """ - raise NotImplementedError("Not implemented") - - @abc.abstractmethod - def _update_data_loading_for_json_format(self, base_dir): - raise NotImplementedError("Not implemented") - - @abc.abstractmethod - def _build_tf_dataset(self): - """Build the tf.Dataset object for available splits using the data loaded by `load_data`. - Example: - `for split in self.tf_dataset.keys(): - self.tf_dataset[split] = tf.data.Dataset.from_tensor_slices( - (self.inputs, self.outputs) - )` - """ - raise NotImplementedError("Not implemented") - - @abc.abstractmethod - def _preprocess_tf_dataset(self): - """Add processing logic (tensorflow functions) to apply to all tf.Datasets.""" - raise NotImplementedError("Not implemented") - - @abc.abstractmethod - def get_split_targets(self, split="val"): - """Retrieve all targets (original labels) for a specific split (dependent on the task at hand) - Args: - split (str, optional): Name of the split, check `AbstractDataset.SPLIT_NAMES`. Defaults to "val". - """ - raise NotImplementedError("Not implemented") - - @staticmethod - @abc.abstractmethod - def _convert_inputs_to_dict(inputs, target): - """Collect all inputs into a python dict with corresponding keys. - When multiple inputs are used,this function is used at the beginning of the pre-processing - of TF.Datasets. - - Args: - inputs (tuple(tf.Tensor)): tuple of input tensors - target (tf.Tensor): target label tensor - """ - raise NotImplementedError("Not implemented") - - def _pad_sequences(self, inputs, target): - if isinstance(inputs, dict): - inputs["sequence"] = self._pad_seq(inputs["sequence"]) - return inputs, target - else: - return self._pad_seq(inputs), target - - def _pad_seq(self, seq): - pad_len = tf.abs(self.seq_length - tf.size(seq)) - paddings = tf.concat([[0], [pad_len]], axis=0) - seq = tf.pad(seq, [paddings], "CONSTANT") - seq.set_shape([self.seq_length]) - return seq - - def _split_sequence(self, inputs, target): - if isinstance(inputs, dict): - inputs["sequence"] = tf.strings.bytes_split(inputs["sequence"]) - return inputs, target - else: - inputs = tf.strings.bytes_split(inputs) - return inputs, target - - def _generate_single_counts(self, inputs, target): - inputs["counts"] = tf.map_fn( - lambda x: AbstractDataset.ATOM_TABLE.lookup(x), inputs["sequence"] - ) - inputs["counts"] = tf.map_fn( - lambda x: tf.strings.split(x, sep="_"), inputs["counts"] - ) - inputs["counts"] = tf.strings.to_number(inputs["counts"]) - inputs["counts"].set_shape([self.seq_length, 5]) - - return inputs, target - - def _generate_di_counts(self, inputs, target): - # add every two neighboring elements without overlap [0 0 1 1 2 2 .... 
pad_length/2 pad_length/2]
-        segments_to_add = [i // 2 for i in range(self.seq_length)]
-        inputs["di_counts"] = tf.math.segment_sum(
-            inputs["counts"], tf.constant(segments_to_add)
-        )
-        inputs["di_counts"].set_shape([self.seq_length // 2, 5])
-
-        return inputs, target
-
-    def _get_tf_dataset(self, split=None):
-        assert (
-            split in self.tf_dataset.keys()
-        ), f"Requested data split {split} is not available, available splits are {self.tf_dataset.keys()}"
-        if split in self.tf_dataset.keys():
-            return self.tf_dataset[split]
-        return self.tf_dataset
-
-    @property
-    def train_data(self):
-        """TensorFlow Dataset object for the training data"""
-        return self._get_tf_dataset(AbstractDataset.SPLIT_NAMES[0])
-
-    @property
-    def val_data(self):
-        """TensorFlow Dataset object for the validation data"""
-        return self._get_tf_dataset(AbstractDataset.SPLIT_NAMES[1])
-
-    @property
-    def test_data(self):
-        """TensorFlow Dataset object for the test data"""
-        return self._get_tf_dataset(AbstractDataset.SPLIT_NAMES[2])
-
-    @property
-    def data_mean(self):
-        """Mean value of the targets"""
-        return self._data_mean
-
-    @property
-    def data_std(self):
-        """Standard deviation value of the targets"""
-        return self._data_std
-
-    @data_mean.setter
-    def data_mean(self, value):
-        self._data_mean = value
-
-    @data_std.setter
-    def data_std(self, value):
-        self._data_std = value
diff --git a/dlomix/data/IntensityDataset.py b/dlomix/data/IntensityDataset.py
deleted file mode 100644
index 1ad9e7dd11235980c89397d7ba2359ecd0493f62..0000000000000000000000000000000000000000
--- a/dlomix/data/IntensityDataset.py
+++ /dev/null
@@ -1,385 +0,0 @@
-from os.path import dirname, join
-
-import numpy as np
-import tensorflow as tf
-
-from ..utils import convert_nested_list_to_numpy_array, flatten_dict_for_values
-from .AbstractDataset import AbstractDataset
-
-# take into consideration whether the pandas dataframe is pickled, and then call read_pickle instead of read_csv
-# allow the possibility to have three different dataset objects, one for train, val, and test
-
-
-class IntensityDataset(AbstractDataset):
-    r"""A dataset class for intensity prediction tasks. It initializes a dataset object wrapping tf.Dataset and some relevant preprocessing steps.
-
-    Parameters
-    -----------
-    data_source : str, tuple of two numpy.ndarray, numpy.ndarray, optional
-        source can be a tuple of two arrays (sequences, targets), a single array (sequences), useful for test data, or a str with a file path to a csv file. Defaults to None.
-    sep : str, optional
-        separator to be used if the data source is a CSV file. Defaults to ",".
-    sequence_col : str, optional
-        name of the column containing the sequences in the provided CSV. Defaults to "sequence".
-    target_col : str, optional
-        name of the column containing the targets (vector of intensities). Defaults to "intensities".
-    feature_cols : list, optional
-        a list of columns containing other features that can be used later as inputs to a model. Defaults to None.
-    normalize_targets : bool, optional
-        a boolean indicating whether to normalize the targets (subtract the mean and divide by the standard deviation). Defaults to False.
-    seq_length : int, optional
-        the sequence length to be used, where all sequences will be padded to this length; longer sequences will be removed, not truncated. Defaults to 0.
-    parser : subclass of AbstractParser, optional
-        the parser to use to split amino acids and modifications.
For more information, please see `dlomix.data.parsers`.
-    batch_size : int, optional
-        the batch size to be used for consuming the dataset in training a model. Defaults to 32.
-    val_ratio : float, optional
-        a fraction to determine the size of the validation data (0.2 = 20%). Defaults to 0.
-    seed : int, optional
-        a seed to use for splitting the data to allow for a reproducible split. Defaults to 21.
-    test : bool, optional
-        a boolean indicating whether the dataset is a test dataset or not. Defaults to False.
-    path_aminoacid_atomcounts : str, optional
-        a string with a path to a CSV table with the atom counts of the different amino acids (can be used for feature extraction). Defaults to None.
-    sample_run : bool, optional
-        a boolean to limit the number of examples to a small number, SAMPLE_RUN_N, for testing and debugging purposes. Defaults to False.
-    metadata_filtering_criteria : dict, optional
-        a dictionary with the filtering criteria (column names and conditions) to be used to filter the metadata. Defaults to None.
-    """
-
-    # TODO: For test dataset --> examples with longer sequences --> do not drop, add NaN for prediction
-
-    def __init__(
-        self,
-        data_source=None,
-        sep=",",
-        sequence_col="sequence",
-        collision_energy_col="collision_energy_aligned_normed",
-        precursor_charge_col="precursor_charge_onehot",
-        intensities_col="intensities",
-        feature_cols=None,
-        normalize_targets=False,
-        seq_length=0,
-        parser=None,
-        features_to_extract=None,
-        batch_size=32,
-        val_ratio=0,
-        seed=21,
-        test=False,
-        path_aminoacid_atomcounts=None,
-        sample_run=False,
-        metadata_filtering_criteria=None,
-    ):
-        super().__init__(
-            data_source,
-            sep,
-            sequence_col,
-            intensities_col,
-            feature_cols,
-            seq_length,
-            parser,
-            features_to_extract,
-            batch_size,
-            val_ratio,
-            path_aminoacid_atomcounts,
-            seed,
-            test,
-            sample_run,
-        )
-
-        self.collision_energy_col = collision_energy_col.lower()
-        self.precursor_charge_col = precursor_charge_col.lower()
-        self.intensities_col = self.target_col
-
-        self.metadata_filtering_criteria = metadata_filtering_criteria
-
-        self.normalize_targets = normalize_targets
-
-        self.no_intensities = self.testing_mode
-
-        self.sequences = None
-        self.collision_energy = None
-        self.precursor_charge = None
-        self.intensities = None
-
-        self.features_df = None
-        self.example_id = None
-
-        # if data is provided with the constructor call --> load, otherwise --> done
-        if self.data_source is not None:
-            self.load_data(data=data_source)
-
-    def load_data(self, data):
-        """Load data into the dataset object; can be used to load data at a later point after initialization.
-        This function triggers the whole pipeline of: data loading, validation (against sequence length), splitting, building TensorFlow dataset objects, and applying preprocessing.
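A hedged sketch of the in-memory tuple variant that `_read_data` below accepts, `(sequences, collision_energy, precursor_charge, intensities)`; all shapes and values here are illustrative assumptions, not part of the removed code:

```python
import numpy as np

from dlomix.data import IntensityDataset

sequences = np.array(["ACDEFGHIK", "LMNPQRSTV"])
collision_energy = np.array([0.25, 0.30])                # aligned, normalized CE
precursor_charge = np.eye(6, dtype=np.float32)[[1, 2]]   # one-hot encoded charges
intensities = np.random.rand(2, 174).astype(np.float32)  # illustrative vector length

intdata = IntensityDataset(
    data_source=(sequences, collision_energy, precursor_charge, intensities),
    seq_length=30,
    batch_size=2,
)
```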
- - :param data: a `str` with a file path to csv file - :return: None - """ - self.data_source = data - - self._read_data() - # consider removing lengthy sequences when no parser is passed - - # Numpy & Pandas - if self.parser: - self._parse_sequences() - self._validate_remove_long_sequences() - if self.features_to_extract: - self._extract_features() - self._split_data() - - # TF.Dataset - self._build_tf_dataset() - self._preprocess_tf_dataset() - - """ - numpy array --> either a tuple or a single array - - Tuple --> means (sequences, collision_energy, precursor_charge, intensities) - - single ndarray --> means sequences only, useful for test dataset - str --> path to csv file or compressed csv file - """ - - def _read_data(self): - if isinstance(self.data_source, tuple): - tuple_size_is_three_or_four = ( - len(self.data_source) == 3 or len(self.data_source) == 4 - ) - if tuple_size_is_three_or_four: - tuple_elements_are_ndarray = all( - [isinstance(x, np.ndarray) for x in self.data_source] - ) - if tuple_elements_are_ndarray: - self.sequences = self.data_source[0] - self.collision_energy = self.data_source[1] - self.precursor_charge = self.data_source[2] - if len(self.data_source) == 4: - self.intensities = self.data_source[3] - self.no_intensities = False - - else: - self.intensities = np.zeros() - self.no_intensities = True - else: - raise ValueError( - "If a tuple is provided, it has to have a length of 4 and all elements should be numpy arrays." - ) - elif isinstance(self.data_source, str): - df = self._resolve_string_data_path() - - # used only for testing with a smaller sample from a csv file - if self.sample_run: - df = df.head(IntensityDataset.SAMPLE_RUN_N) - - # lower all column names - df.columns = [col_name.lower() for col_name in df.columns] - - # retrieve columns from the dataframe - self.sequences = df[self.sequence_col] - self.collision_energy = df[self.collision_energy_col] - self.precursor_charge = df[self.precursor_charge_col] - self.intensities = df[self.intensities_col] - - # parse strings into lists, for precursor charge and intensities - if isinstance(self.precursor_charge.iloc[0], str): - self.precursor_charge = self.precursor_charge.apply(eval) - - if isinstance(self.intensities.iloc[0], str): - self.intensities = self.intensities.apply(eval) - - # get numpy arrays with .values() for all inputs and intensities - - self.sequences = self.sequences.values - - # for concatenation later, we expand dimensions - self.collision_energy = self.collision_energy.values.reshape(-1, 1) - - self.precursor_charge = convert_nested_list_to_numpy_array( - self.precursor_charge.values, dtype=np.float32 - ) - self.intensities = convert_nested_list_to_numpy_array( - self.intensities.values - ) - - self.features_df = df[self.feature_cols] - else: - raise ValueError( - "Data source has to be either a tuple of four numpy arrays," - "or a string path to a csv file." 
- ) - - # give the index of the element as an ID for later reference if needed - self.example_id = list(range(len(self.sequences))) - - def _update_data_loading_for_json_format(self, base_dir=None): - import prospectdataset as prospect - - json_dict = self.data_source - meta_data_filepath = json_dict.get(IntensityDataset.METADATA_KEY, "") - annotation_data_value = json_dict.get(IntensityDataset.ANNOTATIONS_KEY, "") - annotations_filepaths = flatten_dict_for_values(annotation_data_value) - - # meta data file is assumed to be in the same path as the json input file - if base_dir: - meta_data_filepath = join(base_dir, meta_data_filepath) - annotations_filepaths = [ - join(base_dir, file) for file in annotations_filepaths - ] - - # all annotation files are assumed to be in the same directory - if len(annotations_filepaths) > 0: - annotations_dir = dirname(annotations_filepaths[0]) - else: - raise ValueError( - "No paths to annotation files were provided in the JSON file." - ) - - # ToDo: consider options to check if the files were processed earlier and skip this step since it is time consuming - - # to pass metadata_filtering_criteria - - print("Optionally Downloading and processing the data...") - print("Annotations directory: ", annotations_dir) - - # fix directory path, use file names from the json file ??? - print("Metadata filepath: ", meta_data_filepath) - print("Base directory: ", base_dir) - - self.data_source = prospect.download_process_pool( - annotations_data_dir=annotations_dir, - metadata_path=meta_data_filepath, - save_filepath=join(base_dir, "processed_pool.parquet"), - metadata_filtering_criteria=self.metadata_filtering_criteria, - ) - - self.intensities_col = json_dict.get(IntensityDataset.PARAMS_KEY, {}).get( - IntensityDataset.TARGET_NAME_KEY, self.intensities_col - ) - # ToDo: make dynamic based on parameters - self.sequence_col = "modified_sequence" - - def _validate_remove_long_sequences(self) -> None: - """ - Validate if all sequences are shorter than the padding length, otherwise drop them. - """ - assert self.sequences.shape[0] > 0, "No sequences in the provided data." - - # check if count of examples matches for all provided inputs - lengths = [ - len(self.sequences), - len(self.collision_energy), - len(self.precursor_charge), - ] - if not self.no_intensities: - lengths = lengths + [len(self.intensities)] - - assert np.all( - lengths == np.array(lengths[0]) - ), "Count of examples does not match for sequences and targets." 
-
-        limit = self.seq_length
-        vectorized_len = np.vectorize(lambda x: len(x))
-        mask = vectorized_len(self.sequences) <= limit
-        self.sequences = self.sequences[mask]
-        self.collision_energy = self.collision_energy[mask]
-        self.precursor_charge = self.precursor_charge[mask]
-        self.intensities = self.intensities[mask]
-
-        # once feature columns are introduced, apply the mask to the feature columns (subset the dataframe as well)
-
-    def _split_data(self):
-        n = len(self.sequences)
-
-        if self.val_ratio != 0 and (not self.testing_mode):
-            # add randomization for now and later consider the splitting logic
-            self.indicies_dict[IntensityDataset.SPLIT_NAMES[1]] = np.arange(n)[
-                : int(n * self.val_ratio)
-            ]
-            self.indicies_dict[self.main_split] = np.arange(n)[
-                int(n * self.val_ratio) :
-            ]
-        else:
-            self.indicies_dict[self.main_split] = np.arange(n)
-
-    def _build_tf_dataset(self):
-        input_dict = {}
-
-        for split in self.tf_dataset.keys():
-            input_dict["sequence"] = self.get_examples_at_indices(self.sequences, split)
-            if self.features_to_extract:
-                for feature_name, feature_values in zip(
-                    self.sequence_features_names, self.sequence_features
-                ):
-                    input_dict[feature_name] = self.get_examples_at_indices(
-                        feature_values, split
-                    )
-
-            input_dict["collision_energy"] = self.get_examples_at_indices(
-                self.collision_energy, split
-            )
-            input_dict["precursor_charge"] = self.get_examples_at_indices(
-                self.precursor_charge, split
-            )
-            input_dict["target"] = self.get_examples_at_indices(self.intensities, split)
-
-            self.tf_dataset[split] = tf.data.Dataset.from_tensor_slices(input_dict)
-
-    def _preprocess_tf_dataset(self):
-        # ToDo: convert input to dict and assume this as the general case --> abstract out in parent class
-
-        for split in self.tf_dataset.keys():
-            self.tf_dataset[split] = (
-                self.tf_dataset[split]
-                .map(
-                    IntensityDataset._convert_inputs_to_dict,
-                    num_parallel_calls=tf.data.AUTOTUNE,
-                )
-                .map(
-                    lambda i, t: self._split_sequence(i, t),
-                    num_parallel_calls=tf.data.AUTOTUNE,
-                )
-                .map(
-                    lambda i, t: self._pad_sequences(i, t),
-                    num_parallel_calls=tf.data.AUTOTUNE,
-                )
-            )
-
-            # Here: feature engineering on the fly if needed (atom counts, etc...)
-
-            self.tf_dataset[split] = (
-                self.tf_dataset[split]
-                .batch(self.batch_size)
-                .prefetch(IntensityDataset.BATCHES_TO_PREFETCH)
-            )
-
-    def get_split_targets(self, split="val"):
-        """Retrieve all targets (original labels) for a specific split.
-
-        :param split: a string specifying the split name (train, val, test)
-        :return: nd.array with the targets
-        """
-        if split not in self.indicies_dict.keys():
-            raise ValueError(
-                "requested split does not exist, available splits are: "
-                + list(self.indicies_dict.keys())
-            )
-
-        return self.intensities[self.indicies_dict[split]]
-
-    def denormalize_targets(self, targets):
-        """Denormalize the given targets (can also be predictions) by multiplying by the standard deviation and adding the mean.
-
-        :param targets: an nd.array with targets or predictions
-        :return: a denormalized nd.array with the targets or the predictions
-        """
-        return targets * self._data_std + self._data_mean
-
-    def _normalize_target(self, seq, target):
-        target = tf.math.divide(
-            tf.math.subtract(target, self._data_mean), self._data_std
-        )
-        return seq, target
-
-    @staticmethod
-    def _convert_inputs_to_dict(inputs):
-        return inputs, inputs.pop("target")
diff --git a/dlomix/data/RetentionTimeDataset.py b/dlomix/data/RetentionTimeDataset.py
deleted file mode 100644
index 542927ae46daf07b1d39c479ba7746c914df0cf5..0000000000000000000000000000000000000000
--- a/dlomix/data/RetentionTimeDataset.py
+++ /dev/null
@@ -1,370 +0,0 @@
-from os.path import join
-
-import numpy as np
-import pandas as pd
-import tensorflow as tf
-
-from .AbstractDataset import AbstractDataset
-
-# take into consideration whether the pandas dataframe is pickled, and then call read_pickle instead of read_csv
-# allow the possibility to have three different dataset objects, one for train, val, and test
-
-
-class RetentionTimeDataset(AbstractDataset):
-    r"""A dataset class for retention time prediction tasks. It initializes a dataset object wrapping tf.Dataset and some relevant preprocessing steps.
-
-    Parameters
-    -----------
-    data_source : str, tuple of two numpy.ndarray, numpy.ndarray, optional
-        source can be a tuple of two arrays (sequences, targets), a single array (sequences), useful for test data, or a str with a file path to a csv file. Defaults to None.
-    sep : str, optional
-        separator to be used if the data source is a CSV file. Defaults to ",".
-    sequence_col : str, optional
-        name of the column containing the sequences in the provided CSV. Defaults to "sequence".
-    target_col : str, optional
-        name of the column containing the targets (indexed retention time). Defaults to "irt".
-    feature_cols : list, optional
-        a list of columns containing other features that can be used later as inputs to a model. Defaults to None.
-    normalize_targets : bool, optional
-        a boolean indicating whether to normalize the targets (subtract the mean and divide by the standard deviation). Defaults to False.
-    seq_length : int, optional
-        the sequence length to be used, where all sequences will be padded to this length; longer sequences will be removed, not truncated. Defaults to 0.
-    parser : subclass of AbstractParser, optional
-        the parser to use to split amino acids and modifications. For more information, please see `dlomix.data.parsers`.
-    batch_size : int, optional
-        the batch size to be used for consuming the dataset in training a model. Defaults to 32.
-    val_ratio : float, optional
-        a fraction to determine the size of the validation data (0.2 = 20%). Defaults to 0.
-    seed : int, optional
-        a seed to use for splitting the data to allow for a reproducible split. Defaults to 21.
-    test : bool, optional
-        a boolean indicating whether the dataset is a test dataset or not. Defaults to False.
-    path_aminoacid_atomcounts : str, optional
-        a string with a path to a CSV table with the atom counts of the different amino acids (can be used for feature extraction). Defaults to None.
-    sample_run : bool, optional
-        a boolean to limit the number of examples to a small number, SAMPLE_RUN_N, for testing and debugging purposes. Defaults to False.
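The parser hook, feature extraction, and target normalization documented above combine as follows; a minimal sketch (the parquet path is a hypothetical placeholder):

```python
from dlomix.data import LengthFeature, RetentionTimeDataset

rtdata = RetentionTimeDataset(
    data_source="train.parquet",            # hypothetical file path
    seq_length=30,
    parser="proforma",                      # resolved internally to a ProformaParser
    features_to_extract=[LengthFeature()],  # optional sequence-level features
    normalize_targets=True,
    val_ratio=0.2,
)

val_targets = rtdata.get_split_targets(split="val")  # original-scale labels
# Model predictions on rtdata.val_data can be mapped back to the original
# scale with rtdata.denormalize_targets(predictions).
```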
- """ - - # TODO: For test dataset --> examples with longer sequences --> do not drop, add NaN for prediction - - def __init__( - self, - data_source=None, - sep=",", - sequence_col="sequence", - target_col="irt", - feature_cols=None, - normalize_targets=False, - seq_length=0, - parser=None, - features_to_extract=None, - batch_size=32, - val_ratio=0, - seed=21, - test=False, - path_aminoacid_atomcounts=None, - sample_run=False, - ): - super().__init__( - data_source, - sep, - sequence_col, - target_col, - feature_cols, - seq_length, - parser, - features_to_extract, - batch_size, - val_ratio, - path_aminoacid_atomcounts, - seed, - test, - sample_run, - ) - - self.normalize_targets = normalize_targets - - self.sequences = None - self.targets = None - self.features_df = None - self.example_id = None - - # if data is provided with the constructor call --> load, otherwise --> done - if self.data_source is not None: - self.load_data(data=data_source) - - def load_data(self, data): - """Load data into the dataset object, can be used to load data at a later point after initialization. - This function triggers the whole pipeline of: data loading, validation (against sequence length), splitting, building TensorFlow dataset objects, and apply preprocessing. - - :param data: can be: tuple of two arrays (sequences, targets), single array (sequences), useful for test data, or a `str` with a file path toa csv file - :return: None - """ - self.data_source = data - - self._read_data() - if self.parser: - self._parse_sequences() - self._validate_remove_long_sequences() - if self.features_to_extract: - self._extract_features() - self._split_data() - self._build_tf_dataset() - self._preprocess_tf_dataset() - - """ - numpy array --> either a tuple or a single array - - Tuple --> means (sequences, targets) - - single ndarray --> means sequences only, useful for test dataset - str --> path to csv file or compressed csv file - """ - - def _read_data(self): - if isinstance(self.data_source, dict): - self._update_data_loading_for_json_format() - - if isinstance(self.data_source, tuple): - tuple_size_is_two = len(self.data_source) == 2 - if tuple_size_is_two: - tuple_elements_are_ndarray = isinstance( - self.data_source[0], np.ndarray - ) and isinstance(self.data_source[1], np.ndarray) - if tuple_elements_are_ndarray: - self.sequences = self.data_source[0] - self.targets = self.data_source[1] - else: - raise ValueError( - "If a tuple is provided, it has to have a length of 2 and both elements should be numpy arrays." 
- ) - - elif isinstance(self.data_source, np.ndarray): - self.sequences = self.data_source - self.targets = np.zeros(self.sequences.shape[0]) - self._data_mean, self._data_std = 0, 1 - - elif isinstance(self.data_source, (str, dict)): - if isinstance(self.data_source, dict): - # a dict is passed in-memory via the json - df = pd.DataFrame(self.data_source) - else: - # a string path is passed via the json or as a constructor argument - df = self._resolve_string_data_path() - - # consider sorting to leverage caching when extracting features - # df.sort_values(by=self.sequence_col, inplace=True) - - # used only for testing with a smaller sample from a csv file - if self.sample_run: - df = df.head(RetentionTimeDataset.SAMPLE_RUN_N) - - # lower all column names - df.columns = [col_name.lower() for col_name in df.columns] - - self.sequences, self.targets = ( - df[self.sequence_col].values, - df[self.target_col].values, - ) - self._data_mean, self._data_std = np.mean(self.targets), np.std( - self.targets - ) - - self.features_df = df[self.feature_cols] - else: - raise ValueError( - "Data source has to be either a tuple of two numpy arrays, a single numpy array, " - "or a string with a path to a csv/parquet/json file." - ) - - # give the index of the element as an ID for later reference if needed - self.example_id = list(range(len(self.sequences))) - - def _update_data_loading_for_json_format(self, base_dir=None): - json_dict = self.data_source - - self.data_source = json_dict.get(RetentionTimeDataset.METADATA_KEY, "") - - # meta data file is assumed to be in the same path as the json input file - if base_dir: - self.data_source = join( - base_dir, json_dict.get(RetentionTimeDataset.METADATA_KEY, "") - ) - - self.target_col = json_dict.get(RetentionTimeDataset.PARAMS_KEY, {}).get( - RetentionTimeDataset.TARGET_NAME_KEY, self.target_col - ) - # ToDo: make dynamic based on parameters - self.sequence_col = "modified_sequence" - - def _validate_remove_long_sequences(self) -> None: - """ - Validate if all sequences are shorter than the padding length, otherwise drop them. - """ - if self.sequences.shape[0] <= 0: - raise ValueError( - "No sequences in the provided data or sequences were not parsed correctly." - ) - - if len(self.sequences) != len(self.targets): - raise ValueError( - "Count of examples does not match for sequences and targets." 
- ) - - limit = self.seq_length - vectorized_len = np.vectorize(lambda x: len(x)) - mask = vectorized_len(self.sequences) <= limit - self.sequences, self.targets = self.sequences[mask], self.targets[mask] - self.modifications = self.modifications[mask] - - self.n_term_modifications, self.c_term_modifications = ( - self.n_term_modifications[mask], - self.c_term_modifications[mask], - ) - - # once feature columns are introduced, apply the mask to the feature columns (subset the dataframe as well) - - def _split_data(self): - n = len(self.sequences) - - if self.val_ratio != 0 and (not self.testing_mode): - # add randomization for now and later consider the splitting logic - self.indicies_dict[RetentionTimeDataset.SPLIT_NAMES[1]] = np.arange(n)[ - : int(n * self.val_ratio) - ] - self.indicies_dict[self.main_split] = np.arange(n)[ - int(n * self.val_ratio) : - ] - else: - self.indicies_dict[self.main_split] = np.arange(n) - - def _build_tf_dataset(self): - input_dict = {} - - for split in self.tf_dataset.keys(): - input_dict["sequence"] = self.get_examples_at_indices(self.sequences, split) - - if self.features_to_extract: - for feature_name, feature_values in zip( - self.sequence_features_names, self.sequence_features - ): - input_dict[feature_name] = self.get_examples_at_indices( - feature_values, split - ) - - input_dict["target"] = self.get_examples_at_indices(self.targets, split) - - self.tf_dataset[split] = tf.data.Dataset.from_tensor_slices(input_dict) - - def _preprocess_tf_dataset(self): - for split in self.tf_dataset.keys(): - self.tf_dataset[split] = self.tf_dataset[split].map( - RetentionTimeDataset._convert_inputs_to_dict, - num_parallel_calls=tf.data.AUTOTUNE, - ) - - # avoid normalizing targets for test data --> should not be needed - if self.normalize_targets and not self.testing_mode: - self.tf_dataset[split] = self.tf_dataset[split].map( - lambda s, t: self._normalize_target(s, t), - num_parallel_calls=tf.data.AUTOTUNE, - ) - - self.tf_dataset[split] = ( - self.tf_dataset[split] - .map( - lambda s, t: self._split_sequence(s, t), - num_parallel_calls=tf.data.AUTOTUNE, - ) - .map( - lambda s, t: self._pad_sequences(s, t), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ) - - if self.include_count_features: - self.tf_dataset[split] = ( - self.tf_dataset[split] - .map( - RetentionTimeDataset._convert_inputs_to_dict, - num_parallel_calls=tf.data.AUTOTUNE, - ) - .map( - lambda s, t: self._generate_single_counts(s, t), - num_parallel_calls=tf.data.AUTOTUNE, - ) - .map( - lambda s, t: self._generate_di_counts(s, t), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ) - - self.tf_dataset[split] = ( - self.tf_dataset[split] - .batch(self.batch_size) - .prefetch(RetentionTimeDataset.BATCHES_TO_PREFETCH) - ) - - def get_split_targets(self, split="val"): - """Retrieve all targets (original labels) for a specific split. - - :param split: a string specifiying the split name (train, val, test) - :return: nd.array with the targets - """ - if split not in self.indicies_dict.keys(): - raise ValueError( - "requested split does not exist, availabe splits are: " - + list(self.indicies_dict.keys()) - ) - - return self.targets[self.indicies_dict[split]] - - def denormalize_targets(self, targets): - """Denormalize the given targets (can also be predictions) by multiplying the standard deviation and adding the mean. 
- - :param targets: an nd.array with targets or predictions - :return: a denormalized nd.array with the targets or the predictions - """ - if self.normalize_targets: - return targets * self._data_std + self._data_mean - else: - return targets - - def _normalize_target(self, seq, target): - target = tf.math.divide( - tf.math.subtract(target, self._data_mean), self._data_std - ) - return seq, target - - """ - if more than one input is added, inputs are added to a python dict, the following methods assume that - """ - - @staticmethod - def _convert_inputs_to_dict(inputs): - return inputs, inputs.pop("target") - - -if __name__ == "__main__": - test_data_dict = { - "metadata": { - "linear rt": [1, 2, 3], - "modified_sequence": ["ABC", "ABC", "ABC"], - }, - "annotations": {}, - "parameters": {"target_column_key": "linear rt"}, - } - - pd.DataFrame(test_data_dict["metadata"]).to_parquet("metadata.parquet") - - test_data_dict_file = { - "metadata": "metadata.parquet", - "annotations": {}, - "parameters": {"target_column_key": "linear rt"}, - } - - rtdataset = RetentionTimeDataset(data_source=test_data_dict, seq_length=20) - print(rtdataset.sequences) - print(rtdataset.targets) - - rtdataset = RetentionTimeDataset(data_source=test_data_dict_file, seq_length=20) - print(rtdataset.sequences) - print(rtdataset.targets) diff --git a/dlomix/data/__init__.py b/dlomix/data/__init__.py deleted file mode 100644 index 5d9bcfbc63b8fb1418622d92f730fb89b390cae9..0000000000000000000000000000000000000000 --- a/dlomix/data/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .AbstractDataset import * -from .feature_extractors import * -from .IntensityDataset import * -from .RetentionTimeDataset import * - -__all__ = [ - "RetentionTimeDataset", - "IntensityDataset", - "AbstractDataset", - "LengthFeature", - "SequenceFeatureExtractor", - "ModificationLocationFeature", - "ModificationLossFeature", - "ModificationGainFeature", -] diff --git a/dlomix/data/feature_extractors.py b/dlomix/data/feature_extractors.py deleted file mode 100644 index 98f5b592e1a3975824826e35748c8d16725d0cbc..0000000000000000000000000000000000000000 --- a/dlomix/data/feature_extractors.py +++ /dev/null @@ -1,198 +0,0 @@ -import abc - -from ..utils import get_constructor_call_object_creation - - -class SequenceFeatureExtractor(abc.ABC): - def __init__(self, pad_to_seq_length=False, padding_element=-1): - super(SequenceFeatureExtractor, self).__init__() - self.pad_to_seq_length = pad_to_seq_length - self.padding_element = padding_element - - @abc.abstractmethod - def extract(self, seq, mods, **kwargs): - pass - - def extract_all(self, sequences, modifications, seq_length=0): - features = [] - for seq, mods in zip(sequences, modifications): - feature = self.extract(seq, mods, seq_length=seq_length) - if seq_length: - feature = self.pad_feature_to_seq_length(feature, seq_length) - features.append(feature) - return features - - def pad_feature_to_seq_length(self, single_feature, seq_length=0): - feature_length = len(single_feature) - - if feature_length > seq_length: - raise ValueError( - f"Feature length ({len(single_feature)}) is longer than sequence length provided ({seq_length})." 
- ) - - padding_length = seq_length - feature_length - single_feature += [self.padding_element] * padding_length - - return single_feature - - def __repr__(self) -> str: - return get_constructor_call_object_creation(self) - - -class LengthFeature(SequenceFeatureExtractor): - def __init__(self): - super(LengthFeature, self).__init__() - - def extract(self, seq, mods, **kwargs): - return len(seq) - - -class ModificationLocationFeature(SequenceFeatureExtractor): - DICT_PTM_MOD_ATOM = { - "M[UNIMOD:35]": 4, - "S[UNIMOD:21]": 3, - "T[UNIMOD:21]": 3, - "Y[UNIMOD:21]": 3, - "R[UNIMOD:7]": 1, - "K[UNIMOD:1]": 2, - "K[UNIMOD:121]": 2, - "Q(gl)": 1, - "R[UNIMOD:34]": 2, - "K[UNIMOD:34]": 2, - "T(ga)": 3, - "S(ga)": 3, - "T(gl)": 3, - "S(gl)": 3, - "C[UNIMOD:4]": 4, - "[ac]-": 2, - "E(gl)": 1, - "K[UNIMOD:36]": 2, - "K[UNIMOD:37]": 2, - "K[UNIMOD:122]": 2, - "K[UNIMOD:58]": 2, - "K[UNIMOD:1289]": 2, - "K[UNIMOD:747]": 2, - "K[UNIMOD:64]": 2, - "K[UNIMOD:1848]": 2, - "K[UNIMOD:1363]": 2, - "K[UNIMOD:1849]": 2, - "K[UNIMOD:3]": 2, - "unknown": 1, - "R[UNIMOD:36]": 2, - "P[UNIMOD:35]": 1, - "Y[UNIMOD:354]": 1, - } - - def __init__(self): - super(ModificationLocationFeature, self).__init__(pad_to_seq_length=True) - - def extract(self, seq, mods, seq_length): - modified_aas = [f"{s}[UNIMOD:{m}]" for s, m in zip(seq, mods)] - feature = [ - ModificationLocationFeature.DICT_PTM_MOD_ATOM.get(i, 0) - for i in modified_aas - ] - - return feature - - -class ModificationLossFeature(SequenceFeatureExtractor): - PTM_LOSS_LOOKUP = { - "M[UNIMOD:35]": [0, 0, 0, 0, 0, 0], - "S[UNIMOD:21]": [1, 0, 0, 0, 0, 0], - "T[UNIMOD:21]": [1, 0, 0, 0, 0, 0], - "Y[UNIMOD:21]": [1, 0, 0, 0, 0, 0], - "R[UNIMOD:7]": [1, 0, 1, 0, 0, 0], - "K[UNIMOD:1]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:121]": [1, 0, 0, 0, 0, 0], - "Q(gl)": [9, 4, 2, 1, 0, 0], - "R[UNIMOD:34]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:34]": [1, 0, 0, 0, 0, 0], - "T(ga)": [1, 0, 0, 0, 0, 0], - "S(ga)": [1, 0, 0, 0, 0, 0], - "T(gl)": [1, 0, 0, 0, 0, 0], - "S(gl)": [1, 0, 0, 0, 0, 0], - "C[UNIMOD:4]": [1, 0, 0, 0, 0, 0], - "[ac]-": [1, 0, 0, 0, 0, 0], - "E(gl)": [8, 4, 1, 2, 0, 0], - "K[UNIMOD:36]": [2, 0, 0, 0, 0, 0], - "K[UNIMOD:37]": [3, 0, 0, 0, 0, 0], - "K[UNIMOD:122]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:58]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:1289]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:747]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:64]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:1848]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:1363]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:1849]": [1, 0, 0, 0, 0, 0], - "K[UNIMOD:3]": [1, 0, 0, 0, 0, 0], - "unknown": [3, 0, 2, 0, 0, 0], - "R[UNIMOD:36]": [2, 0, 0, 0, 0, 0], - "P[UNIMOD:35]": [1, 0, 0, 0, 0, 0], - "Y[UNIMOD:354]": [1, 0, 0, 0, 0, 0], - } - - def __init__(self): - super(ModificationLossFeature, self).__init__( - pad_to_seq_length=True, padding_element=[0, 0, 0, 0, 0, 0] - ) - - def extract(self, seq, mods, seq_length): - modified_aas = [f"{s}[UNIMOD:{m}]" for s, m in zip(seq, mods)] - feature = [ - ModificationLossFeature.PTM_LOSS_LOOKUP.get(i, [0] * 6) - for i in modified_aas - ] - - return feature - - -class ModificationGainFeature(SequenceFeatureExtractor): - PTM_GAIN_LOOKUP = { - "M[UNIMOD:35]": [0, 0, 0, 1, 0, 0], - "S[UNIMOD:21]": [2, 0, 0, 3, 1, 0], - "T[UNIMOD:21]": [2, 0, 0, 3, 1, 0], - "Y[UNIMOD:21]": [2, 0, 0, 3, 1, 0], - "R[UNIMOD:7]": [0, 0, 0, 1, 0, 0], - "K[UNIMOD:1]": [3, 2, 0, 1, 0, 0], - "K[UNIMOD:121]": [7, 4, 2, 2, 0, 0], - "Q(gl)": [6, 4, 1, 1, 0, 0], - "R[UNIMOD:34]": [3, 1, 0, 0, 0, 0], - "K[UNIMOD:34]": [3, 1, 0, 0, 0, 0], - "T(ga)": [14, 8, 1, 5, 
0, 0], - "S(ga)": [14, 8, 1, 5, 0, 0], - "T(gl)": [14, 8, 1, 5, 0, 0], - "S(gl)": [14, 8, 1, 5, 0, 0], - "C[UNIMOD:4]": [4, 2, 1, 1, 0, 0], - "[ac]-": [3, 2, 0, 1, 0, 0], - "E(gl)": [6, 4, 1, 1, 0, 0], - "K[UNIMOD:36]": [6, 2, 0, 0, 0, 0], - "K[UNIMOD:37]": [9, 3, 0, 0, 0, 0], - "K[UNIMOD:122]": [0, 1, 0, 1, 0, 0], - "K[UNIMOD:58]": [5, 3, 0, 1, 0, 0], - "K[UNIMOD:1289]": [7, 4, 0, 1, 0, 0], - "K[UNIMOD:747]": [3, 3, 0, 3, 0, 0], - "K[UNIMOD:64]": [5, 4, 0, 3, 0, 0], - "K[UNIMOD:1848]": [7, 5, 0, 3, 0, 0], - "K[UNIMOD:1363]": [5, 4, 0, 1, 0, 0], - "K[UNIMOD:1849]": [7, 4, 0, 2, 0, 0], - "K[UNIMOD:3]": [15, 10, 2, 2, 0, 1], - "unknown": [7, 2, 2, 0, 0, 0], - "R[UNIMOD:36]": [6, 2, 0, 0, 0, 0], - "P[UNIMOD:35]": [1, 0, 0, 1, 0, 0], - "Y[UNIMOD:354]": [0, 0, 1, 2, 0, 0], - } - - def __init__(self): - super(ModificationGainFeature, self).__init__( - pad_to_seq_length=True, padding_element=[0, 0, 0, 0, 0, 0] - ) - - def extract(self, seq, mods, seq_length): - modified_aas = [f"{s}[UNIMOD:{m}]" for s, m in zip(seq, mods)] - feature = [ - ModificationGainFeature.PTM_GAIN_LOOKUP.get(i, [0] * 6) - for i in modified_aas - ] - - return feature diff --git a/dlomix/data/parsers.py b/dlomix/data/parsers.py deleted file mode 100644 index e3256faab57b61f961e96e968abde8bb1575a460..0000000000000000000000000000000000000000 --- a/dlomix/data/parsers.py +++ /dev/null @@ -1,106 +0,0 @@ -import abc - -import numpy as np -from pyteomics.proforma import parse - - -class AbstractParser(abc.ABC): - """ - Abstract class for Parsers that read sequences and split the modification information from the amino acids. - The abstract method `_parse_sequence(self, sequence)` is to be implemented by child classes. - """ - - @abc.abstractmethod - def _parse_sequence(self, sequence: str): - """parse a single sequence and return amino acids and modifications as separate data structures. - - Args: - sequence (str): a modified sequence - """ - raise NotImplementedError("Not implemented.") - - def _take_first_modification_proforma_output(self, mods): - # # take first non-null element (modification only) (applied to all modifications including n and c terminal) - # # ensure it is a single element and not a string - # return next(filter(lambda x: x is not None, mods), None) - return [m[0].id if m is not None else -1 for m in mods] - - def _flatten_seq_mods(self, parsed_sequence: list): - """helper function to flatten a list of tuples to two lists. - - Args: - parsed_sequence (list): a list of tuples (Amino Acids, Modification) `[('A', None), ('B', Unimod:1), ('C', None)]` - - Returns: - list: a list of two lists or tuples (one for Amino acids and the other for modifications). `[['A', 'B', 'C'], [None, Unimod:1, None]]` - """ - seq, mods = [list(i) for i in zip(*parsed_sequence)] - return seq, mods - - def parse_sequences(self, sequences): - """a generic function to apply the implementation of `_parse_sequence` to a list of sequencens. - - Args: - sequences (list): list of string sequences, possibly with modifications. 
- - Returns: - tuple(list, list, list, list): sequences, modifications, n_terminal modifications, c_terminal modifications - """ - seqs = [] - mods = [] - n_terms = [] - c_terms = [] - for seq in sequences: - seq, mod, n, c = self._parse_sequence(seq) - - # build sequence as a string from Amino Acid list - seq = "".join(seq) - seqs.append(seq) - - mods.append(mod) - - n_terms.append(n) - c_terms.append(c) - seqs = np.array(seqs) - - mods = np.array(mods, dtype=object) - n_terms = np.array(n_terms) - c_terms = np.array(c_terms) - return seqs, mods, n_terms, c_terms - - -class ProformaParser(AbstractParser): - def __init__(self): - super().__init__() - - def _parse_sequence(self, sequence): - """Implementation for parsing sequences according to the Proforma notation based on the Unimod representation. - - Args: - sequence (str): sequence of amino acids, possibly with modifications. - N-term and C-term modifications have to be separated with a `-`. Example: `[Unimod:1]-ABC` - - Returns: - tuple(list, list, list): output of `pyteomics.proforma.parse' with the n-term and c-term modifications - extracted from the originally returned modifiers dict. - More information: https://pyteomics.readthedocs.io/en/latest/api/proforma.html#pyteomics.proforma.parse - """ - # returns tuple (list of tuples (AA, mods), and a dict with properties) - parsed_sequence, terminal_mods_dict = parse(sequence) - - n_term_mods = terminal_mods_dict.get("n_term") - c_term_mods = terminal_mods_dict.get("c_term") - - if n_term_mods: - n_term_mods = n_term_mods.pop().id - else: - n_term_mods = -1 - if c_term_mods: - c_term_mods = c_term_mods.pop().id - else: - c_term_mods = -1 - - seq, mod = self._flatten_seq_mods(parsed_sequence) - mod = self._take_first_modification_proforma_output(mod) - - return seq, mod, n_term_mods, c_term_mods diff --git a/dlomix/data/reader_utils.py b/dlomix/data/reader_utils.py deleted file mode 100644 index 3f3ed4716afbe425bb733402fb911bf4bfe26745..0000000000000000000000000000000000000000 --- a/dlomix/data/reader_utils.py +++ /dev/null @@ -1,52 +0,0 @@ -import json - -import pandas as pd - - -def read_parquet_file_pandas(filepath, parquet_engine): - """ - Reads a Parquet file located at the given filepath using pandas and the specified Parquet engine. - - Parameters: - ----------- - filepath : str - The file path of the Parquet file to read. - parquet_engine : str - The name of the Parquet engine to use for reading the file. - - Returns: - -------- - pandas.DataFrame - A pandas DataFrame containing the data from the Parquet file. - - Raises: - ------- - ImportError - If the specified Parquet engine is missing, fastparquet must be installed. - """ - try: - df = pd.read_parquet(filepath, engine=parquet_engine) - except ImportError: - raise ImportError( - "Parquet engine is missing, please install fastparquet using pip or conda." - ) - return df - - -def read_json_file(filepath): - """ - Reads a JSON file located at the given filepath and returns its contents as a dictionary. - - Parameters: - ----------- - filepath : str - The file path of the JSON file to read. - - Returns: - -------- - dict - A dictionary containing the contents of the JSON file. 
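A combined sketch of the parser and reader helpers defined above; the input strings and file paths are illustrative assumptions:

```python
from dlomix.data.parsers import ProformaParser
from dlomix.data.reader_utils import read_json_file, read_parquet_file_pandas

parser = ProformaParser()
# illustrative ProForma strings; an n-terminal modification is attached with "-"
seqs, mods, n_terms, c_terms = parser.parse_sequences(
    ["AC[UNIMOD:4]DEK", "[UNIMOD:1]-PEPTIDE"]
)
# seqs are plain amino acid strings; mods hold per-residue Unimod ids (-1 where unmodified)

config = read_json_file("dataset_config.json")            # hypothetical path
df = read_parquet_file_pandas("pool.parquet", "pyarrow")  # DEFAULT_PARQUET_ENGINE
```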
- """ - with open(filepath, "r") as j: - json_dict = json.loads(j.read()) - return json_dict diff --git a/dlomix/eval/__init__.py b/dlomix/eval/__init__.py deleted file mode 100644 index 4df20838d7797f0f623af0f6904a50d85b91a39f..0000000000000000000000000000000000000000 --- a/dlomix/eval/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .rt_eval import TimeDeltaMetric - -__all__ = [TimeDeltaMetric] diff --git a/dlomix/eval/rt_eval.py b/dlomix/eval/rt_eval.py deleted file mode 100644 index 231cbf19846a8aea4c620bb168ce5b8fa49cf951..0000000000000000000000000000000000000000 --- a/dlomix/eval/rt_eval.py +++ /dev/null @@ -1,83 +0,0 @@ -import tensorflow as tf -import tensorflow.keras.backend as K - - -class TimeDeltaMetric(tf.keras.metrics.Metric): - """Implementation of the time delta metric as a Keras Metric. - - Parameters - ---------- - mean (int, optional): Mean value of the targets in case normalization was performed. Defaults to 0. - std (int, optional): Standard deviation value of the targets in case normalization was performed. Defaults to 1. - percentage (float, optional): What percentage of the data points to consider, this is specific to the conmputation of the metric. Defaults to 0.95 which corresponds to 95% of the datapoints and is the mostly used value in papers. - name (str, optional): Name of the metric so that it can be reported and used later in Keras History objects. Defaults to 'timedelta'. - rescale_targets (bool, optional): Whether to rescale (denormalize) targets or not. Defaults to False. - rescale_predictions (bool, optional): Whether to rescale (denormalize) predictions or not. Defaults to False. - double_delta (bool, optional): Whether to multiple the computed delta by 2 in order to make it two-sided or not. Defaults to False. - """ - - def __init__( - self, - mean=0, - std=1, - percentage=0.95, - name="timedelta", - rescale_targets=False, - rescale_predictions=False, - double_delta=False, - **kwargs - ): - - super(TimeDeltaMetric, self).__init__(name=name, **kwargs) - self.delta = self.add_weight(name="delta", initializer="zeros") - self.batch_count = self.add_weight(name="batch-count", initializer="zeros") - self.mean = mean - self.std = std - self.percentage = percentage - self.rescale_targets = rescale_targets - self.rescale_predictions = rescale_predictions - self.double_delta = double_delta - - def update_state(self, y_true, y_pred, sample_weight=None): - # rescale - if self.rescale_targets: - y_true = y_true * self.std + self.mean - - if self.rescale_predictions: - y_pred = y_pred * self.std + self.mean - - # find position of the index - length = tf.shape(y_true)[0] - mark = tf.cast(length, dtype=tf.float32) * self.percentage - mark = tf.cast(mark, dtype=tf.int32) - - # compute residuals and sort - abs_error = tf.abs(y_true - y_pred) - d = tf.sort(abs_error)[mark - 1] - - # two-sided delta - if self.double_delta: - d = d * 2 - - # update count of batches - self.batch_count.assign_add(1.0) - - # update delta - self.delta.assign_add(tf.math.reduce_sum(d)) - - def result(self): - # this is simple averaging over the batches, more complex reduction can be added based on domain expertises - # Examples are: take max or min of both deltas (translates to a strict or a relaxed metric) - return tf.math.divide(self.delta, self.batch_count) - - -# code adopted and modified based on: -# https://github.com/horsepurve/DeepRTplus/blob/cde829ef4bd8b38a216d668cf79757c07133b34b/RTdata_emb.py -def delta95_metric(y_true, y_pred): - mark95 = tf.cast( - tf.cast(tf.shape(y_true)[0], 
 - - -# code adapted and modified from: -# https://github.com/horsepurve/DeepRTplus/blob/cde829ef4bd8b38a216d668cf79757c07133b34b/RTdata_emb.py -def delta95_metric(y_true, y_pred): - mark95 = tf.cast( - tf.cast(tf.shape(y_true)[0], dtype=tf.float32) * 0.95, dtype=tf.int32 - ) - abs_error = K.abs(y_true - y_pred) - delta = tf.sort(abs_error)[mark95 - 1] - norm_range = K.max(y_true) - K.min(y_true) - return (delta * 2) / (norm_range) diff --git a/dlomix/layers/__init__.py b/dlomix/layers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/dlomix/layers/attention.py b/dlomix/layers/attention.py deleted file mode 100644 index 74dc249fe255531df0d33ac2704e23c6a98051a8..0000000000000000000000000000000000000000 --- a/dlomix/layers/attention.py +++ /dev/null @@ -1,112 +0,0 @@ -import tensorflow as tf -import tensorflow.keras.backend as K -from tensorflow.keras import constraints, initializers, regularizers - - -class DecoderAttentionLayer(tf.keras.layers.Layer): - def __init__(self, time_steps): - super(DecoderAttentionLayer, self).__init__() - self.time_steps = time_steps - - def build(self, input_shape): - self.permute = tf.keras.layers.Permute((2, 1)) - self.dense = tf.keras.layers.Dense(self.time_steps, activation="softmax") - self.multiply = tf.keras.layers.Multiply() - - def call(self, inputs): - x = self.permute(inputs) - x = self.dense(x) - x = self.permute(x) - x = self.multiply([inputs, x]) - return x - - -class AttentionLayer(tf.keras.layers.Layer): - def __init__( - self, - context=False, - W_regularizer=None, - b_regularizer=None, - u_regularizer=None, - W_constraint=None, - b_constraint=None, - u_constraint=None, - bias=True, - **kwargs - ): - self.supports_masking = True - self.init = initializers.get("glorot_uniform") - self.W_regularizer = regularizers.get(W_regularizer) - self.b_regularizer = regularizers.get(b_regularizer) - self.u_regularizer = regularizers.get(u_regularizer) - self.W_constraint = constraints.get(W_constraint) - self.b_constraint = constraints.get(b_constraint) - self.u_constraint = constraints.get(u_constraint) - self.bias = bias - self.context = context - super(AttentionLayer, self).__init__(**kwargs) - - def build(self, input_shape): - assert len(input_shape) == 3 - self.W = self.add_weight( - shape=(input_shape[-1],), - initializer=self.init, - name="{}_W".format(self.name), - regularizer=self.W_regularizer, - constraint=self.W_constraint, - ) - if self.bias: - self.b = self.add_weight( - shape=(input_shape[1],), - initializer="zeros", - name="{}_b".format(self.name), - regularizer=self.b_regularizer, - constraint=self.b_constraint, - ) - else: - self.b = None - if self.context: - self.u = self.add_weight( - shape=(input_shape[-1],), - initializer=self.init, - name="{}_u".format(self.name), - regularizer=self.u_regularizer, - constraint=self.u_constraint, - ) - - self.built = True - - def compute_mask(self, input, input_mask=None): - return None - - def call(self, x, mask=None): - a = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1) - if self.bias: - a += self.b - a = K.tanh(a) - if self.context: - a = K.squeeze(K.dot(x, K.expand_dims(self.u)), axis=-1) - a = K.exp(a) - if mask is not None: - a *= K.cast(mask, K.floatx()) - a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) - a = K.expand_dims(a) - weighted_input = x * a - return K.sum(weighted_input, axis=1) - - def compute_output_shape(self, input_shape): - return input_shape[0], input_shape[-1]
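A short shape sketch for `AttentionLayer` (illustrative sizes only): the layer collapses the time axis into an attention-weighted sum.

import tensorflow as tf
from dlomix.layers.attention import AttentionLayer

layer = AttentionLayer(context=True)
# (batch, time_steps, features) -> (batch, features)
pooled = layer(tf.random.normal((4, 30, 64)))
print(pooled.shape)  # (4, 64)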
 - - def get_config(self): - config = { - "bias": self.bias, - "context": self.context, - "W_regularizer": regularizers.serialize(self.W_regularizer), - "b_regularizer": regularizers.serialize(self.b_regularizer), - "u_regularizer": regularizers.serialize(self.u_regularizer), - "W_constraint": constraints.serialize(self.W_constraint), - "b_constraint": constraints.serialize(self.b_constraint), - "u_constraint": constraints.serialize(self.u_constraint), - } - base_config = super(AttentionLayer, self).get_config() - return dict(list(base_config.items()) + list(config.items())) diff --git a/dlomix/losses/__init__.py b/dlomix/losses/__init__.py deleted file mode 100644 index c966525e400d2208418fd356b904461082b7736a..0000000000000000000000000000000000000000 --- a/dlomix/losses/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .intensity import masked_pearson_correlation_distance, masked_spectral_distance - -__all__ = ["masked_spectral_distance", "masked_pearson_correlation_distance"] diff --git a/dlomix/losses/intensity.py b/dlomix/losses/intensity.py deleted file mode 100644 index 512233d655d2b59a8341c31dcd1abbb8863a5d37..0000000000000000000000000000000000000000 --- a/dlomix/losses/intensity.py +++ /dev/null @@ -1,69 +0,0 @@ -import numpy as np -import tensorflow as tf -import tensorflow.keras.backend as K - - -def masked_spectral_distance(y_true, y_pred): - """Masked, normalized spectral angle between the true and predicted vectors: - arccos(1*1 + 0*0) = 0 -> SA = 0 -> high similarity (aligned vectors) - arccos(0*1 + 1*0) = pi/2 -> SA = 1 -> low similarity (orthogonal vectors) - """ - - # To avoid numerical instability during training on GPUs, - # we add a small fuzzing constant epsilon (K.epsilon(), i.e. 1e-7) to all vectors - epsilon = K.epsilon() - - # Masking: we multiply values by (true + 1) so that peaks that cannot - # be there (encoded with a value of -1 in the true intensities) won't be considered - pred_masked = ((y_true + 1) * y_pred) / (y_true + 1 + epsilon) - true_masked = ((y_true + 1) * y_true) / (y_true + 1 + epsilon) - - # L2 norm - pred_norm = K.l2_normalize(pred_masked, axis=-1) - true_norm = K.l2_normalize(true_masked, axis=-1) - - # Spectral Angle (SA) calculation - # (from the definition below, it is clear that ions with higher intensities - # will always have a higher contribution) - product = K.sum(pred_norm * true_norm, axis=1) - arccos = tf.math.acos(product) - return 2 * arccos / np.pi
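A minimal sketch of using the loss with `model.compile`; the output size 174 mirrors the Prosit intensity setup further below ((30 - 1) positions x 6 ion values per position) and is only illustrative:

import tensorflow as tf
from dlomix.losses import masked_spectral_distance

# toy intensity model with a flat vector of fragment ion intensities as output
model = tf.keras.Sequential([tf.keras.layers.Dense(174, input_shape=(128,))])
model.compile(optimizer="adam", loss=masked_spectral_distance)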
- """ - epsilon = K.epsilon() - - # Masking: we multiply values by (true + 1) because then the peaks that cannot - # be there (and have value of -1 as explained above) won't be considered - pred_masked = ((y_true + 1) * y_pred) / (y_true + 1 + epsilon) - true_masked = ((y_true + 1) * y_true) / (y_true + 1 + epsilon) - - mx = tf.math.reduce_mean(true_masked) - my = tf.math.reduce_mean(pred_masked) - xm, ym = true_masked - mx, pred_masked - my - r_num = tf.math.reduce_mean(tf.multiply(xm, ym)) - r_den = tf.math.reduce_std(xm) * tf.math.reduce_std(ym) - return 1 - (r_num / r_den) diff --git a/dlomix/models/__init__.py b/dlomix/models/__init__.py deleted file mode 100644 index 5f6c4df63bcefeede6e3a74c83e6a4bd8700eb51..0000000000000000000000000000000000000000 --- a/dlomix/models/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .base import * -from .deepLC import * -from .prosit import * - -__all__ = [ - "RetentionTimePredictor", - "PrositRetentionTimePredictor", - "DeepLCRetentionTimePredictor", - "PrositIntensityPredictor", -] diff --git a/dlomix/models/base.py b/dlomix/models/base.py deleted file mode 100644 index 632da9bb62b5f938a5ad6c78158722fb78eaf20b..0000000000000000000000000000000000000000 --- a/dlomix/models/base.py +++ /dev/null @@ -1,81 +0,0 @@ -import tensorflow as tf -from tensorflow.keras.layers.experimental import preprocessing - -from ..constants import ALPHABET_UNMOD - - -class RetentionTimePredictor(tf.keras.Model): - """A simple class for Retention Time prediction models. - - Parameters - ---------- - embedding_dim (int, optional): Dimensionality of the embeddings to be used for representing the Amino Acids. Defaults to 16. - seq_length (int, optional): Sequence length of the peptide sequences. Defaults to 30. - encoder (str, optional): String for specifying the decoder to use, either based on 1D conv-layers or LSTMs. Defaults to "conv1d". - vocab_dict (dict, optional): Dictionary mapping for the vocabulary (the amino acids in this case). Defaults to ALPHABET_UNMOD. 
- """ - - def __init__( - self, - embedding_dim=16, - seq_length=30, - encoder="conv1d", - vocab_dict=ALPHABET_UNMOD, - ): - super(RetentionTimePredictor, self).__init__() - - # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(vocab_dict) + 2 - - self.string_lookup = preprocessing.StringLookup( - vocabulary=list(vocab_dict.keys()) - ) - - self.embedding = tf.keras.layers.Embedding( - input_dim=self.embeddings_count, - output_dim=embedding_dim, - input_length=seq_length, - ) - - self._build_encoder(encoder) - - self.flatten = tf.keras.layers.Flatten() - self.regressor = tf.keras.Sequential( - [ - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dense(64, activation="relu"), - ] - ) - - self.output_layer = tf.keras.layers.Dense(1) - - def _build_encoder(self, encoder_type): - if encoder_type.lower() == "conv1d": - self.encoder = tf.keras.Sequential( - [ - tf.keras.layers.Conv1D( - filters=256, kernel_size=3, padding="same", activation="relu" - ), - tf.keras.layers.Conv1D( - filters=512, kernel_size=3, padding="valid", activation="relu" - ), - tf.keras.layers.MaxPooling1D(pool_size=2), - ] - ) - else: - self.encoder = tf.keras.Sequential( - [ - tf.keras.layers.LSTM(256, return_sequences=True), - tf.keras.layers.LSTM(256), - ] - ) - - def call(self, inputs, **kwargs): - x = self.string_lookup(inputs) - x = self.embedding(x) - x = self.encoder(x) - x = self.flatten(x) - x = self.regressor(x) - x = self.output_layer(x) - - return x diff --git a/dlomix/models/deepLC.py b/dlomix/models/deepLC.py deleted file mode 100644 index 3c6b6a2b6727d86d71c3b4cd226fd184538a2b6b..0000000000000000000000000000000000000000 --- a/dlomix/models/deepLC.py +++ /dev/null @@ -1,131 +0,0 @@ -import tensorflow as tf -from tensorflow.keras.layers.experimental import preprocessing - -from ..constants import ALPHABET_UNMOD - - -class DeepLCRetentionTimePredictor(tf.keras.Model): - def __init__( - self, seq_length=60, vocab_dict=ALPHABET_UNMOD, use_global_features=False - ): - super(DeepLCRetentionTimePredictor, self).__init__() - self.seq_length = seq_length - self._use_global_features = use_global_features - - self.leaky_relu = tf.keras.layers.ReLU(max_value=20, negative_slope=0.1) - self.string_lookup = preprocessing.StringLookup( - vocabulary=list(vocab_dict.keys()) - ) - - self._build_aminoacid_branch() - self._build_diaminoacid_branch() - self._build_onehot_encoding_branch() - self._build_regressor() - self.output_layer = tf.keras.layers.Dense(1) - - if self._use_global_features: - self._build_global_features_branch() - - def _build_aminoacid_branch(self): - self.aminoacid_branch = tf.keras.Sequential( - [ - self._build_conv_pool_block(n_filters=256, kernel=8, padding="same"), - self._build_conv_pool_block(n_filters=128, kernel=8, padding="same"), - self._build_conv_pool_block( - n_filters=64, kernel=8, padding="same", pool=False - ), - tf.keras.layers.Flatten(), - ] - ) - - def _build_diaminoacid_branch(self): - self.diaminoacid_branch = tf.keras.Sequential( - [ - self._build_conv_pool_block(n_filters=128, kernel=2, padding="same"), - self._build_conv_pool_block(n_filters=64, kernel=2, padding="same"), - tf.keras.layers.Flatten(), - ] - ) - - def _build_global_features_branch(self): - self.global_features_branch = tf.keras.Sequential( - [ - tf.keras.layers.Dense(16, activation=self.leaky_relu), - tf.keras.layers.Dense(16, activation=self.leaky_relu), - tf.keras.layers.Dense(16, activation=self.leaky_relu), - ] - ) - - def 
_build_onehot_encoding_branch(self): - self.onehot_encoding_branch = tf.keras.Sequential( - [ - self._build_conv_pool_block( - n_filters=2, - kernel=2, - padding="same", - activation="tanh", - pool_strides=10, - pool_size=10, - ), - tf.keras.layers.Flatten(), - ] - ) - - def _build_regressor(self): - self.regressor = tf.keras.Sequential( - [tf.keras.layers.Dense(128, activation=self.leaky_relu) for _ in range(5)] - ) - - def _build_conv_pool_block( - self, - n_conv_layers=2, - n_filters=256, - kernel=8, - padding="same", - activation="leaky_relu", - pool=True, - pool_strides=2, - pool_size=2, - ): - # leaky relu by default - activation_fn = self.leaky_relu - - if activation in ["tanh", "relu"]: - activation_fn = activation - - block = tf.keras.Sequential( - [ - tf.keras.layers.Conv1D( - filters=n_filters, - kernel_size=kernel, - padding=padding, - activation=activation_fn, - ) - for _ in range(n_conv_layers) - ] - ) - if pool: - # pool according to the provided pool_size and strides - block.add( - tf.keras.layers.MaxPooling1D(pool_size=pool_size, strides=pool_strides) - ) - - return block - - def call(self, inputs, **kwargs): - outputs = {} - - integer_encoded = self.string_lookup(inputs["seq"]) - onehot_encoded = tf.one_hot(integer_encoded, depth=self.seq_length) - - if self._use_global_features: - outputs["global_features_output"] = self.global_features_branch( - inputs["global_features"] - ) - - outputs["onehot_branch_output"] = self.onehot_encoding_branch(onehot_encoded) - outputs["aminoacids_branch_output"] = self.aminoacid_branch(inputs["counts"]) - outputs["diaminoacids_branch_output"] = self.diaminoacid_branch( - inputs["di_counts"] - ) - - # tf.concat expects a list of tensors, not a dict view - concatenated_output = tf.concat(list(outputs.values()), axis=1) - concatenated_output = self.regressor(concatenated_output) - return self.output_layer(concatenated_output)
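A hypothetical input sketch for the DeepLC model above; the `counts`/`di_counts` feature shapes are produced by the dataset's feature extraction, which is not part of this file, so the shapes below are assumptions for illustration only:

import tensorflow as tf
from dlomix.models import DeepLCRetentionTimePredictor

model = DeepLCRetentionTimePredictor(seq_length=60)
batch = {
    "seq": tf.fill([8, 60], "A"),                  # padded amino-acid tokens
    "counts": tf.random.uniform((8, 60, 20)),      # amino-acid composition branch input (assumed shape)
    "di_counts": tf.random.uniform((8, 30, 400)),  # di-amino-acid composition branch input (assumed shape)
}
predictions = model(batch)  # -> shape (8, 1)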
- """ - - DEFAULT_INPUT_KEYS = { - "SEQUENCE_KEY": "sequence", - } - - def __init__( - self, - embedding_output_dim=16, - seq_length=30, - vocab_dict=None, - dropout_rate=0.5, - latent_dropout_rate=0.1, - recurrent_layers_sizes=(256, 512), - regressor_layer_size=512, - ): - super(PrositRetentionTimePredictor, self).__init__() - - self.dropout_rate = dropout_rate - self.latent_dropout_rate = latent_dropout_rate - self.regressor_layer_size = regressor_layer_size - self.recurrent_layers_sizes = recurrent_layers_sizes - - if vocab_dict: - self.vocab_dict = vocab_dict - else: - self.vocab_dict = ALPHABET_UNMOD - - # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(self.vocab_dict) + 2 - - self.string_lookup = preprocessing.StringLookup( - vocabulary=list(self.vocab_dict.keys()) - ) - - self.embedding = tf.keras.layers.Embedding( - input_dim=self.embeddings_count, - output_dim=embedding_output_dim, - input_length=seq_length, - ) - self._build_encoder() - - self.attention = AttentionLayer() - - self.regressor = tf.keras.Sequential( - [ - tf.keras.layers.Dense(self.regressor_layer_size, activation="relu"), - tf.keras.layers.Dropout(rate=self.latent_dropout_rate), - ] - ) - - self.output_layer = tf.keras.layers.Dense(1) - - def _build_encoder(self): - self.encoder = tf.keras.Sequential( - [ - tf.keras.layers.Bidirectional( - tf.keras.layers.GRU( - units=self.recurrent_layers_sizes[0], return_sequences=True - ) - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - tf.keras.layers.GRU( - units=self.recurrent_layers_sizes[1], return_sequences=True - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - ] - ) - - def call(self, inputs, **kwargs): - if isinstance(inputs, dict): - x = inputs.get( - PrositRetentionTimePredictor.DEFAULT_INPUT_KEYS["SEQUENCE_KEY"] - ) - else: - x = inputs - x = self.string_lookup(x) - x = self.embedding(x) - x = self.encoder(x) - x = self.attention(x) - x = self.regressor(x) - x = self.output_layer(x) - return x - - -class PrositIntensityPredictor(tf.keras.Model): - """Implementation of the Prosit model for intensity prediction. - - Parameters - ----------- - embedding_output_dim (int, optional): Size of the embeddings to use. Defaults to 16. - seq_length (int, optional): Sequence length of the peptide sequences. Defaults to 30. - vocab_dict (dict, optional): Dictionary mapping for the vocabulary (the amino acids in this case). Defaults to None, which is mapped to `ALPHABET_UNMOD`. - dropout_rate (float, optional): Probability to use for dropout layers in the encoder. Defaults to 0.5. - latent_dropout_rate (float, optional): Probability to use for dropout layers in the regressor layers after encoding. Defaults to 0.1. - recurrent_layers_sizes (tuple, optional): A tuple of 2 values for the sizes of the two GRU layers in the encoder. Defaults to (256, 512). - regressor_layer_size (int, optional): Size of the dense layer in the regressor after the encoder. Defaults to 512. - use_ptm_counts (boolean, optional): Whether to use PTM counts and create corresponding layers, has to be aligned with input_keys. Defaults to False. - input_keys (dict, optional): dict of string keys and values mapping a fixed key to a value key in the inputs dict from the dataset class. Defaults to None, which corresponds then to the required default input keys `DEFAULT_INPUT_KEYS`. - meta_data_keys (list, optional): list of string values corresponding to fixed keys in the inputs dict that are considered meta data. 
 - - -class PrositIntensityPredictor(tf.keras.Model): - """Implementation of the Prosit model for intensity prediction. - - Parameters - ----------- - embedding_output_dim (int, optional): Size of the embeddings to use. Defaults to 16. - seq_length (int, optional): Sequence length of the peptide sequences. Defaults to 30. - len_fion (int, optional): Number of fragment ion values predicted per sequence position (the output size of the time-distributed dense layer in the regressor). Defaults to 6. - vocab_dict (dict, optional): Dictionary mapping for the vocabulary (the amino acids in this case). Defaults to None, which is mapped to `ALPHABET_UNMOD`. - dropout_rate (float, optional): Probability to use for dropout layers in the encoder. Defaults to 0.2. - latent_dropout_rate (float, optional): Probability to use for dropout layers in the regressor layers after encoding. Defaults to 0.1. - recurrent_layers_sizes (tuple, optional): A tuple of 2 values for the sizes of the two GRU layers in the encoder. Defaults to (256, 512). - regressor_layer_size (int, optional): Size of the dense layer in the regressor after the encoder. Defaults to 512. - use_ptm_counts (boolean, optional): Whether to use PTM counts and create corresponding layers, has to be aligned with input_keys. Defaults to False. - input_keys (dict, optional): dict of string keys and values mapping a fixed key to a value key in the inputs dict from the dataset class. Defaults to None, which then corresponds to the required default input keys `DEFAULT_INPUT_KEYS`. - meta_data_keys (list, optional): list of string values corresponding to fixed keys in the inputs dict that are considered meta data. Defaults to None, which then corresponds to the default meta data keys `META_DATA_KEYS`. - """ - - # consider using kwargs in the call function instead! - - DEFAULT_INPUT_KEYS = { - "SEQUENCE_KEY": "sequence", - "COLLISION_ENERGY_KEY": "collision_energy", - "PRECURSOR_CHARGE_KEY": "precursor_charge", - "FRAGMENTATION_TYPE_KEY": "fragmentation_type", - } - - # can be extended to include all possible meta data - META_DATA_KEYS = [ - "COLLISION_ENERGY_KEY", - "PRECURSOR_CHARGE_KEY", - "FRAGMENTATION_TYPE_KEY", - ] - PTM_INPUT_KEYS = [ - ModificationLossFeature.__name__.lower(), - ModificationGainFeature.__name__.lower(), - ModificationLocationFeature.__name__.lower(), - ] - - def __init__( - self, - embedding_output_dim=16, - seq_length=30, - len_fion=6, - vocab_dict=None, - dropout_rate=0.2, - latent_dropout_rate=0.1, - recurrent_layers_sizes=(256, 512), - regressor_layer_size=512, - use_ptm_counts=False, - input_keys=None, - meta_data_keys=None, - ): - super(PrositIntensityPredictor, self).__init__() - - self.dropout_rate = dropout_rate - self.latent_dropout_rate = latent_dropout_rate - self.regressor_layer_size = regressor_layer_size - self.recurrent_layers_sizes = recurrent_layers_sizes - self.seq_length = seq_length - self.len_fion = len_fion - self.use_ptm_counts = use_ptm_counts - self.input_keys = input_keys - self.meta_data_keys = meta_data_keys - - # maximum number of fragment ions - self.max_ion = self.seq_length - 1 - - if vocab_dict: - self.vocab_dict = vocab_dict - else: - self.vocab_dict = ALPHABET_UNMOD - - # tie the count of embeddings to the size of the vocabulary (count of amino acids) - self.embeddings_count = len(self.vocab_dict) + 2 - - self.string_lookup = preprocessing.StringLookup( - vocabulary=list(self.vocab_dict.keys()) - ) - - self.embedding = tf.keras.layers.Embedding( - input_dim=self.embeddings_count, - output_dim=embedding_output_dim, - input_length=seq_length, - ) - - if self.input_keys is None: - self.input_keys = PrositIntensityPredictor.DEFAULT_INPUT_KEYS - - if self.meta_data_keys is None: - self.meta_data_keys = PrositIntensityPredictor.META_DATA_KEYS - - self._build_encoders() - self._build_decoder() - - self.attention = AttentionLayer(name="encoder_att") - - self.meta_data_fusion_layer = tf.keras.Sequential( - [ - tf.keras.layers.Multiply(name="add_meta"), - tf.keras.layers.RepeatVector(self.max_ion, name="repeat"), - ] - ) - - self.regressor = tf.keras.Sequential( - [ - tf.keras.layers.TimeDistributed( - tf.keras.layers.Dense(self.len_fion), name="time_dense" - ), - tf.keras.layers.LeakyReLU(name="activation"), - tf.keras.layers.Flatten(name="out"), - ] - ) - - def _build_encoders(self): - self.meta_encoder = tf.keras.Sequential( - [ - tf.keras.layers.Concatenate(name="meta_in"), - tf.keras.layers.Dense( - self.recurrent_layers_sizes[1], name="meta_dense" - ), - tf.keras.layers.Dropout(self.dropout_rate, name="meta_dense_do"), - ] - ) - - self.sequence_encoder = tf.keras.Sequential( - [ - tf.keras.layers.Bidirectional( - tf.keras.layers.GRU( - units=self.recurrent_layers_sizes[0], return_sequences=True - ) - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - tf.keras.layers.GRU( - units=self.recurrent_layers_sizes[1], return_sequences=True - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - ] - ) - if not self.use_ptm_counts: - self.ptm_encoder, self.ptm_aa_fusion = None, None - else: - self.ptm_encoder = tf.keras.Sequential( - [ - tf.keras.layers.Concatenate(name="ptm_ac_loss_gain"), - tf.keras.layers.Bidirectional( - tf.keras.layers.GRU(
- units=self.recurrent_layers_sizes[0], return_sequences=True - ) - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - tf.keras.layers.GRU( - units=self.recurrent_layers_sizes[1], return_sequences=True - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - ] - ) - - self.ptm_aa_fusion = tf.keras.layers.Multiply(name="aa_ptm_in") - - def _build_decoder(self): - self.decoder = tf.keras.Sequential( - [ - tf.keras.layers.GRU( - units=self.regressor_layer_size, - return_sequences=True, - name="decoder", - ), - tf.keras.layers.Dropout(rate=self.dropout_rate), - DecoderAttentionLayer(self.max_ion), - ] - )
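A hypothetical input sketch for the intensity model defined in this class; the keys follow `DEFAULT_INPUT_KEYS`, while the meta-data shapes (scalar collision energy, one-hot charge of depth 6) are assumptions for illustration:

import tensorflow as tf
from dlomix.models import PrositIntensityPredictor

model = PrositIntensityPredictor(seq_length=30)
batch = {
    "sequence": tf.fill([8, 30], "A"),                       # padded amino-acid tokens
    "collision_energy": tf.random.uniform((8, 1)),           # normalized collision energy
    "precursor_charge": tf.one_hot(tf.fill([8], 1), depth=6),  # one-hot charge (assumed depth)
}
intensities = model(batch)  # -> shape (8, (30 - 1) * 6) = (8, 174)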
 - - def call(self, inputs, **kwargs): - peptides_in = inputs.get(self.input_keys["SEQUENCE_KEY"]) - - # read meta data from the input dict - meta_data = [] - # note that the value here is the key to use in the inputs dict passed from the dataset - for meta_key, key_in_inputs in self.input_keys.items(): - if meta_key in PrositIntensityPredictor.META_DATA_KEYS: - # get the input under the specified key if exists - meta_in = inputs.get(key_in_inputs, None) - if meta_in is not None: - # add the input to the list of meta data inputs - meta_data.append(meta_in) - - if self.meta_encoder and len(meta_data) > 0: - encoded_meta = self.meta_encoder(meta_data) - else: - raise ValueError( - f"The following metadata keys are expected in the model for Prosit Intensity: {PrositIntensityPredictor.META_DATA_KEYS}. The actual input passed to the model contains the following keys: {list(inputs.keys())}" - ) - - # read PTM atom count features from the input dict - ptm_ac_features = [] - for ptm_key in PrositIntensityPredictor.PTM_INPUT_KEYS: - ptm_ac_f = inputs.get(ptm_key, None) - if ptm_ac_f is not None: - ptm_ac_features.append(ptm_ac_f) - - if self.ptm_encoder and len(ptm_ac_features) > 0: - encoded_ptm = self.ptm_encoder(ptm_ac_features) - elif self.use_ptm_counts: - raise ValueError( - f"PTM features are enabled and the following PTM features are expected in the model for Prosit Intensity: {PrositIntensityPredictor.PTM_INPUT_KEYS}. The actual input passed to the model contains the following keys: {list(inputs.keys())}" - ) - else: - encoded_ptm = None - - x = self.string_lookup(peptides_in) - x = self.embedding(x) - x = self.sequence_encoder(x) - - if self.use_ptm_counts and self.ptm_aa_fusion and encoded_ptm is not None: - x = self.ptm_aa_fusion([x, encoded_ptm]) - - x = self.attention(x) - - x = self.meta_data_fusion_layer([x, encoded_meta]) - - x = self.decoder(x) - x = self.regressor(x) - - return x diff --git a/dlomix/pipelines/__init__.py b/dlomix/pipelines/__init__.py deleted file mode 100644 index 8d4a3a851f12410fc2245ff8b42d405b64111bac..0000000000000000000000000000000000000000 --- a/dlomix/pipelines/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .pipeline import RetentionTimePipeline - -__all__ = ["RetentionTimePipeline"] diff --git a/dlomix/pipelines/pipeline.py b/dlomix/pipelines/pipeline.py deleted file mode 100644 index 926ad54388ef0a4c3208daa8842c9ce0b760d001..0000000000000000000000000000000000000000 --- a/dlomix/pipelines/pipeline.py +++ /dev/null @@ -1,101 +0,0 @@ -import zipfile -from os import makedirs -from os.path import dirname, splitext - -import numpy as np -import requests - -from ..constants import retention_time_pipeline_parameters -from ..data.RetentionTimeDataset import RetentionTimeDataset -from ..models.base import RetentionTimePredictor -from ..reports import RetentionTimeReport - -# pipelines can be used to train the model further or from scratch given a dataset -# add string arguments (e.g. prosit to create the model, data source to create the dataset) - -# if neither train nor test are provided --> use toy datasets to (train if necessary or load pre-trained weights), predict on test, and generate report -# if test only --> load pre-trained weights, predict and generate report -# if train and test --> do what you have to do - - -class RetentionTimePipeline: - def __init__(self, pre_trained=True): - super(RetentionTimePipeline, self).__init__() - self.model = None - self.test_dataset = None - self.pre_trained = pre_trained - - # pass the config in the constructor - # refactor to have a base class Pipeline - - self._build_model() - - def _build_model(self): - self.model = RetentionTimePredictor( - **retention_time_pipeline_parameters["model_params"] - ) - - if self.pre_trained: - self._download_unzip_pretrained_model( - retention_time_pipeline_parameters["trained_model_url"], - retention_time_pipeline_parameters["trained_model_path"] - + retention_time_pipeline_parameters["trained_model_zipfile_name"], - ) - - self.model.load_weights( - retention_time_pipeline_parameters["trained_model_path"] - + splitext( - retention_time_pipeline_parameters["trained_model_zipfile_name"] - )[0] - ) - - def _download_unzip_pretrained_model(self, model_url, save_path): - # ensure the local directory for the downloaded model archive exists - makedirs(dirname(save_path), exist_ok=True) - r = requests.get(model_url) - - with open(save_path, "wb") as f: - f.write(r.content) - - self._unzip_model(save_path) - - def _unzip_model(self, model_zipfile_path): - zip_ref = zipfile.ZipFile(model_zipfile_path) - model_folder = dirname(model_zipfile_path) - zip_ref.extractall(model_folder) - zip_ref.close() - - def predict(self, data=None): - """Predict retention times, given data either as a numpy array of sequences or a file path to a CSV file.""" - if not (isinstance(data, str) or isinstance(data, np.ndarray)): - raise ValueError( - "Dataset should be provided either as a numpy array or a string pointing to a file."
- ) - - self.test_dataset = RetentionTimeDataset( - data, - **retention_time_pipeline_parameters["data_params"], - val_ratio=0, - test=True - ) - ( - self.test_dataset.data_mean, - self.test_dataset.data_std, - ) = retention_time_pipeline_parameters["trained_model_stats"] - - predictions = self.model.predict(self.test_dataset.test_data) - predictions = self.test_dataset.denormalize_targets(predictions) - predictions = predictions.ravel() - - return predictions - - def predict_report(self, data, output_path="./") -> None: - predictions = self.predict(data) - report = RetentionTimeReport(output_path=output_path, history=None) - - test_targets = self.test_dataset.get_split_targets(split="test") - report.generate_report(test_targets, predictions) diff --git a/dlomix/reports/IntensityReport.py b/dlomix/reports/IntensityReport.py deleted file mode 100644 index 21d0196b2ee26998415dd0fa8a9b458b88f06500..0000000000000000000000000000000000000000 --- a/dlomix/reports/IntensityReport.py +++ /dev/null @@ -1,74 +0,0 @@ -from os.path import join - -import pandas as pd -import seaborn as sns - -from .postprocessing import normalize_intensity_predictions -from .Report import PDFFile, Report - - -class IntensityReport(Report): - """Report generation for Fragment Ion Intensity Prediction tasks.""" - - TARGETS_LABEL = "x" - PREDICTIONS_LABEL = "y" - DEFAULT_BATCH_SIZE = 600 - - def __init__(self, output_path, history, figures_ext="png", batch_size=0): - super(IntensityReport, self).__init__(output_path, history, figures_ext) - - self.pdf_file = PDFFile("DLOmix - Fragment Ion Intensity Report") - - if batch_size: - self.batch_size = batch_size - else: - self.batch_size = IntensityReport.DEFAULT_BATCH_SIZE - - def generate_report(self, dataset, predictions): - self._init_report_resources() - - predictions_df = self.generate_intensity_results_df(dataset, predictions) - self.plot_all_metrics() - - # make custom plots - self.plot_spectral_angle(predictions_df) - - self._compile_report_resources_add_pdf_pages() - self.pdf_file.output(join(self._output_path, "intensity_Report.pdf"), "F") - - def generate_intensity_results_df(self, dataset, predictions): - predictions_df = pd.DataFrame() - - predictions_df["sequences"] = dataset.sequences - predictions_df["intensities_pred"] = predictions.tolist() - predictions_df["precursor_charge_onehot"] = dataset.precursor_charge.tolist() - predictions_df["intensities_raw"] = dataset.intensities.tolist() - - return predictions_df - - def plot_spectral_angle(self, predictions_df): - """Create spectral plot - - Arguments - --------- - predictions_df: dataframe with raw intensities, predictions, sequences, precursor_charges - """ - - predictions_acc = normalize_intensity_predictions( - predictions_df, self.batch_size - ) - violin_plot = sns.violinplot(predictions_acc["spectral_angle"]) - - save_path = join( - self._output_path, "violin_spectral_angle_plot" + self._figures_ext - ) - - fig = violin_plot.get_figure() - fig.savefig(save_path) - - self._add_report_resource( - "spectral_angle_plot", - "Spectral angle plot", - "The following figure shows the spectral angle plot for the test data.", - save_path, - ) diff --git a/dlomix/reports/Report.py b/dlomix/reports/Report.py deleted file mode 100644 index 8b40cb3391e37d05a4ed93117b7294f1298c01a4..0000000000000000000000000000000000000000 --- a/dlomix/reports/Report.py +++ /dev/null @@ -1,238 +0,0 @@ -import abc -import glob -import warnings -from os import makedirs -from os.path import join - -import tensorflow as tf -from fpdf import 
FPDF -from matplotlib import pyplot as plt - - -class Report(abc.ABC): - """Base class for reports, child classes should implement the abstract method generate_report. - - Parameters - ---------- - output_path: path to save output files and figures. - history : reference to a Keras History object or its history dict attribute (History.history). - figures_ext: File extension and format for saving figures. - """ - - VALID_FIGURE_FORMATS = ["pdf", "jpeg", "jpg", "png"] - - def __init__(self, output_path, history, figures_ext): - self._output_path = output_path - makedirs(self._output_path, exist_ok=True) - - if history is None: - warnings.warn( - "The passed History object is None, no training/validation data can be reported." - ) - self._history_dict = {} - else: - self._set_history_dict(history) - self._set_figures_format(figures_ext) - - # an empty dict to use to list the report resources - self._init_report_resources() - - # empty pdf file - self.pdf_file = None - - def _set_history_dict(self, history): - if isinstance(history, dict): - self._history_dict = history - elif not isinstance(history, tf.keras.callbacks.History): - raise ValueError( - "Reporting requires a History object (tf.keras.callbacks.History) or its history dict attribute (History.history), which is returned from a call to " - f"model.fit(). Passed history argument is of type {type(history)} ", - ) - elif not hasattr(history, "history"): - raise ValueError( - "The passed History object does not have a history attribute, which is a dict with results." - ) - else: - self._history_dict = history.history - - if len(self._history_dict.keys()) == 0: - warnings.warn( - "The passed History object contains an empty history dict, no training was done." - ) - - def _set_figures_format(self, figures_ext): - figures_ext = figures_ext.lower() - if figures_ext.startswith("."): - figures_ext = figures_ext[1:] - if figures_ext not in Report.VALID_FIGURE_FORMATS: - raise ValueError( - f"Allowed figure formats are: {Report.VALID_FIGURE_FORMATS}" - ) - self._figures_ext = "." + figures_ext - - def _get_all_saved_plots(self): - all_plots = glob.glob(join(self._output_path, "*" + self._figures_ext)) - return all_plots - - def _add_report_resource(self, key, title, paragraph_text, value): - self._report_resources[key] = (title, paragraph_text, value) - - def _init_report_resources(self): - self._report_resources = {} - - def _compile_report_resources_add_pdf_pages(self): - for key, resource in self._report_resources.items(): - value_is_fig_path = self._figures_ext in str(resource[2]) - plot_word_is_in_key = "plot" in key - if value_is_fig_path or plot_word_is_in_key: - self.pdf_file.add_content_plot_page( - section_title=resource[0], - section_body=resource[1], - plot_filepath=resource[2], - ) - else: - self.pdf_file.add_content_text_page( - section_title=resource[0], section_body=resource[1] - ) - - def plot_keras_metric(self, metric_name, save_plot=True): - """Plot a keras metric given its name and the history object returned by model.fit() - - Arguments - --------- - metric_name: String with the name of the metric. - save_plot (bool, optional): whether to save plot to disk or not. Defaults to True. - """ - - if metric_name.lower() not in self._history_dict.keys(): - raise ValueError( - f"Metric name to plot is not available in the history dict. 
Available metrics to plot are {self._history_dict.keys()}", - ) - - has_validation = "val_" + metric_name.lower() in self._history_dict.keys() - if not has_validation and metric_name.lower() not in ["lr"]: - raise ValueError( - f"""No validation epochs were run during training, so the validation counterpart of the metric to plot is not available in the history dict. - Available metrics to plot are {self._history_dict.keys()} - """, - ) - plt.plot(self._history_dict[metric_name]) - # plot the validation curve only if it exists (e.g. the learning rate "lr" has no validation counterpart) - if has_validation: - plt.plot(self._history_dict["val_" + metric_name]) - plt.legend(["train", "val"], loc="upper left") - else: - plt.legend(["train"], loc="upper left") - plt.title(metric_name) - plt.ylabel(metric_name) - plt.xlabel("epoch") - save_path = None - if save_plot: - save_path = join(self._output_path, metric_name + self._figures_ext) - plt.savefig(save_path) - plt.show() - plt.close() - metric_name_spaced = metric_name.replace("_", " ") - self._add_report_resource( - metric_name + "_plot", - metric_name_spaced.title(), - f"The following figure shows the {metric_name_spaced} for training and validation.", - save_path, - ) - - def plot_all_metrics(self): - """Plot all available Keras metrics in the History object.""" - # materialize the filtered metric names so they can be both printed and iterated - metrics = [ - metric_name - for metric_name in self._history_dict.keys() - if not metric_name.startswith(("val_", "_")) - ] - print("Plotting all metrics: ", metrics) - for metric in metrics: - self.plot_keras_metric(metric) - - @abc.abstractmethod - def generate_report(self, targets, predictions, **kwargs): - """Abstract method to generate a complete report. Child classes need to implement this method. - - Arguments - --------- - targets: Array with target values. - predictions: Array with prediction values. - """
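A minimal sketch of the intended extension point: `generate_report` is the only abstract method, so a report subclass (the `MinimalReport` below is hypothetical) only has to implement it; the tiny history dict stands in for the object returned by `model.fit()`:

from dlomix.reports.Report import Report

class MinimalReport(Report):  # hypothetical subclass, for illustration only
    def generate_report(self, targets, predictions, **kwargs):
        self._init_report_resources()
        self.plot_all_metrics()

history = {"loss": [0.9, 0.5, 0.3], "val_loss": [1.0, 0.6, 0.4]}
report = MinimalReport(output_path="./report", history=history, figures_ext="png")
report.generate_report(targets=None, predictions=None)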
 - - -class PDFFile(FPDF): - """PDF file template class. - - Parameters - ---------- - title: Title for the pdf file - """ - - PAGE_WIDTH = 210 - PAGE_HEIGHT = 297 - - SECTION_PARAGRAPH_FONT = ["Arial", "", 11] - SECTION_TITLE_FONT = ["Arial", "B", 13] - LINE_HEIGHT = 5 - - def __init__(self, title): - super().__init__() - self.title = title - self.width = PDFFile.PAGE_WIDTH - self.height = PDFFile.PAGE_HEIGHT - - self.set_auto_page_break(True) - self.document_empty = True - - def header(self): - self.set_font("Arial", "B", 11) - self.cell(self.width - 80) - self.cell(60, 1, self.title, 0, 0, "R") - self.ln(20) - - def footer(self): - # Page numbers in the footer - self.set_y(-15) - self.set_font("Arial", "I", 8) - self.set_text_color(128) - self.cell(0, 10, "Page " + str(self.page_no()), 0, 0, "C") - - def _add_plot(self, plot_filepath): - self.image(plot_filepath) - self.ln(3 * PDFFile.LINE_HEIGHT) - - def _add_section_content(self, section_title, section_body): - if section_title != "": - self.set_font(*PDFFile.SECTION_TITLE_FONT) - self.cell(w=0, txt=section_title) - self.ln(PDFFile.LINE_HEIGHT) - if section_body != "": - self.set_font(*PDFFile.SECTION_PARAGRAPH_FONT) - self.multi_cell(w=0, h=PDFFile.LINE_HEIGHT, txt=section_body) - self.ln(PDFFile.LINE_HEIGHT) - - def _create_first_page_if_document_empty(self): - if self.document_empty: - self.add_page() - self.document_empty = False - - def add_content_text_page(self, section_title, section_body): - """Add a section title and a paragraph. - - Arguments - --------- - section_title: title for the section. - section_body: paragraph text to add. - """ - self._create_first_page_if_document_empty() - self._add_section_content(section_title, section_body) - - def add_content_plot_page(self, plot_filepath, section_title="", section_body=""): - """Add a new page with a section title, a paragraph, and a plot. At least a plot has to be provided. - - Arguments - --------- - plot_filepath (str): filepath of the plot to be inserted in the new page. - section_title (str, optional): title for the section. Defaults to "". - section_body (str, optional): paragraph text to add. Defaults to "". - """ - - self._create_first_page_if_document_empty() - self._add_section_content(section_title, section_body) - self._add_plot(plot_filepath) diff --git a/dlomix/reports/RetentionTimeReport.py b/dlomix/reports/RetentionTimeReport.py deleted file mode 100644 index 5842635928d9af73726a9530526238c0b305c75d..0000000000000000000000000000000000000000 --- a/dlomix/reports/RetentionTimeReport.py +++ /dev/null @@ -1,169 +0,0 @@ -from os.path import join -from warnings import warn - -import numpy as np -from matplotlib import pyplot as plt -from matplotlib.colors import LogNorm -from matplotlib.ticker import LogLocator - -from .Report import PDFFile, Report - - -class RetentionTimeReport(Report): - """Report generation for Retention Time Prediction tasks.""" - - TARGETS_LABEL = "iRT (measured)" - PREDICTIONS_LABEL = "iRT (predicted)" - - def __init__(self, output_path, history, figures_ext="png"): - super(RetentionTimeReport, self).__init__(output_path, history, figures_ext) - - warn( - f"{self.__class__.__name__}: This class is deprecated and will not be developed further. Use RetentionTimeReportWandb instead for creating a report with the Weights & Biases Report API.", - DeprecationWarning, - stacklevel=2, - ) - - self.pdf_file = PDFFile("DLOmix - Retention Time Report") - - def generate_report(self, targets, predictions, **kwargs): - self._init_report_resources() - - _ = self.calculate_r2(targets, predictions) - self.plot_all_metrics() - self.plot_residuals(targets, predictions) - self.plot_density(targets, predictions) - - self._compile_report_resources_add_pdf_pages() - - self.pdf_file.output(join(self._output_path, "iRT_Report.pdf"), "F") - - def calculate_r2(self, targets, predictions): - """Calculate R-squared using sklearn given true targets and predictions - - Arguments - --------- - targets: Array with target values - predictions: Array with prediction values - - Returns: - r_squared (float): float value of R squared - """ - from sklearn.metrics import r2_score - - r2 = r2_score(np.ravel(targets), np.ravel(predictions)) - - self._add_report_resource( - "r2", - "R-Squared", - f"The R-squared value for the predictions is {round(r2, 4)}", - r2, - ) - - return r2 - - def plot_residuals(self, targets, predictions, xrange=(0, 0)): - """Plot histogram of residuals - - Arguments - ---------- - targets: Array with target values - predictions: Array with prediction values - xrange (tuple, optional): X-axis range for plotting the histogram. Defaults to (0, 0), in which case the range is derived from the residuals as mean ± 3 standard deviations.
- """ - error = np.ravel(predictions) - np.ravel(targets) - - x_min, x_max = xrange - if xrange == (0, 0): - mean, std_dev = np.mean(error), np.std(error) - x_min, x_max = mean - (3 * std_dev), mean + (3 * std_dev) - - bins = np.linspace(x_min, x_max, 200) - - plt.hist(error, bins, alpha=0.5, color="orange") - plt.title("Historgram of Residuals") - plt.xlabel("Residual value") - plt.ylabel("Count") - save_path = join(self._output_path, "histogram_residuals" + self._figures_ext) - plt.savefig(save_path) - plt.show() - plt.close() - - self._add_report_resource( - "residuals_plot", - "Error Residuals", - "The following plot shows a historgram of residuals for the test data.", - save_path, - ) - - def plot_density( - self, - targets, - predictions, - irt_delta95=5, - palette="Reds_r", - delta95_line_color="#36479E", - nbins=1000, - ): - """Create density plot - - Arguments - --------- - targets: Array with target values - predictions: Array with prediction values - irt_delta95 (int, optional): iRT Value of the delta 95% . Defaults to 5. - palette (str, optional): Color palette from matplotlib. Defaults to 'Reds_r'. - delta95_line_color (str, optional): Color for the delta 95% line. Defaults to '#36479E'. - nbins (int, optional): Number of bins to use for creating the 2D histogram. Defaults to 1000. - """ - - H, xedges, yedges = np.histogram2d(targets, predictions, bins=nbins) - - x_min = np.min(targets) - x_max = np.max(targets) - - # H needs to be rotated and flipped - H = np.rot90(H) - H = np.flipud(H) - - # Mask zeros - Hmasked = np.ma.masked_where(H == 0, H) # Mask pixels with a value of zero - - # Plot 2D histogram using pcolor - cm = plt.cm.get_cmap(palette) - plt.pcolormesh( - xedges, yedges, Hmasked, cmap=cm, norm=LogNorm(vmin=1e0, vmax=1e2) - ) - - plt.xlabel(RetentionTimeReport.TARGETS_LABEL, fontsize=18) - plt.ylabel(RetentionTimeReport.PREDICTIONS_LABEL, fontsize=18) - - cbar = plt.colorbar(ticks=LogLocator(subs=range(5))) - cbar.ax.set_ylabel("Counts", fontsize=14) - - plt.plot([x_min, x_max], [x_min, x_max], c="black") - plt.plot( - [x_min, x_max], - [x_min - irt_delta95, x_max - irt_delta95], - color=delta95_line_color, - ) - plt.plot( - [x_min, x_max], - [x_min + irt_delta95, x_max + irt_delta95], - color=delta95_line_color, - ) - - font_size = 14 # Adjust as appropriate. 
- cbar.ax.tick_params(labelsize=font_size) - cbar.ax.minorticks_on() - save_path = join(self._output_path, "density_plot" + self._figures_ext) - plt.savefig(save_path) - plt.show() - plt.close() - - self._add_report_resource( - "density_plot", - "Density Plot", - "The following figure shows the density plot with the delta-95 highlighted for the test data.", - save_path, - ) diff --git a/dlomix/reports/RetentionTimeReportModelComparisonWandb.py b/dlomix/reports/RetentionTimeReportModelComparisonWandb.py deleted file mode 100644 index d6ee86e61bbc095e31476909c452b1636cb5ffbd..0000000000000000000000000000000000000000 --- a/dlomix/reports/RetentionTimeReportModelComparisonWandb.py +++ /dev/null @@ -1,321 +0,0 @@ -import os -import re - -import numpy as np -import pandas as pd -import wandb -import wandb.apis.reports as wr - -from ..data.RetentionTimeDataset import RetentionTimeDataset - - -class RetentionTimeReportModelComparisonWandb: - - # Wilhelmlab WandB account that has all VEGA presets required for the reports - VEGA_LITE_PRESETS_ID = "prosit-compms" - - def __init__( - self, - models: dict, - project: str, - title: str, - description: str, - test_dataset: RetentionTimeDataset, - ): - """Creates WandB report for comparing models. - - Args: - models (dict): keys are model names, values are model objects - project (str): Name of the project. - title (str): Title of the report. - description (str): Description of the report. - test_dataset (RetentionTimeDataset): Test dataset object to compare predictions of models on. - """ - self.project = project - self.title = title - self.description = description - self.models = models - self.test_dataset = test_dataset - self.entity = wandb.apis.PublicApi().default_entity - self.api = wandb.Api() - - def create_report( - self, - add_data_section=True, - add_residuals_section=True, - add_r2_section=True, - add_density_section=True, - ): - """Creates the report in wandb_run. - - Args: - add_data_section (bool, optional): Add a section for input data to the report. Defaults to True. - add_residuals_section (bool, optional): Add a section for residual plots. Defaults to True. - add_r2_section (bool, optional): Add a section for the R2 metric. Defaults to True. - add_density_section (bool, optional): Add a section for the density plot. Defaults to True. - """ - report = wr.Report( - project=self.project, title=self.title, description=self.description - ) - - report.blocks = [wr.TableOfContents()] - - if add_data_section: - report.blocks += self._build_data_section() - if add_residuals_section: - report.blocks += self._build_residuals_section() - if add_r2_section: - report.blocks += self._build_r2_section() - if add_density_section: - report.blocks += self._build_density_section() - - report.save() - - def calculate_r2(self, targets, predictions): - from sklearn.metrics import r2_score - - r2 = r2_score(targets, predictions) - return r2 - - def calculate_residuals(self, targets, predictions): - residuals = predictions - targets - return residuals - - def _build_data_section(self): - data_block = [ - wr.H1(text="Data"), - wr.P( - "The following section is showing a simple explorative data analysis of the used dataset. The first histogram shows the distribution of peptide lengths in the data set, while the second histogram shows the distribution of indexed retention times." 
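A minimal usage sketch for this report class with dummy targets and predictions (no training history, so only the prediction-based plots are generated):

import numpy as np
from dlomix.reports.RetentionTimeReport import RetentionTimeReport

report = RetentionTimeReport(output_path="./rt_report", history=None)
targets = np.array([10.0, 25.0, 40.0])
predictions = np.array([12.0, 24.0, 39.0])
report.generate_report(targets, predictions)  # writes iRT_Report.pdf into output_path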
 diff --git a/dlomix/reports/RetentionTimeReportModelComparisonWandb.py b/dlomix/reports/RetentionTimeReportModelComparisonWandb.py deleted file mode 100644 index d6ee86e61bbc095e31476909c452b1636cb5ffbd..0000000000000000000000000000000000000000 --- a/dlomix/reports/RetentionTimeReportModelComparisonWandb.py +++ /dev/null @@ -1,321 +0,0 @@ -import os -import re - -import numpy as np -import pandas as pd -import wandb -import wandb.apis.reports as wr - -from ..data.RetentionTimeDataset import RetentionTimeDataset - - -class RetentionTimeReportModelComparisonWandb: - - # Wilhelmlab WandB account that has all VEGA presets required for the reports - VEGA_LITE_PRESETS_ID = "prosit-compms" - - def __init__( - self, - models: dict, - project: str, - title: str, - description: str, - test_dataset: RetentionTimeDataset, - ): - """Creates a WandB report for comparing models. - - Args: - models (dict): keys are model names, values are model objects - project (str): Name of the project. - title (str): Title of the report. - description (str): Description of the report. - test_dataset (RetentionTimeDataset): Test dataset object to compare predictions of models on. - """ - self.project = project - self.title = title - self.description = description - self.models = models - self.test_dataset = test_dataset - self.entity = wandb.apis.PublicApi().default_entity - self.api = wandb.Api() - - def create_report( - self, - add_data_section=True, - add_residuals_section=True, - add_r2_section=True, - add_density_section=True, - ): - """Creates the report in wandb_run. - - Args: - add_data_section (bool, optional): Add a section for input data to the report. Defaults to True. - add_residuals_section (bool, optional): Add a section for residual plots. Defaults to True. - add_r2_section (bool, optional): Add a section for the R2 metric. Defaults to True. - add_density_section (bool, optional): Add a section for the density plot. Defaults to True. - """ - report = wr.Report( - project=self.project, title=self.title, description=self.description - ) - - report.blocks = [wr.TableOfContents()] - - if add_data_section: - report.blocks += self._build_data_section() - if add_residuals_section: - report.blocks += self._build_residuals_section() - if add_r2_section: - report.blocks += self._build_r2_section() - if add_density_section: - report.blocks += self._build_density_section() - - report.save() - - def calculate_r2(self, targets, predictions): - from sklearn.metrics import r2_score - - r2 = r2_score(targets, predictions) - return r2 - - def calculate_residuals(self, targets, predictions): - residuals = predictions - targets - return residuals - - def _build_data_section(self): - data_block = [ - wr.H1(text="Data"), - wr.P( - "The following section shows a simple exploratory data analysis of the used dataset. The first histogram shows the distribution of peptide lengths in the data set, while the second shows the distribution of indexed retention times." - ), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=[ - wr.CustomChart( - query={"summaryTable": {"tableKey": self.table_key_len}}, - chart_name=f"{RetentionTimeReportModelComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_peptide_length", - chart_fields={"value": self.test_dataset.sequence_col}, - ), - wr.CustomChart( - query={"summaryTable": {"tableKey": self.table_key_rt}}, - chart_name=f"{RetentionTimeReportModelComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_irt", - chart_fields={"value": self.test_dataset.target_col}, - ), - ], - ), - wr.HorizontalRule(), - ] - return data_block - - def _build_residuals_section(self): - panel_list_models = [] - for model in self.models: - panel_list_models.append( - wr.CustomChart( - query={"summaryTable": {"tableKey": f"results_table_{model}"}}, - chart_name=f"{RetentionTimeReportModelComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_residuals_irt", - chart_fields={"value": "residuals", "name": model}, - ) - ) - - residuals_block = [ - wr.H1(text="Residuals"), - wr.P( - "This section shows the residuals histograms. Each plot shows the residuals of one of the compared models." - ), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=panel_list_models, - ), - wr.HorizontalRule(), - ] - - return residuals_block - - def _build_r2_section(self): - r2_block = [ - wr.H1(text="R2"), - wr.P("The following plot displays the R2 score for all the compared models."), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=[ - wr.BarPlot( - title="R2", - metrics=["r2"], - orientation="h", - title_x="R2", - max_runs_to_show=20, - max_bars_to_show=20, - font_size="auto", - ), - ], - ), - wr.HorizontalRule(), - ] - return r2_block - - def _build_density_section(self, irt_delta95=5): - panel_list_models = [] - targets = self.test_dataset.get_split_targets( - split=self.test_dataset.main_split - ) - for model in self.models: - panel_list_models.append( - wr.CustomChart( - query={"summaryTable": {"tableKey": f"results_table_{model}"}}, - chart_name=f"{RetentionTimeReportModelComparisonWandb.VEGA_LITE_PRESETS_ID}/density_plot_irt", - chart_fields={ - "measured": "irt", - "predicted": "predicted_irt", - "name": model, - "irt_delta95": irt_delta95, - }, - ) - ) - - density_block = [ - wr.H1(text="Density"), - wr.P( - "This section displays the density plots for all compared models."
- ), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=panel_list_models, - ), - wr.HorizontalRule(), - ] - - return density_block - - def compare_models(self): - for model in self.models: - # initialize WANDB - current_model = model - wandb.init(project=self.project, name=current_model) - - # predict on test_dataset - predictions = self.models[model].predict(self.test_dataset.test_data) - predictions = predictions.ravel() - targets = self.test_dataset.get_split_targets( - split=self.test_dataset.main_split - ) - # create result df - results_df = pd.DataFrame( - { - "sequence": self.test_dataset.sequences, - "irt": targets, - "predicted_irt": predictions, - "residuals": self.calculate_residuals(targets, predictions), - } - ) - # log df as table to wandb_run - table = wandb.Table(dataframe=results_df) - wandb.log({f"results_table_{current_model}": table}) - - # log r2 to wandb_run - r2 = self.calculate_r2(targets, predictions) - wandb.log({"r2": r2}) - - # finish run - wandb.finish() - - # function to log sequence length table to wandb_run - def log_sequence_length_table( - self, data: pd.DataFrame, seq_col: str = "modified_sequence" - ): - name_hist = "counts_hist" - counts = self.count_seq_length(data, seq_col) - # convert to df for easier handling - counts_df = counts.to_frame() - table = wandb.Table(dataframe=counts_df) - # log to wandb_run - hist = wandb.plot_table( - vega_spec_name=f"{RetentionTimeReportModelComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_peptide_length", - data_table=table, - fields={"value": seq_col}, - ) - wandb.log({name_hist: hist}) - name_hist_table = name_hist + "_table" - return name_hist_table - - # function to count sequence length - def count_seq_length(self, data: pd.DataFrame, seq_col: str) -> pd.Series: - pattern = re.compile(r"\[UNIMOD:.*\]", re.IGNORECASE) - data[seq_col].replace(pattern, "", inplace=True) - return data[seq_col].str.len() - - # function to log retention time table to wandb_run - def log_rt_table(self, data: pd.DataFrame, rt_col: str = "indexed_retention_time"): - name_hist = "rt_hist" - rt = data.loc[:, rt_col] - # convert to df for easier handling - rt_df = rt.to_frame() - table = wandb.Table(dataframe=rt_df) - # log to wandb_run - hist = wandb.plot_table( - vega_spec_name=f"{RetentionTimeReportModelComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_irt", - data_table=table, - fields={"value": rt_col}, - ) - wandb.log({name_hist: hist}) - name_hist_table = name_hist + "_table" - return name_hist_table - - def log_data(self): - wandb.init(project=self.project, name="data_run") - # check if datasource is a string - if isinstance(self.test_dataset.data_source, str): - # read corresponding file - file_extension = os.path.splitext(self.test_dataset.data_source)[-1].lower() - - if file_extension == ".csv": - data = pd.read_csv(self.test_dataset.data_source) - if file_extension == ".json": - data = pd.read_json(self.test_dataset.data_source) - if file_extension == ".parquet": - data = pd.read_parquet( - self.test_dataset.data_source, engine="fastparquet" - ) - - self.table_key_len = self.log_sequence_length_table( - data, self.test_dataset.sequence_col - ) - self.table_key_rt = self.log_rt_table(data, self.test_dataset.target_col) - - # check if datasource is a tuple of two ndarrays or two lists - if ( - isinstance(self.test_dataset.data_source, tuple) - and all( - isinstance(item, (np.ndarray, list)) - for item in self.test_dataset.data_source - ) - and len(self.test_dataset.data_source) == 2 - ): - data = 
pd.DataFrame( - { - self.test_dataset.sequence_col: self.test_dataset.data_source[0], - self.test_dataset.target_col: self.test_dataset.data_source[1], - } - ) - self.table_key_len = self.log_sequence_length_table( - data, self.test_dataset.sequence_col - ) - self.table_key_rt = self.log_rt_table(data, self.test_dataset.target_col) - - # check if datasource is a single ndarray or list - # does not work? maybe error in RetentionTimeDataset - if isinstance(self.test_dataset.data_source, (np.ndarray, list)): - data = pd.DataFrame( - {self.test_dataset.sequence_col: self.test_dataset.data_source} - ) - self.table_key_len = self.log_sequence_length_table( - data, self.test_dataset.sequence_col - ) - wandb.finish() diff --git a/dlomix/reports/RetentionTimeReportRunComparisonWandb.py b/dlomix/reports/RetentionTimeReportRunComparisonWandb.py deleted file mode 100644 index 00e8a330d11cdaa4d63e03ffb0eaafddaa9deb77..0000000000000000000000000000000000000000 --- a/dlomix/reports/RetentionTimeReportRunComparisonWandb.py +++ /dev/null @@ -1,430 +0,0 @@ -import os -import re - -import numpy as np -import pandas as pd -import wandb -import wandb.apis.reports as wr -from wandb.keras import WandbCallback, WandbMetricsLogger - -from ..data import RetentionTimeDataset - -# ToDo: add R2 plot, TimeDelta plot, residuals - - -class RetentionTimeReportRunComparisonWandb: - - METRICS_TO_EXCLUDE = [ - "epoch/learning_rate", - "epoch/epoch", - "batch/learning_rate", - "batch/batch_step", - ] - - # Wilhelmlab WandB account that has all VEGA presets required for the reports - VEGA_LITE_PRESETS_ID = "prosit-compms" - - def __init__( - self, - project: str, - title: str, - description: str, - dataset: RetentionTimeDataset = None, - ): - """Create WandB report for comparing runs. - - Args: - project (str): Name of the project to be used in wandb_run. - title (str): Title of the report in wandb_run. - description (str): Description of the report in wandb_run. - dataset (RetentionTimeDataset, optional): The retention time dataset if logging the data is desired. Defaults to None, no logging of input data. - """ - self.project = project - self.title = title - self.description = description - self.dataset = dataset - self.entity = wandb.apis.PublicApi().default_entity - self.wandb_api = wandb.Api() - self.table_key_len = "" - self.table_key_rt = "" - self.model_info = [] - - def create_report( - self, - add_config_section=True, - add_data_section=True, - add_train_section=True, - add_val_section=True, - add_train_val_section=True, - add_model_section=True, - ): - """Create a report in wandb_run. - - Args: - add_config_section (bool, optional): Add a section for config parameters and the run to the report. Defaults to True. - add_data_section (bool, optional): Add a section for input data to the report. Defaults to True. - add_train_section (bool, optional): Add a section for training metrics to the report. Defaults to True. - add_val_section (bool, optional): Add a section for validation metrics to the report. Defaults to True. - add_train_val_section (bool, optional): Add a section for train-val metrics to the report. Defaults to True. - add_model_section (bool, optional): Add a section for model summary and number of parameters to the report. Defaults to True. 
- """ - report = wr.Report( - project=self.project, title=self.title, description=self.description - ) - - report.blocks = [wr.TableOfContents()] - - if add_model_section: - report.blocks += self._build_model_section() - if add_config_section: - report.blocks += self._build_config_section() - if add_data_section and self.dataset is not None: - report.blocks += self._build_data_section() - if add_train_section: - report.blocks += self._build_train_section() - if add_val_section: - report.blocks += self._build_val_section() - if add_train_val_section: - report.blocks += self._build_train_val_section() - - report.save() - - # get metrics of last run in project or from specified run_id - def _get_metrics(self, run_id=None): - if run_id: - # run is specified by <entity>/<project>/<run_id> - run = self.wandb_api.run(path=f"{self.entity}/{self.project}/{run_id}") - metrics_dataframe = run.history() - return metrics_dataframe - else: - # get metrics of latest run - runs = self.wandb_api.runs(path=f"{self.entity}/{self.project}") - run = runs[0] - metrics_dataframe = run.history() - return metrics_dataframe - - # get metric names split into train/val, train is further split into batch/epoch - def _get_metrics_names(self): - metrics = self._get_metrics() - # filter strings from list that are not starting with "_" and do not contain "val" - pre_filter = [string for string in metrics if not string.startswith("_")] - batch_train_metrics_names = [ - string - for string in pre_filter - if ("val" not in string.lower()) - & ("epoch" not in string.lower()) - & ("table" not in string.lower()) - ] - epoch_train_metrics_names = [ - string - for string in pre_filter - if ("val" not in string.lower()) - & ("batch" not in string.lower()) - & ("table" not in string.lower()) - ] - # filter strings from list that contain "val" - epoch_val_metrics_names = list(filter(lambda x: "val" in x.lower(), metrics)) - - # filter strings from train metrics that are 'epoch/learning_rate' and 'epoch/epoch' - strings_to_filter = RetentionTimeReportRunComparisonWandb.METRICS_TO_EXCLUDE - batch_train_metrics_names = [ - string - for string in batch_train_metrics_names - if string not in strings_to_filter - ] - epoch_train_metrics_names = [ - string - for string in epoch_train_metrics_names - if string not in strings_to_filter - ] - batch_train_metrics_names.sort() - epoch_train_metrics_names.sort() - return ( - batch_train_metrics_names, - epoch_train_metrics_names, - epoch_val_metrics_names, - ) - - def get_train_val_metrics_names(self): - ( - _, - epoch_train_metrics_names, - epoch_val_metrics_names, - ) = self._get_metrics_names() - epoch_train_metrics_names.sort() - epoch_val_metrics_names.sort() - return list(zip(epoch_train_metrics_names, epoch_val_metrics_names)) - - def _build_config_section(self): - config_block = [ - wr.H1(text="Config"), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=[wr.RunComparer(layout={"w": 24})], - ), - wr.HorizontalRule(), - ] - return config_block - - def _build_data_section(self): - data_block = [ - wr.H1(text="Data"), - wr.P( - "The following section is showing a simple explorative data analysis of the used dataset. The first histogram shows the distribution of peptide lengths in the data set, while the second histogram shows the distribution of indexed retention times." 
- ), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=[ - wr.CustomChart( - query={"summaryTable": {"tableKey": self.table_key_len}}, - chart_name=f"{RetentionTimeReportRunComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_peptide_length", - chart_fields={"value": self.dataset.sequence_col}, - ), - wr.CustomChart( - query={"summaryTable": {"tableKey": self.table_key_rt}}, - chart_name=f"{RetentionTimeReportRunComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_irt", - chart_fields={"value": self.dataset.target_col}, - ), - ], - ), - wr.HorizontalRule(), - ] - return data_block - - def _build_train_section(self): - ( - batch_train_metrics_names, - epoch_train_metrics_names, - _, - ) = self._get_metrics_names() - panel_list_batch = [] - panel_list_epoch = [] - for name in batch_train_metrics_names: - panel_list_batch.append(wr.LinePlot(x="Step", y=[name])) - for name in epoch_train_metrics_names: - panel_list_epoch.append(wr.LinePlot(x="Step", y=[name])) - train_block = [ - wr.H1(text="Training metrics"), - wr.P( - "The following section shows the different metrics that were used to track the training. All used metrics are added by default. The first subsection shows the metrics per batch, whereas the second subsection shows the metrics per epoch." - ), - wr.H2(text="per batch"), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=panel_list_batch, - ), - wr.H2(text="per epoch"), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=panel_list_epoch, - ), - wr.HorizontalRule(), - ] - return train_block - - def _build_val_section(self): - _, _, epoch_val_metrics_names = self._get_metrics_names() - panel_list_epoch = [] - for name in epoch_val_metrics_names: - panel_list_epoch.append(wr.LinePlot(x="Step", y=[name])) - val_block = [ - wr.H1(text="Validation metrics"), - wr.P( - "The following section shows the different metrics that were used to track the validation. All used metrics are added by default. The metrics are shown per epoch." - ), - wr.H2(text="per epoch"), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=panel_list_epoch, - ), - wr.HorizontalRule(), - ] - return val_block - - def _build_model_section(self): - model_block = [ - wr.H1(text="Model information"), - wr.P( - "The following section shows information about the model. The table below contains information about the model's layers." - ), - wr.UnorderedList(items=self.model_info), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=[wr.WeavePanelSummaryTable("layer_table")], - ), - wr.HorizontalRule(), - ] - return model_block - - def _build_train_val_section(self): - train_val_metrics_names = self.get_train_val_metrics_names() - panel_list_epoch = [] - for name in train_val_metrics_names: - panel_list_epoch.append(wr.LinePlot(x="Step", y=list(name))) - train_val_block = [ - wr.H1(text="Train - Validation metrics"), - wr.P( - "The following section shows the different metrics for both training and validation in comparison. All used metrics are added by default. The metrics are shown per epoch." 
- ), - wr.H2(text="per epoch"), - wr.PanelGrid( - runsets=[ - wr.Runset(self.entity, self.project), - ], - panels=panel_list_epoch, - ), - wr.HorizontalRule(), - ] - return train_val_block - - def log_sequence_length_table( - self, data: pd.DataFrame, seq_col: str = "modified_sequence" - ): - """Log a sequence length table to wandb. - - Args: - data (pd.DataFrame): input data - seq_col (str, optional): Name of the column containing the sequences in the data frame. Defaults to "modified_sequence". - - Returns: - str: Name of the histogram table created by wandb after logging the data. - """ - name_hist = "counts_hist" - counts = self.count_seq_length(data, seq_col) - # convert to df for easier handling - counts_df = counts.to_frame() - table = wandb.Table(dataframe=counts_df) - # log to wandb - hist = wandb.plot_table( - vega_spec_name=f"{RetentionTimeReportRunComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_peptide_length", - data_table=table, - fields={"value": seq_col}, - ) - wandb.log({name_hist: hist}) - name_hist_table = name_hist + "_table" - return name_hist_table - - # count sequence lengths after stripping UNIMOD modification tags - def count_seq_length(self, data: pd.DataFrame, seq_col: str): - # non-greedy pattern so residues between two modifications are not removed; - # work on a copy instead of mutating the caller's data frame in place - pattern = re.compile(r"\[UNIMOD:.*?\]", re.IGNORECASE) - stripped_sequences = data[seq_col].str.replace(pattern, "", regex=True) - return stripped_sequences.str.len() - - # log a retention time table to wandb - def log_rt_table(self, data: pd.DataFrame, rt_col: str = "indexed_retention_time"): - name_hist = "rt_hist" - rt = data.loc[:, rt_col] - # convert to df for easier handling - rt_df = rt.to_frame() - table = wandb.Table(dataframe=rt_df) - # log to wandb - hist = wandb.plot_table( - vega_spec_name=f"{RetentionTimeReportRunComparisonWandb.VEGA_LITE_PRESETS_ID}/histogram_irt", - data_table=table, - fields={"value": rt_col}, - ) - wandb.log({name_hist: hist}) - name_hist_table = name_hist + "_table" - return name_hist_table - 
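- # Illustrative usage sketch (not part of the original file; assumes an active wandb run - # and a DataFrame `df` with the respective columns): - # key_len = report.log_sequence_length_table(df, seq_col="modified_sequence") - # key_rt = report.log_rt_table(df, rt_col="indexed_retention_time") - # Both methods return the summary-table key ("counts_hist_table" / "rt_hist_table") that - # _build_data_section later references when querying the custom charts. - 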
- def log_data(self): - # check if the data source is a file path string - if isinstance(self.dataset.data_source, str): - # read the corresponding file - file_extension = os.path.splitext(self.dataset.data_source)[-1].lower() - - if file_extension == ".csv": - data = pd.read_csv(self.dataset.data_source) - if file_extension == ".json": - data = pd.read_json(self.dataset.data_source) - if file_extension == ".parquet": - data = pd.read_parquet(self.dataset.data_source, engine="fastparquet") - self.table_key_len = self.log_sequence_length_table( - data, self.dataset.sequence_col - ) - self.table_key_rt = self.log_rt_table(data, self.dataset.target_col) - - # check if the data source is a tuple of two ndarrays or two lists (sequences, targets) - if ( - isinstance(self.dataset.data_source, tuple) - and all( - isinstance(item, (np.ndarray, list)) - for item in self.dataset.data_source - ) - and len(self.dataset.data_source) == 2 - ): - data = pd.DataFrame( - { - self.dataset.sequence_col: self.dataset.data_source[0], - self.dataset.target_col: self.dataset.data_source[1], - } - ) - self.table_key_len = self.log_sequence_length_table( - data, self.dataset.sequence_col - ) - self.table_key_rt = self.log_rt_table(data, self.dataset.target_col) - - # handle a single ndarray or list as data source - # NOTE: this branch may not work as expected, possibly due to an issue in RetentionTimeDataset - if isinstance(self.dataset.data_source, (np.ndarray, list)): - data = pd.DataFrame({self.dataset.sequence_col: self.dataset.data_source}) - self.table_key_len = self.log_sequence_length_table( - data, self.dataset.sequence_col - ) - - def log_model_data(self, model): - import io - model_summary_buffer = io.StringIO() - model.summary(print_fn=lambda x: model_summary_buffer.write(x + "<br>")) - model_summary_lines = model_summary_buffer.getvalue().split("<br>") - - lines = [line.rstrip() for line in model_summary_lines] - - # remove formatting lines - strings_to_remove = ["____", "===="] - cleaned_list = [ - item - for item in lines - if not any(string in item for string in strings_to_remove) - ] - - # split each line into tokens on runs of two or more whitespace characters - words = [] - for line in cleaned_list: - words.append(re.split(r"\s{2,}", line)) - - # keep only lines whose tokens are all longer than three characters - filtered_list_of_lists = [ - sublist for sublist in words if all(len(item) > 3 for item in sublist) - ] - - # extract layer info (multi-token lines) and model info (single-token lines) - layer_info = [sublist for sublist in filtered_list_of_lists if len(sublist) > 2] - model_info = [sublist for sublist in filtered_list_of_lists if len(sublist) < 2] - - # flatten model_info and filter entries with length smaller than 5 - model_info_flat = [item for sublist in model_info for item in sublist] - model_info_flat_filtered = [item for item in model_info_flat if len(item) >= 5] - - # create layer_info_df - column_names = layer_info[0] - layer_info_df = pd.DataFrame(layer_info[1:], columns=column_names) - - # log layer_table to wandb - layer_table = wandb.Table(dataframe=layer_info_df) - wandb.log({"layer_table": layer_table}) - - # attach model_info to object - self.model_info = model_info_flat_filtered diff --git a/dlomix/reports/__init__.py b/dlomix/reports/__init__.py deleted file mode 100644 index 8a71bc47eaf4ede089e5759380f7e81ea1989961..0000000000000000000000000000000000000000 --- a/dlomix/reports/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .IntensityReport import IntensityReport -from .RetentionTimeReport import RetentionTimeReport -from .RetentionTimeReportModelComparisonWandb import ( - RetentionTimeReportModelComparisonWandb, -) -from .RetentionTimeReportRunComparisonWandb import RetentionTimeReportRunComparisonWandb - -__all__ = ["RetentionTimeReport", - "IntensityReport", - "RetentionTimeReportRunComparisonWandb", - "RetentionTimeReportModelComparisonWandb", - ] diff --git a/dlomix/reports/postprocessing.py b/dlomix/reports/postprocessing.py deleted file mode 100644 index a75316fe28b09bae36dcf48b1ea142989351a4d6..0000000000000000000000000000000000000000 --- a/dlomix/reports/postprocessing.py +++ /dev/null @@ -1,101 +0,0 @@ -import functools - -import numpy as np -import tensorflow as tf - -from ..losses import masked_spectral_distance - - -def reshape_dims(array): - # reshape flat [n, 174] intensity vectors to [n, 30 - 1 positions, 2 ion types, nlosses, 3 charges] - n, dims = array.shape - assert dims == 174 - nlosses = 1 - return array.reshape([n, 30 - 1, 2, nlosses, 3]) - - -def reshape_flat(array): - # flatten all dimensions after the first back into a single axis - s = array.shape - flat_dim = [s[0], functools.reduce(lambda x, y: x * y, s[1:], 1)] - return array.reshape(flat_dim) - - -def normalize_base_peak(array): - # expects a flat [n, features] array; divide each spectrum by its base (maximum) peak - maxima = array.max(axis=1) - array = array / maxima[:, np.newaxis] - return array - - -def mask_outofrange(array, lengths, mask=-1.0): - # expects the reshaped 5-dim array; a peptide of length L has only L - 1 fragment positions, so mask the rest - for i in range(array.shape[0]): - array[i, lengths[i] - 1 :, :, :, :] = mask - return array - 
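- # Illustrative example of the masking convention (a sketch, not from the original file): - # with lengths=[10], positions 9..28 of a [1, 29, 2, 1, 3] array are set to -1.0, - # since a peptide of length 10 has only 9 backbone fragmentation sites: - # arr = np.zeros([1, 29, 2, 1, 3]) - # masked = mask_outofrange(arr, lengths=[10]) - # assert (masked[0, 9:] == -1.0).all() and (masked[0, :9] == 0.0).all() - 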
-def mask_outofcharge(array, charges, mask=-1.0): - # expects the reshaped 5-dim array; mask fragment charge states above the precursor charge - for i in range(array.shape[0]): - if charges[i] < 3: - array[i, :, :, :, charges[i] :] = mask - return array - - -def get_spectral_angle(true, pred, batch_size=600): - # spectral angle = 1 - masked spectral distance, computed in batches - # (legacy TF1-style graph/session execution) - n = true.shape[0] - sa = np.zeros([n]) - - def iterate(): - if n > batch_size: - for i in range(n // batch_size): - true_sample = true[i * batch_size : (i + 1) * batch_size] - pred_sample = pred[i * batch_size : (i + 1) * batch_size] - yield i, true_sample, pred_sample - i = n // batch_size - yield i, true[(i) * batch_size :], pred[(i) * batch_size :] - else: - yield 0, true, pred - - for i, t_b, p_b in iterate(): - tf.compat.v1.reset_default_graph() - with tf.compat.v1.Session() as s: - sa_graph = masked_spectral_distance(t_b, p_b) - sa_b = 1 - s.run(sa_graph) - sa[i * batch_size : i * batch_size + sa_b.shape[0]] = sa_b - sa = np.nan_to_num(sa) - return sa - - -def normalize_intensity_predictions(data, batch_size=600): - assert ( - "sequences" in data - ), "Key sequences is missing in the data provided for post-processing" - assert ( - "intensities_pred" in data - ), "Key intensities_pred is missing in the data provided for post-processing" - assert ( - "precursor_charge_onehot" in data - ), "Key precursor_charge_onehot is missing in the data provided for post-processing" - - sequence_lengths = data["sequences"].apply(lambda x: len(x)) - intensities = np.stack(data["intensities_pred"].to_numpy()).astype(np.float32) - precursor_charge_onehot = np.stack(data["precursor_charge_onehot"].to_numpy()) - charges = list(precursor_charge_onehot.argmax(axis=1) + 1) - - intensities[intensities < 0] = 0 - intensities = reshape_dims(intensities) - intensities = mask_outofrange(intensities, sequence_lengths) - intensities = mask_outofcharge(intensities, charges) - intensities = reshape_flat(intensities) - # base-peak normalize, keeping masked (-1) positions untouched - m_idx = intensities == -1 - intensities = normalize_base_peak(intensities) - intensities[m_idx] = -1 - data["intensities_pred"] = intensities - - if "intensities_raw" in data: - data["spectral_angle"] = get_spectral_angle( - np.stack(data["intensities_raw"].to_numpy()).astype(np.float32), - intensities, - batch_size=batch_size, - ) - return data diff --git a/dlomix/utils.py b/dlomix/utils.py deleted file mode 100644 index e2dbb7756ebcb8eda78537ea805cd3915b3cbe49..0000000000000000000000000000000000000000 --- a/dlomix/utils.py +++ /dev/null @@ -1,48 +0,0 @@ -import pickle - -import numpy as np - - -def save_obj(obj, name): - with open(name + ".pkl", "wb") as f: - pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) - - -def load_obj(name): - with open(name + ".pkl", "rb") as f: - return pickle.load(f) - - -def convert_nested_list_to_numpy_array(nested_list, dtype=np.float32): - return np.array([np.array(x, dtype=dtype) for x in nested_list]) - - -def lower_and_trim_strings(strings): - # str.strip() is the Python equivalent of trim(); str has no trim() method - return [s.lower().strip() for s in strings] - - -def get_constructor_call_object_creation(object_instance): - members = [ - attr - for attr in vars(object_instance) - if not callable(getattr(object_instance, attr)) - and not attr.startswith(("_", "__")) - ] - values = [getattr(object_instance, m) for m in members] - - repr_str = ", ".join([f"{m}={v}" for m, v in zip(members, values)]) - - return f"{object_instance.__class__.__name__}({repr_str})" - - -def flatten_dict_for_values(d): - # recursively collect the leaf values of a (possibly nested) dict - if not isinstance(d, dict): - return d - items = [] - for v in d.values(): - if isinstance(v, dict): - items.extend(flatten_dict_for_values(v)) - else: - items.append(v) - return items diff --git a/fpdf/font/courier.php b/fpdf/font/courier.php deleted file mode 100644 index 
67dbedaa01e783379e8708d5a0fe48a7506fbce6..0000000000000000000000000000000000000000 --- a/fpdf/font/courier.php +++ /dev/null @@ -1,10 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Courier'; -$up = -100; -$ut = 50; -for($i=0;$i<=255;$i++) - $cw[chr($i)] = 600; -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/courierb.php b/fpdf/font/courierb.php deleted file mode 100644 index 62550a4c5b457e57d3f21002202e28f1fd31b4aa..0000000000000000000000000000000000000000 --- a/fpdf/font/courierb.php +++ /dev/null @@ -1,10 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Courier-Bold'; -$up = -100; -$ut = 50; -for($i=0;$i<=255;$i++) - $cw[chr($i)] = 600; -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/courierbi.php b/fpdf/font/courierbi.php deleted file mode 100644 index 6a3ecc62df4c6bd1bf15f7292ce7fc6a1c7c04ab..0000000000000000000000000000000000000000 --- a/fpdf/font/courierbi.php +++ /dev/null @@ -1,10 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Courier-BoldOblique'; -$up = -100; -$ut = 50; -for($i=0;$i<=255;$i++) - $cw[chr($i)] = 600; -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/courieri.php b/fpdf/font/courieri.php deleted file mode 100644 index b88e0980904c487567d0474759b1c055abb07c30..0000000000000000000000000000000000000000 --- a/fpdf/font/courieri.php +++ /dev/null @@ -1,10 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Courier-Oblique'; -$up = -100; -$ut = 50; -for($i=0;$i<=255;$i++) - $cw[chr($i)] = 600; -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/helvetica.php b/fpdf/font/helvetica.php deleted file mode 100644 index 2be3eca1478e7c46fc8e76b295ca3c631131fe5d..0000000000000000000000000000000000000000 --- a/fpdf/font/helvetica.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Helvetica'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>278,chr(1)=>278,chr(2)=>278,chr(3)=>278,chr(4)=>278,chr(5)=>278,chr(6)=>278,chr(7)=>278,chr(8)=>278,chr(9)=>278,chr(10)=>278,chr(11)=>278,chr(12)=>278,chr(13)=>278,chr(14)=>278,chr(15)=>278,chr(16)=>278,chr(17)=>278,chr(18)=>278,chr(19)=>278,chr(20)=>278,chr(21)=>278, - chr(22)=>278,chr(23)=>278,chr(24)=>278,chr(25)=>278,chr(26)=>278,chr(27)=>278,chr(28)=>278,chr(29)=>278,chr(30)=>278,chr(31)=>278,' '=>278,'!'=>278,'"'=>355,'#'=>556,'$'=>556,'%'=>889,'&'=>667,'\''=>191,'('=>333,')'=>333,'*'=>389,'+'=>584, - 
','=>278,'-'=>333,'.'=>278,'/'=>278,'0'=>556,'1'=>556,'2'=>556,'3'=>556,'4'=>556,'5'=>556,'6'=>556,'7'=>556,'8'=>556,'9'=>556,':'=>278,';'=>278,'<'=>584,'='=>584,'>'=>584,'?'=>556,'@'=>1015,'A'=>667, - 'B'=>667,'C'=>722,'D'=>722,'E'=>667,'F'=>611,'G'=>778,'H'=>722,'I'=>278,'J'=>500,'K'=>667,'L'=>556,'M'=>833,'N'=>722,'O'=>778,'P'=>667,'Q'=>778,'R'=>722,'S'=>667,'T'=>611,'U'=>722,'V'=>667,'W'=>944, - 'X'=>667,'Y'=>667,'Z'=>611,'['=>278,'\\'=>278,']'=>278,'^'=>469,'_'=>556,'`'=>333,'a'=>556,'b'=>556,'c'=>500,'d'=>556,'e'=>556,'f'=>278,'g'=>556,'h'=>556,'i'=>222,'j'=>222,'k'=>500,'l'=>222,'m'=>833, - 'n'=>556,'o'=>556,'p'=>556,'q'=>556,'r'=>333,'s'=>500,'t'=>278,'u'=>556,'v'=>500,'w'=>722,'x'=>500,'y'=>500,'z'=>500,'{'=>334,'|'=>260,'}'=>334,'~'=>584,chr(127)=>350,chr(128)=>556,chr(129)=>350,chr(130)=>222,chr(131)=>556, - chr(132)=>333,chr(133)=>1000,chr(134)=>556,chr(135)=>556,chr(136)=>333,chr(137)=>1000,chr(138)=>667,chr(139)=>333,chr(140)=>1000,chr(141)=>350,chr(142)=>611,chr(143)=>350,chr(144)=>350,chr(145)=>222,chr(146)=>222,chr(147)=>333,chr(148)=>333,chr(149)=>350,chr(150)=>556,chr(151)=>1000,chr(152)=>333,chr(153)=>1000, - chr(154)=>500,chr(155)=>333,chr(156)=>944,chr(157)=>350,chr(158)=>500,chr(159)=>667,chr(160)=>278,chr(161)=>333,chr(162)=>556,chr(163)=>556,chr(164)=>556,chr(165)=>556,chr(166)=>260,chr(167)=>556,chr(168)=>333,chr(169)=>737,chr(170)=>370,chr(171)=>556,chr(172)=>584,chr(173)=>333,chr(174)=>737,chr(175)=>333, - chr(176)=>400,chr(177)=>584,chr(178)=>333,chr(179)=>333,chr(180)=>333,chr(181)=>556,chr(182)=>537,chr(183)=>278,chr(184)=>333,chr(185)=>333,chr(186)=>365,chr(187)=>556,chr(188)=>834,chr(189)=>834,chr(190)=>834,chr(191)=>611,chr(192)=>667,chr(193)=>667,chr(194)=>667,chr(195)=>667,chr(196)=>667,chr(197)=>667, - chr(198)=>1000,chr(199)=>722,chr(200)=>667,chr(201)=>667,chr(202)=>667,chr(203)=>667,chr(204)=>278,chr(205)=>278,chr(206)=>278,chr(207)=>278,chr(208)=>722,chr(209)=>722,chr(210)=>778,chr(211)=>778,chr(212)=>778,chr(213)=>778,chr(214)=>778,chr(215)=>584,chr(216)=>778,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>667,chr(222)=>667,chr(223)=>611,chr(224)=>556,chr(225)=>556,chr(226)=>556,chr(227)=>556,chr(228)=>556,chr(229)=>556,chr(230)=>889,chr(231)=>500,chr(232)=>556,chr(233)=>556,chr(234)=>556,chr(235)=>556,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>556,chr(241)=>556, - chr(242)=>556,chr(243)=>556,chr(244)=>556,chr(245)=>556,chr(246)=>556,chr(247)=>584,chr(248)=>611,chr(249)=>556,chr(250)=>556,chr(251)=>556,chr(252)=>556,chr(253)=>500,chr(254)=>556,chr(255)=>500); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/helveticab.php b/fpdf/font/helveticab.php deleted file mode 100644 index c88394ce4f03f8c43c2b931789a0d783893d7188..0000000000000000000000000000000000000000 --- a/fpdf/font/helveticab.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Helvetica-Bold'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>278,chr(1)=>278,chr(2)=>278,chr(3)=>278,chr(4)=>278,chr(5)=>278,chr(6)=>278,chr(7)=>278,chr(8)=>278,chr(9)=>278,chr(10)=>278,chr(11)=>278,chr(12)=>278,chr(13)=>278,chr(14)=>278,chr(15)=>278,chr(16)=>278,chr(17)=>278,chr(18)=>278,chr(19)=>278,chr(20)=>278,chr(21)=>278, - 
chr(22)=>278,chr(23)=>278,chr(24)=>278,chr(25)=>278,chr(26)=>278,chr(27)=>278,chr(28)=>278,chr(29)=>278,chr(30)=>278,chr(31)=>278,' '=>278,'!'=>333,'"'=>474,'#'=>556,'$'=>556,'%'=>889,'&'=>722,'\''=>238,'('=>333,')'=>333,'*'=>389,'+'=>584, - ','=>278,'-'=>333,'.'=>278,'/'=>278,'0'=>556,'1'=>556,'2'=>556,'3'=>556,'4'=>556,'5'=>556,'6'=>556,'7'=>556,'8'=>556,'9'=>556,':'=>333,';'=>333,'<'=>584,'='=>584,'>'=>584,'?'=>611,'@'=>975,'A'=>722, - 'B'=>722,'C'=>722,'D'=>722,'E'=>667,'F'=>611,'G'=>778,'H'=>722,'I'=>278,'J'=>556,'K'=>722,'L'=>611,'M'=>833,'N'=>722,'O'=>778,'P'=>667,'Q'=>778,'R'=>722,'S'=>667,'T'=>611,'U'=>722,'V'=>667,'W'=>944, - 'X'=>667,'Y'=>667,'Z'=>611,'['=>333,'\\'=>278,']'=>333,'^'=>584,'_'=>556,'`'=>333,'a'=>556,'b'=>611,'c'=>556,'d'=>611,'e'=>556,'f'=>333,'g'=>611,'h'=>611,'i'=>278,'j'=>278,'k'=>556,'l'=>278,'m'=>889, - 'n'=>611,'o'=>611,'p'=>611,'q'=>611,'r'=>389,'s'=>556,'t'=>333,'u'=>611,'v'=>556,'w'=>778,'x'=>556,'y'=>556,'z'=>500,'{'=>389,'|'=>280,'}'=>389,'~'=>584,chr(127)=>350,chr(128)=>556,chr(129)=>350,chr(130)=>278,chr(131)=>556, - chr(132)=>500,chr(133)=>1000,chr(134)=>556,chr(135)=>556,chr(136)=>333,chr(137)=>1000,chr(138)=>667,chr(139)=>333,chr(140)=>1000,chr(141)=>350,chr(142)=>611,chr(143)=>350,chr(144)=>350,chr(145)=>278,chr(146)=>278,chr(147)=>500,chr(148)=>500,chr(149)=>350,chr(150)=>556,chr(151)=>1000,chr(152)=>333,chr(153)=>1000, - chr(154)=>556,chr(155)=>333,chr(156)=>944,chr(157)=>350,chr(158)=>500,chr(159)=>667,chr(160)=>278,chr(161)=>333,chr(162)=>556,chr(163)=>556,chr(164)=>556,chr(165)=>556,chr(166)=>280,chr(167)=>556,chr(168)=>333,chr(169)=>737,chr(170)=>370,chr(171)=>556,chr(172)=>584,chr(173)=>333,chr(174)=>737,chr(175)=>333, - chr(176)=>400,chr(177)=>584,chr(178)=>333,chr(179)=>333,chr(180)=>333,chr(181)=>611,chr(182)=>556,chr(183)=>278,chr(184)=>333,chr(185)=>333,chr(186)=>365,chr(187)=>556,chr(188)=>834,chr(189)=>834,chr(190)=>834,chr(191)=>611,chr(192)=>722,chr(193)=>722,chr(194)=>722,chr(195)=>722,chr(196)=>722,chr(197)=>722, - chr(198)=>1000,chr(199)=>722,chr(200)=>667,chr(201)=>667,chr(202)=>667,chr(203)=>667,chr(204)=>278,chr(205)=>278,chr(206)=>278,chr(207)=>278,chr(208)=>722,chr(209)=>722,chr(210)=>778,chr(211)=>778,chr(212)=>778,chr(213)=>778,chr(214)=>778,chr(215)=>584,chr(216)=>778,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>667,chr(222)=>667,chr(223)=>611,chr(224)=>556,chr(225)=>556,chr(226)=>556,chr(227)=>556,chr(228)=>556,chr(229)=>556,chr(230)=>889,chr(231)=>556,chr(232)=>556,chr(233)=>556,chr(234)=>556,chr(235)=>556,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>611,chr(241)=>611, - chr(242)=>611,chr(243)=>611,chr(244)=>611,chr(245)=>611,chr(246)=>611,chr(247)=>584,chr(248)=>611,chr(249)=>611,chr(250)=>611,chr(251)=>611,chr(252)=>611,chr(253)=>556,chr(254)=>611,chr(255)=>556); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/helveticabi.php b/fpdf/font/helveticabi.php deleted file mode 100644 index bcea8079071fa694833f151332406cff80a364f3..0000000000000000000000000000000000000000 --- a/fpdf/font/helveticabi.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Helvetica-BoldOblique'; -$up = -100; -$ut = 50; -$cw = array( - 
chr(0)=>278,chr(1)=>278,chr(2)=>278,chr(3)=>278,chr(4)=>278,chr(5)=>278,chr(6)=>278,chr(7)=>278,chr(8)=>278,chr(9)=>278,chr(10)=>278,chr(11)=>278,chr(12)=>278,chr(13)=>278,chr(14)=>278,chr(15)=>278,chr(16)=>278,chr(17)=>278,chr(18)=>278,chr(19)=>278,chr(20)=>278,chr(21)=>278, - chr(22)=>278,chr(23)=>278,chr(24)=>278,chr(25)=>278,chr(26)=>278,chr(27)=>278,chr(28)=>278,chr(29)=>278,chr(30)=>278,chr(31)=>278,' '=>278,'!'=>333,'"'=>474,'#'=>556,'$'=>556,'%'=>889,'&'=>722,'\''=>238,'('=>333,')'=>333,'*'=>389,'+'=>584, - ','=>278,'-'=>333,'.'=>278,'/'=>278,'0'=>556,'1'=>556,'2'=>556,'3'=>556,'4'=>556,'5'=>556,'6'=>556,'7'=>556,'8'=>556,'9'=>556,':'=>333,';'=>333,'<'=>584,'='=>584,'>'=>584,'?'=>611,'@'=>975,'A'=>722, - 'B'=>722,'C'=>722,'D'=>722,'E'=>667,'F'=>611,'G'=>778,'H'=>722,'I'=>278,'J'=>556,'K'=>722,'L'=>611,'M'=>833,'N'=>722,'O'=>778,'P'=>667,'Q'=>778,'R'=>722,'S'=>667,'T'=>611,'U'=>722,'V'=>667,'W'=>944, - 'X'=>667,'Y'=>667,'Z'=>611,'['=>333,'\\'=>278,']'=>333,'^'=>584,'_'=>556,'`'=>333,'a'=>556,'b'=>611,'c'=>556,'d'=>611,'e'=>556,'f'=>333,'g'=>611,'h'=>611,'i'=>278,'j'=>278,'k'=>556,'l'=>278,'m'=>889, - 'n'=>611,'o'=>611,'p'=>611,'q'=>611,'r'=>389,'s'=>556,'t'=>333,'u'=>611,'v'=>556,'w'=>778,'x'=>556,'y'=>556,'z'=>500,'{'=>389,'|'=>280,'}'=>389,'~'=>584,chr(127)=>350,chr(128)=>556,chr(129)=>350,chr(130)=>278,chr(131)=>556, - chr(132)=>500,chr(133)=>1000,chr(134)=>556,chr(135)=>556,chr(136)=>333,chr(137)=>1000,chr(138)=>667,chr(139)=>333,chr(140)=>1000,chr(141)=>350,chr(142)=>611,chr(143)=>350,chr(144)=>350,chr(145)=>278,chr(146)=>278,chr(147)=>500,chr(148)=>500,chr(149)=>350,chr(150)=>556,chr(151)=>1000,chr(152)=>333,chr(153)=>1000, - chr(154)=>556,chr(155)=>333,chr(156)=>944,chr(157)=>350,chr(158)=>500,chr(159)=>667,chr(160)=>278,chr(161)=>333,chr(162)=>556,chr(163)=>556,chr(164)=>556,chr(165)=>556,chr(166)=>280,chr(167)=>556,chr(168)=>333,chr(169)=>737,chr(170)=>370,chr(171)=>556,chr(172)=>584,chr(173)=>333,chr(174)=>737,chr(175)=>333, - chr(176)=>400,chr(177)=>584,chr(178)=>333,chr(179)=>333,chr(180)=>333,chr(181)=>611,chr(182)=>556,chr(183)=>278,chr(184)=>333,chr(185)=>333,chr(186)=>365,chr(187)=>556,chr(188)=>834,chr(189)=>834,chr(190)=>834,chr(191)=>611,chr(192)=>722,chr(193)=>722,chr(194)=>722,chr(195)=>722,chr(196)=>722,chr(197)=>722, - chr(198)=>1000,chr(199)=>722,chr(200)=>667,chr(201)=>667,chr(202)=>667,chr(203)=>667,chr(204)=>278,chr(205)=>278,chr(206)=>278,chr(207)=>278,chr(208)=>722,chr(209)=>722,chr(210)=>778,chr(211)=>778,chr(212)=>778,chr(213)=>778,chr(214)=>778,chr(215)=>584,chr(216)=>778,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>667,chr(222)=>667,chr(223)=>611,chr(224)=>556,chr(225)=>556,chr(226)=>556,chr(227)=>556,chr(228)=>556,chr(229)=>556,chr(230)=>889,chr(231)=>556,chr(232)=>556,chr(233)=>556,chr(234)=>556,chr(235)=>556,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>611,chr(241)=>611, - chr(242)=>611,chr(243)=>611,chr(244)=>611,chr(245)=>611,chr(246)=>611,chr(247)=>584,chr(248)=>611,chr(249)=>611,chr(250)=>611,chr(251)=>611,chr(252)=>611,chr(253)=>556,chr(254)=>611,chr(255)=>556); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/helveticai.php b/fpdf/font/helveticai.php deleted file mode 100644 index 
a328b046a5d226b123248e10bd59e5e2b8d905cb..0000000000000000000000000000000000000000 --- a/fpdf/font/helveticai.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Helvetica-Oblique'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>278,chr(1)=>278,chr(2)=>278,chr(3)=>278,chr(4)=>278,chr(5)=>278,chr(6)=>278,chr(7)=>278,chr(8)=>278,chr(9)=>278,chr(10)=>278,chr(11)=>278,chr(12)=>278,chr(13)=>278,chr(14)=>278,chr(15)=>278,chr(16)=>278,chr(17)=>278,chr(18)=>278,chr(19)=>278,chr(20)=>278,chr(21)=>278, - chr(22)=>278,chr(23)=>278,chr(24)=>278,chr(25)=>278,chr(26)=>278,chr(27)=>278,chr(28)=>278,chr(29)=>278,chr(30)=>278,chr(31)=>278,' '=>278,'!'=>278,'"'=>355,'#'=>556,'$'=>556,'%'=>889,'&'=>667,'\''=>191,'('=>333,')'=>333,'*'=>389,'+'=>584, - ','=>278,'-'=>333,'.'=>278,'/'=>278,'0'=>556,'1'=>556,'2'=>556,'3'=>556,'4'=>556,'5'=>556,'6'=>556,'7'=>556,'8'=>556,'9'=>556,':'=>278,';'=>278,'<'=>584,'='=>584,'>'=>584,'?'=>556,'@'=>1015,'A'=>667, - 'B'=>667,'C'=>722,'D'=>722,'E'=>667,'F'=>611,'G'=>778,'H'=>722,'I'=>278,'J'=>500,'K'=>667,'L'=>556,'M'=>833,'N'=>722,'O'=>778,'P'=>667,'Q'=>778,'R'=>722,'S'=>667,'T'=>611,'U'=>722,'V'=>667,'W'=>944, - 'X'=>667,'Y'=>667,'Z'=>611,'['=>278,'\\'=>278,']'=>278,'^'=>469,'_'=>556,'`'=>333,'a'=>556,'b'=>556,'c'=>500,'d'=>556,'e'=>556,'f'=>278,'g'=>556,'h'=>556,'i'=>222,'j'=>222,'k'=>500,'l'=>222,'m'=>833, - 'n'=>556,'o'=>556,'p'=>556,'q'=>556,'r'=>333,'s'=>500,'t'=>278,'u'=>556,'v'=>500,'w'=>722,'x'=>500,'y'=>500,'z'=>500,'{'=>334,'|'=>260,'}'=>334,'~'=>584,chr(127)=>350,chr(128)=>556,chr(129)=>350,chr(130)=>222,chr(131)=>556, - chr(132)=>333,chr(133)=>1000,chr(134)=>556,chr(135)=>556,chr(136)=>333,chr(137)=>1000,chr(138)=>667,chr(139)=>333,chr(140)=>1000,chr(141)=>350,chr(142)=>611,chr(143)=>350,chr(144)=>350,chr(145)=>222,chr(146)=>222,chr(147)=>333,chr(148)=>333,chr(149)=>350,chr(150)=>556,chr(151)=>1000,chr(152)=>333,chr(153)=>1000, - chr(154)=>500,chr(155)=>333,chr(156)=>944,chr(157)=>350,chr(158)=>500,chr(159)=>667,chr(160)=>278,chr(161)=>333,chr(162)=>556,chr(163)=>556,chr(164)=>556,chr(165)=>556,chr(166)=>260,chr(167)=>556,chr(168)=>333,chr(169)=>737,chr(170)=>370,chr(171)=>556,chr(172)=>584,chr(173)=>333,chr(174)=>737,chr(175)=>333, - chr(176)=>400,chr(177)=>584,chr(178)=>333,chr(179)=>333,chr(180)=>333,chr(181)=>556,chr(182)=>537,chr(183)=>278,chr(184)=>333,chr(185)=>333,chr(186)=>365,chr(187)=>556,chr(188)=>834,chr(189)=>834,chr(190)=>834,chr(191)=>611,chr(192)=>667,chr(193)=>667,chr(194)=>667,chr(195)=>667,chr(196)=>667,chr(197)=>667, - chr(198)=>1000,chr(199)=>722,chr(200)=>667,chr(201)=>667,chr(202)=>667,chr(203)=>667,chr(204)=>278,chr(205)=>278,chr(206)=>278,chr(207)=>278,chr(208)=>722,chr(209)=>722,chr(210)=>778,chr(211)=>778,chr(212)=>778,chr(213)=>778,chr(214)=>778,chr(215)=>584,chr(216)=>778,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>667,chr(222)=>667,chr(223)=>611,chr(224)=>556,chr(225)=>556,chr(226)=>556,chr(227)=>556,chr(228)=>556,chr(229)=>556,chr(230)=>889,chr(231)=>500,chr(232)=>556,chr(233)=>556,chr(234)=>556,chr(235)=>556,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>556,chr(241)=>556, - chr(242)=>556,chr(243)=>556,chr(244)=>556,chr(245)=>556,chr(246)=>556,chr(247)=>584,chr(248)=>611,chr(249)=>556,chr(250)=>556,chr(251)=>556,chr(252)=>556,chr(253)=>500,chr(254)=>556,chr(255)=>500); -$enc = 'cp1252'; -$uv = 
array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/symbol.php b/fpdf/font/symbol.php deleted file mode 100644 index 5b9147bd6ae1b04e55ea62e82e21a649554366b3..0000000000000000000000000000000000000000 --- a/fpdf/font/symbol.php +++ /dev/null @@ -1,20 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Symbol'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>250,chr(1)=>250,chr(2)=>250,chr(3)=>250,chr(4)=>250,chr(5)=>250,chr(6)=>250,chr(7)=>250,chr(8)=>250,chr(9)=>250,chr(10)=>250,chr(11)=>250,chr(12)=>250,chr(13)=>250,chr(14)=>250,chr(15)=>250,chr(16)=>250,chr(17)=>250,chr(18)=>250,chr(19)=>250,chr(20)=>250,chr(21)=>250, - chr(22)=>250,chr(23)=>250,chr(24)=>250,chr(25)=>250,chr(26)=>250,chr(27)=>250,chr(28)=>250,chr(29)=>250,chr(30)=>250,chr(31)=>250,' '=>250,'!'=>333,'"'=>713,'#'=>500,'$'=>549,'%'=>833,'&'=>778,'\''=>439,'('=>333,')'=>333,'*'=>500,'+'=>549, - ','=>250,'-'=>549,'.'=>250,'/'=>278,'0'=>500,'1'=>500,'2'=>500,'3'=>500,'4'=>500,'5'=>500,'6'=>500,'7'=>500,'8'=>500,'9'=>500,':'=>278,';'=>278,'<'=>549,'='=>549,'>'=>549,'?'=>444,'@'=>549,'A'=>722, - 'B'=>667,'C'=>722,'D'=>612,'E'=>611,'F'=>763,'G'=>603,'H'=>722,'I'=>333,'J'=>631,'K'=>722,'L'=>686,'M'=>889,'N'=>722,'O'=>722,'P'=>768,'Q'=>741,'R'=>556,'S'=>592,'T'=>611,'U'=>690,'V'=>439,'W'=>768, - 'X'=>645,'Y'=>795,'Z'=>611,'['=>333,'\\'=>863,']'=>333,'^'=>658,'_'=>500,'`'=>500,'a'=>631,'b'=>549,'c'=>549,'d'=>494,'e'=>439,'f'=>521,'g'=>411,'h'=>603,'i'=>329,'j'=>603,'k'=>549,'l'=>549,'m'=>576, - 'n'=>521,'o'=>549,'p'=>549,'q'=>521,'r'=>549,'s'=>603,'t'=>439,'u'=>576,'v'=>713,'w'=>686,'x'=>493,'y'=>686,'z'=>494,'{'=>480,'|'=>200,'}'=>480,'~'=>549,chr(127)=>0,chr(128)=>0,chr(129)=>0,chr(130)=>0,chr(131)=>0, - chr(132)=>0,chr(133)=>0,chr(134)=>0,chr(135)=>0,chr(136)=>0,chr(137)=>0,chr(138)=>0,chr(139)=>0,chr(140)=>0,chr(141)=>0,chr(142)=>0,chr(143)=>0,chr(144)=>0,chr(145)=>0,chr(146)=>0,chr(147)=>0,chr(148)=>0,chr(149)=>0,chr(150)=>0,chr(151)=>0,chr(152)=>0,chr(153)=>0, - chr(154)=>0,chr(155)=>0,chr(156)=>0,chr(157)=>0,chr(158)=>0,chr(159)=>0,chr(160)=>750,chr(161)=>620,chr(162)=>247,chr(163)=>549,chr(164)=>167,chr(165)=>713,chr(166)=>500,chr(167)=>753,chr(168)=>753,chr(169)=>753,chr(170)=>753,chr(171)=>1042,chr(172)=>987,chr(173)=>603,chr(174)=>987,chr(175)=>603, - chr(176)=>400,chr(177)=>549,chr(178)=>411,chr(179)=>549,chr(180)=>549,chr(181)=>713,chr(182)=>494,chr(183)=>460,chr(184)=>549,chr(185)=>549,chr(186)=>549,chr(187)=>549,chr(188)=>1000,chr(189)=>603,chr(190)=>1000,chr(191)=>658,chr(192)=>823,chr(193)=>686,chr(194)=>795,chr(195)=>987,chr(196)=>768,chr(197)=>768, - chr(198)=>823,chr(199)=>768,chr(200)=>768,chr(201)=>713,chr(202)=>713,chr(203)=>713,chr(204)=>713,chr(205)=>713,chr(206)=>713,chr(207)=>713,chr(208)=>768,chr(209)=>713,chr(210)=>790,chr(211)=>790,chr(212)=>890,chr(213)=>823,chr(214)=>549,chr(215)=>250,chr(216)=>713,chr(217)=>603,chr(218)=>603,chr(219)=>1042, - chr(220)=>987,chr(221)=>603,chr(222)=>987,chr(223)=>603,chr(224)=>494,chr(225)=>329,chr(226)=>790,chr(227)=>790,chr(228)=>786,chr(229)=>713,chr(230)=>384,chr(231)=>384,chr(232)=>384,chr(233)=>384,chr(234)=>384,chr(235)=>384,chr(236)=>494,chr(237)=>494,chr(238)=>494,chr(239)=>494,chr(240)=>0,chr(241)=>329, - 
chr(242)=>274,chr(243)=>686,chr(244)=>686,chr(245)=>686,chr(246)=>384,chr(247)=>384,chr(248)=>384,chr(249)=>384,chr(250)=>384,chr(251)=>384,chr(252)=>494,chr(253)=>494,chr(254)=>494,chr(255)=>0); -$uv = array(32=>160,33=>33,34=>8704,35=>35,36=>8707,37=>array(37,2),39=>8715,40=>array(40,2),42=>8727,43=>array(43,2),45=>8722,46=>array(46,18),64=>8773,65=>array(913,2),67=>935,68=>array(916,2),70=>934,71=>915,72=>919,73=>921,74=>977,75=>array(922,4),79=>array(927,2),81=>920,82=>929,83=>array(931,3),86=>962,87=>937,88=>926,89=>936,90=>918,91=>91,92=>8756,93=>93,94=>8869,95=>95,96=>63717,97=>array(945,2),99=>967,100=>array(948,2),102=>966,103=>947,104=>951,105=>953,106=>981,107=>array(954,4),111=>array(959,2),113=>952,114=>961,115=>array(963,3),118=>982,119=>969,120=>958,121=>968,122=>950,123=>array(123,3),126=>8764,160=>8364,161=>978,162=>8242,163=>8804,164=>8725,165=>8734,166=>402,167=>9827,168=>9830,169=>9829,170=>9824,171=>8596,172=>array(8592,4),176=>array(176,2),178=>8243,179=>8805,180=>215,181=>8733,182=>8706,183=>8226,184=>247,185=>array(8800,2),187=>8776,188=>8230,189=>array(63718,2),191=>8629,192=>8501,193=>8465,194=>8476,195=>8472,196=>8855,197=>8853,198=>8709,199=>array(8745,2),201=>8835,202=>8839,203=>8836,204=>8834,205=>8838,206=>array(8712,2),208=>8736,209=>8711,210=>63194,211=>63193,212=>63195,213=>8719,214=>8730,215=>8901,216=>172,217=>array(8743,2),219=>8660,220=>array(8656,4),224=>9674,225=>9001,226=>array(63720,3),229=>8721,230=>array(63723,10),241=>9002,242=>8747,243=>8992,244=>63733,245=>8993,246=>array(63734,9)); -?> diff --git a/fpdf/font/times.php b/fpdf/font/times.php deleted file mode 100644 index f78850f967c3c62f273673854153110cd23517bb..0000000000000000000000000000000000000000 --- a/fpdf/font/times.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Times-Roman'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>250,chr(1)=>250,chr(2)=>250,chr(3)=>250,chr(4)=>250,chr(5)=>250,chr(6)=>250,chr(7)=>250,chr(8)=>250,chr(9)=>250,chr(10)=>250,chr(11)=>250,chr(12)=>250,chr(13)=>250,chr(14)=>250,chr(15)=>250,chr(16)=>250,chr(17)=>250,chr(18)=>250,chr(19)=>250,chr(20)=>250,chr(21)=>250, - chr(22)=>250,chr(23)=>250,chr(24)=>250,chr(25)=>250,chr(26)=>250,chr(27)=>250,chr(28)=>250,chr(29)=>250,chr(30)=>250,chr(31)=>250,' '=>250,'!'=>333,'"'=>408,'#'=>500,'$'=>500,'%'=>833,'&'=>778,'\''=>180,'('=>333,')'=>333,'*'=>500,'+'=>564, - ','=>250,'-'=>333,'.'=>250,'/'=>278,'0'=>500,'1'=>500,'2'=>500,'3'=>500,'4'=>500,'5'=>500,'6'=>500,'7'=>500,'8'=>500,'9'=>500,':'=>278,';'=>278,'<'=>564,'='=>564,'>'=>564,'?'=>444,'@'=>921,'A'=>722, - 'B'=>667,'C'=>667,'D'=>722,'E'=>611,'F'=>556,'G'=>722,'H'=>722,'I'=>333,'J'=>389,'K'=>722,'L'=>611,'M'=>889,'N'=>722,'O'=>722,'P'=>556,'Q'=>722,'R'=>667,'S'=>556,'T'=>611,'U'=>722,'V'=>722,'W'=>944, - 'X'=>722,'Y'=>722,'Z'=>611,'['=>333,'\\'=>278,']'=>333,'^'=>469,'_'=>500,'`'=>333,'a'=>444,'b'=>500,'c'=>444,'d'=>500,'e'=>444,'f'=>333,'g'=>500,'h'=>500,'i'=>278,'j'=>278,'k'=>500,'l'=>278,'m'=>778, - 'n'=>500,'o'=>500,'p'=>500,'q'=>500,'r'=>333,'s'=>389,'t'=>278,'u'=>500,'v'=>500,'w'=>722,'x'=>500,'y'=>500,'z'=>444,'{'=>480,'|'=>200,'}'=>480,'~'=>541,chr(127)=>350,chr(128)=>500,chr(129)=>350,chr(130)=>333,chr(131)=>500, - chr(132)=>444,chr(133)=>1000,chr(134)=>500,chr(135)=>500,chr(136)=>333,chr(137)=>1000,chr(138)=>556,chr(139)=>333,chr(140)=>889,chr(141)=>350,chr(142)=>611,chr(143)=>350,chr(144)=>350,chr(145)=>333,chr(146)=>333,chr(147)=>444,chr(148)=>444,chr(149)=>350,chr(150)=>500,chr(151)=>1000,chr(152)=>333,chr(153)=>980, - 
chr(154)=>389,chr(155)=>333,chr(156)=>722,chr(157)=>350,chr(158)=>444,chr(159)=>722,chr(160)=>250,chr(161)=>333,chr(162)=>500,chr(163)=>500,chr(164)=>500,chr(165)=>500,chr(166)=>200,chr(167)=>500,chr(168)=>333,chr(169)=>760,chr(170)=>276,chr(171)=>500,chr(172)=>564,chr(173)=>333,chr(174)=>760,chr(175)=>333, - chr(176)=>400,chr(177)=>564,chr(178)=>300,chr(179)=>300,chr(180)=>333,chr(181)=>500,chr(182)=>453,chr(183)=>250,chr(184)=>333,chr(185)=>300,chr(186)=>310,chr(187)=>500,chr(188)=>750,chr(189)=>750,chr(190)=>750,chr(191)=>444,chr(192)=>722,chr(193)=>722,chr(194)=>722,chr(195)=>722,chr(196)=>722,chr(197)=>722, - chr(198)=>889,chr(199)=>667,chr(200)=>611,chr(201)=>611,chr(202)=>611,chr(203)=>611,chr(204)=>333,chr(205)=>333,chr(206)=>333,chr(207)=>333,chr(208)=>722,chr(209)=>722,chr(210)=>722,chr(211)=>722,chr(212)=>722,chr(213)=>722,chr(214)=>722,chr(215)=>564,chr(216)=>722,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>722,chr(222)=>556,chr(223)=>500,chr(224)=>444,chr(225)=>444,chr(226)=>444,chr(227)=>444,chr(228)=>444,chr(229)=>444,chr(230)=>667,chr(231)=>444,chr(232)=>444,chr(233)=>444,chr(234)=>444,chr(235)=>444,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>500,chr(241)=>500, - chr(242)=>500,chr(243)=>500,chr(244)=>500,chr(245)=>500,chr(246)=>500,chr(247)=>564,chr(248)=>500,chr(249)=>500,chr(250)=>500,chr(251)=>500,chr(252)=>500,chr(253)=>500,chr(254)=>500,chr(255)=>500); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/timesb.php b/fpdf/font/timesb.php deleted file mode 100644 index 05167502f7a5cf9b01944cee885aba2e33c071d9..0000000000000000000000000000000000000000 --- a/fpdf/font/timesb.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Times-Bold'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>250,chr(1)=>250,chr(2)=>250,chr(3)=>250,chr(4)=>250,chr(5)=>250,chr(6)=>250,chr(7)=>250,chr(8)=>250,chr(9)=>250,chr(10)=>250,chr(11)=>250,chr(12)=>250,chr(13)=>250,chr(14)=>250,chr(15)=>250,chr(16)=>250,chr(17)=>250,chr(18)=>250,chr(19)=>250,chr(20)=>250,chr(21)=>250, - chr(22)=>250,chr(23)=>250,chr(24)=>250,chr(25)=>250,chr(26)=>250,chr(27)=>250,chr(28)=>250,chr(29)=>250,chr(30)=>250,chr(31)=>250,' '=>250,'!'=>333,'"'=>555,'#'=>500,'$'=>500,'%'=>1000,'&'=>833,'\''=>278,'('=>333,')'=>333,'*'=>500,'+'=>570, - ','=>250,'-'=>333,'.'=>250,'/'=>278,'0'=>500,'1'=>500,'2'=>500,'3'=>500,'4'=>500,'5'=>500,'6'=>500,'7'=>500,'8'=>500,'9'=>500,':'=>333,';'=>333,'<'=>570,'='=>570,'>'=>570,'?'=>500,'@'=>930,'A'=>722, - 'B'=>667,'C'=>722,'D'=>722,'E'=>667,'F'=>611,'G'=>778,'H'=>778,'I'=>389,'J'=>500,'K'=>778,'L'=>667,'M'=>944,'N'=>722,'O'=>778,'P'=>611,'Q'=>778,'R'=>722,'S'=>556,'T'=>667,'U'=>722,'V'=>722,'W'=>1000, - 'X'=>722,'Y'=>722,'Z'=>667,'['=>333,'\\'=>278,']'=>333,'^'=>581,'_'=>500,'`'=>333,'a'=>500,'b'=>556,'c'=>444,'d'=>556,'e'=>444,'f'=>333,'g'=>500,'h'=>556,'i'=>278,'j'=>333,'k'=>556,'l'=>278,'m'=>833, - 'n'=>556,'o'=>500,'p'=>556,'q'=>556,'r'=>444,'s'=>389,'t'=>333,'u'=>556,'v'=>500,'w'=>722,'x'=>500,'y'=>500,'z'=>444,'{'=>394,'|'=>220,'}'=>394,'~'=>520,chr(127)=>350,chr(128)=>500,chr(129)=>350,chr(130)=>333,chr(131)=>500, - 
chr(132)=>500,chr(133)=>1000,chr(134)=>500,chr(135)=>500,chr(136)=>333,chr(137)=>1000,chr(138)=>556,chr(139)=>333,chr(140)=>1000,chr(141)=>350,chr(142)=>667,chr(143)=>350,chr(144)=>350,chr(145)=>333,chr(146)=>333,chr(147)=>500,chr(148)=>500,chr(149)=>350,chr(150)=>500,chr(151)=>1000,chr(152)=>333,chr(153)=>1000, - chr(154)=>389,chr(155)=>333,chr(156)=>722,chr(157)=>350,chr(158)=>444,chr(159)=>722,chr(160)=>250,chr(161)=>333,chr(162)=>500,chr(163)=>500,chr(164)=>500,chr(165)=>500,chr(166)=>220,chr(167)=>500,chr(168)=>333,chr(169)=>747,chr(170)=>300,chr(171)=>500,chr(172)=>570,chr(173)=>333,chr(174)=>747,chr(175)=>333, - chr(176)=>400,chr(177)=>570,chr(178)=>300,chr(179)=>300,chr(180)=>333,chr(181)=>556,chr(182)=>540,chr(183)=>250,chr(184)=>333,chr(185)=>300,chr(186)=>330,chr(187)=>500,chr(188)=>750,chr(189)=>750,chr(190)=>750,chr(191)=>500,chr(192)=>722,chr(193)=>722,chr(194)=>722,chr(195)=>722,chr(196)=>722,chr(197)=>722, - chr(198)=>1000,chr(199)=>722,chr(200)=>667,chr(201)=>667,chr(202)=>667,chr(203)=>667,chr(204)=>389,chr(205)=>389,chr(206)=>389,chr(207)=>389,chr(208)=>722,chr(209)=>722,chr(210)=>778,chr(211)=>778,chr(212)=>778,chr(213)=>778,chr(214)=>778,chr(215)=>570,chr(216)=>778,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>722,chr(222)=>611,chr(223)=>556,chr(224)=>500,chr(225)=>500,chr(226)=>500,chr(227)=>500,chr(228)=>500,chr(229)=>500,chr(230)=>722,chr(231)=>444,chr(232)=>444,chr(233)=>444,chr(234)=>444,chr(235)=>444,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>500,chr(241)=>556, - chr(242)=>500,chr(243)=>500,chr(244)=>500,chr(245)=>500,chr(246)=>500,chr(247)=>570,chr(248)=>500,chr(249)=>556,chr(250)=>556,chr(251)=>556,chr(252)=>556,chr(253)=>500,chr(254)=>556,chr(255)=>500); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/timesbi.php b/fpdf/font/timesbi.php deleted file mode 100644 index 32fe25e612116cd9c3e6699622f296fd5a5957d4..0000000000000000000000000000000000000000 --- a/fpdf/font/timesbi.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Times-BoldItalic'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>250,chr(1)=>250,chr(2)=>250,chr(3)=>250,chr(4)=>250,chr(5)=>250,chr(6)=>250,chr(7)=>250,chr(8)=>250,chr(9)=>250,chr(10)=>250,chr(11)=>250,chr(12)=>250,chr(13)=>250,chr(14)=>250,chr(15)=>250,chr(16)=>250,chr(17)=>250,chr(18)=>250,chr(19)=>250,chr(20)=>250,chr(21)=>250, - chr(22)=>250,chr(23)=>250,chr(24)=>250,chr(25)=>250,chr(26)=>250,chr(27)=>250,chr(28)=>250,chr(29)=>250,chr(30)=>250,chr(31)=>250,' '=>250,'!'=>389,'"'=>555,'#'=>500,'$'=>500,'%'=>833,'&'=>778,'\''=>278,'('=>333,')'=>333,'*'=>500,'+'=>570, - ','=>250,'-'=>333,'.'=>250,'/'=>278,'0'=>500,'1'=>500,'2'=>500,'3'=>500,'4'=>500,'5'=>500,'6'=>500,'7'=>500,'8'=>500,'9'=>500,':'=>333,';'=>333,'<'=>570,'='=>570,'>'=>570,'?'=>500,'@'=>832,'A'=>667, - 'B'=>667,'C'=>667,'D'=>722,'E'=>667,'F'=>667,'G'=>722,'H'=>778,'I'=>389,'J'=>500,'K'=>667,'L'=>611,'M'=>889,'N'=>722,'O'=>722,'P'=>611,'Q'=>722,'R'=>667,'S'=>556,'T'=>611,'U'=>722,'V'=>667,'W'=>889, - 'X'=>667,'Y'=>611,'Z'=>611,'['=>333,'\\'=>278,']'=>333,'^'=>570,'_'=>500,'`'=>333,'a'=>500,'b'=>500,'c'=>444,'d'=>500,'e'=>444,'f'=>333,'g'=>500,'h'=>556,'i'=>278,'j'=>278,'k'=>500,'l'=>278,'m'=>778, - 
'n'=>556,'o'=>500,'p'=>500,'q'=>500,'r'=>389,'s'=>389,'t'=>278,'u'=>556,'v'=>444,'w'=>667,'x'=>500,'y'=>444,'z'=>389,'{'=>348,'|'=>220,'}'=>348,'~'=>570,chr(127)=>350,chr(128)=>500,chr(129)=>350,chr(130)=>333,chr(131)=>500, - chr(132)=>500,chr(133)=>1000,chr(134)=>500,chr(135)=>500,chr(136)=>333,chr(137)=>1000,chr(138)=>556,chr(139)=>333,chr(140)=>944,chr(141)=>350,chr(142)=>611,chr(143)=>350,chr(144)=>350,chr(145)=>333,chr(146)=>333,chr(147)=>500,chr(148)=>500,chr(149)=>350,chr(150)=>500,chr(151)=>1000,chr(152)=>333,chr(153)=>1000, - chr(154)=>389,chr(155)=>333,chr(156)=>722,chr(157)=>350,chr(158)=>389,chr(159)=>611,chr(160)=>250,chr(161)=>389,chr(162)=>500,chr(163)=>500,chr(164)=>500,chr(165)=>500,chr(166)=>220,chr(167)=>500,chr(168)=>333,chr(169)=>747,chr(170)=>266,chr(171)=>500,chr(172)=>606,chr(173)=>333,chr(174)=>747,chr(175)=>333, - chr(176)=>400,chr(177)=>570,chr(178)=>300,chr(179)=>300,chr(180)=>333,chr(181)=>576,chr(182)=>500,chr(183)=>250,chr(184)=>333,chr(185)=>300,chr(186)=>300,chr(187)=>500,chr(188)=>750,chr(189)=>750,chr(190)=>750,chr(191)=>500,chr(192)=>667,chr(193)=>667,chr(194)=>667,chr(195)=>667,chr(196)=>667,chr(197)=>667, - chr(198)=>944,chr(199)=>667,chr(200)=>667,chr(201)=>667,chr(202)=>667,chr(203)=>667,chr(204)=>389,chr(205)=>389,chr(206)=>389,chr(207)=>389,chr(208)=>722,chr(209)=>722,chr(210)=>722,chr(211)=>722,chr(212)=>722,chr(213)=>722,chr(214)=>722,chr(215)=>570,chr(216)=>722,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>611,chr(222)=>611,chr(223)=>500,chr(224)=>500,chr(225)=>500,chr(226)=>500,chr(227)=>500,chr(228)=>500,chr(229)=>500,chr(230)=>722,chr(231)=>444,chr(232)=>444,chr(233)=>444,chr(234)=>444,chr(235)=>444,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>500,chr(241)=>556, - chr(242)=>500,chr(243)=>500,chr(244)=>500,chr(245)=>500,chr(246)=>500,chr(247)=>570,chr(248)=>500,chr(249)=>556,chr(250)=>556,chr(251)=>556,chr(252)=>556,chr(253)=>444,chr(254)=>500,chr(255)=>444); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/timesi.php b/fpdf/font/timesi.php deleted file mode 100644 index b0e5a6200330591e3756728d1bec1495be933df3..0000000000000000000000000000000000000000 --- a/fpdf/font/timesi.php +++ /dev/null @@ -1,21 +0,0 @@ -<?php -$type = 'Core'; -$name = 'Times-Italic'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>250,chr(1)=>250,chr(2)=>250,chr(3)=>250,chr(4)=>250,chr(5)=>250,chr(6)=>250,chr(7)=>250,chr(8)=>250,chr(9)=>250,chr(10)=>250,chr(11)=>250,chr(12)=>250,chr(13)=>250,chr(14)=>250,chr(15)=>250,chr(16)=>250,chr(17)=>250,chr(18)=>250,chr(19)=>250,chr(20)=>250,chr(21)=>250, - chr(22)=>250,chr(23)=>250,chr(24)=>250,chr(25)=>250,chr(26)=>250,chr(27)=>250,chr(28)=>250,chr(29)=>250,chr(30)=>250,chr(31)=>250,' '=>250,'!'=>333,'"'=>420,'#'=>500,'$'=>500,'%'=>833,'&'=>778,'\''=>214,'('=>333,')'=>333,'*'=>500,'+'=>675, - ','=>250,'-'=>333,'.'=>250,'/'=>278,'0'=>500,'1'=>500,'2'=>500,'3'=>500,'4'=>500,'5'=>500,'6'=>500,'7'=>500,'8'=>500,'9'=>500,':'=>333,';'=>333,'<'=>675,'='=>675,'>'=>675,'?'=>500,'@'=>920,'A'=>611, - 'B'=>611,'C'=>667,'D'=>722,'E'=>611,'F'=>611,'G'=>722,'H'=>722,'I'=>333,'J'=>444,'K'=>667,'L'=>556,'M'=>833,'N'=>667,'O'=>722,'P'=>611,'Q'=>722,'R'=>611,'S'=>500,'T'=>556,'U'=>722,'V'=>611,'W'=>833, - 
'X'=>611,'Y'=>556,'Z'=>556,'['=>389,'\\'=>278,']'=>389,'^'=>422,'_'=>500,'`'=>333,'a'=>500,'b'=>500,'c'=>444,'d'=>500,'e'=>444,'f'=>278,'g'=>500,'h'=>500,'i'=>278,'j'=>278,'k'=>444,'l'=>278,'m'=>722, - 'n'=>500,'o'=>500,'p'=>500,'q'=>500,'r'=>389,'s'=>389,'t'=>278,'u'=>500,'v'=>444,'w'=>667,'x'=>444,'y'=>444,'z'=>389,'{'=>400,'|'=>275,'}'=>400,'~'=>541,chr(127)=>350,chr(128)=>500,chr(129)=>350,chr(130)=>333,chr(131)=>500, - chr(132)=>556,chr(133)=>889,chr(134)=>500,chr(135)=>500,chr(136)=>333,chr(137)=>1000,chr(138)=>500,chr(139)=>333,chr(140)=>944,chr(141)=>350,chr(142)=>556,chr(143)=>350,chr(144)=>350,chr(145)=>333,chr(146)=>333,chr(147)=>556,chr(148)=>556,chr(149)=>350,chr(150)=>500,chr(151)=>889,chr(152)=>333,chr(153)=>980, - chr(154)=>389,chr(155)=>333,chr(156)=>667,chr(157)=>350,chr(158)=>389,chr(159)=>556,chr(160)=>250,chr(161)=>389,chr(162)=>500,chr(163)=>500,chr(164)=>500,chr(165)=>500,chr(166)=>275,chr(167)=>500,chr(168)=>333,chr(169)=>760,chr(170)=>276,chr(171)=>500,chr(172)=>675,chr(173)=>333,chr(174)=>760,chr(175)=>333, - chr(176)=>400,chr(177)=>675,chr(178)=>300,chr(179)=>300,chr(180)=>333,chr(181)=>500,chr(182)=>523,chr(183)=>250,chr(184)=>333,chr(185)=>300,chr(186)=>310,chr(187)=>500,chr(188)=>750,chr(189)=>750,chr(190)=>750,chr(191)=>500,chr(192)=>611,chr(193)=>611,chr(194)=>611,chr(195)=>611,chr(196)=>611,chr(197)=>611, - chr(198)=>889,chr(199)=>667,chr(200)=>611,chr(201)=>611,chr(202)=>611,chr(203)=>611,chr(204)=>333,chr(205)=>333,chr(206)=>333,chr(207)=>333,chr(208)=>722,chr(209)=>667,chr(210)=>722,chr(211)=>722,chr(212)=>722,chr(213)=>722,chr(214)=>722,chr(215)=>675,chr(216)=>722,chr(217)=>722,chr(218)=>722,chr(219)=>722, - chr(220)=>722,chr(221)=>556,chr(222)=>611,chr(223)=>500,chr(224)=>500,chr(225)=>500,chr(226)=>500,chr(227)=>500,chr(228)=>500,chr(229)=>500,chr(230)=>667,chr(231)=>444,chr(232)=>444,chr(233)=>444,chr(234)=>444,chr(235)=>444,chr(236)=>278,chr(237)=>278,chr(238)=>278,chr(239)=>278,chr(240)=>500,chr(241)=>500, - chr(242)=>500,chr(243)=>500,chr(244)=>500,chr(245)=>500,chr(246)=>500,chr(247)=>675,chr(248)=>500,chr(249)=>500,chr(250)=>500,chr(251)=>500,chr(252)=>500,chr(253)=>444,chr(254)=>500,chr(255)=>444); -$enc = 'cp1252'; -$uv = array(0=>array(0,128),128=>8364,130=>8218,131=>402,132=>8222,133=>8230,134=>array(8224,2),136=>710,137=>8240,138=>352,139=>8249,140=>338,142=>381,145=>array(8216,2),147=>array(8220,2),149=>8226,150=>array(8211,2),152=>732,153=>8482,154=>353,155=>8250,156=>339,158=>382,159=>376,160=>array(160,96)); -?> diff --git a/fpdf/font/zapfdingbats.php b/fpdf/font/zapfdingbats.php deleted file mode 100644 index b9d03090854ce88afb742c217f3bb66bba7df419..0000000000000000000000000000000000000000 --- a/fpdf/font/zapfdingbats.php +++ /dev/null @@ -1,20 +0,0 @@ -<?php -$type = 'Core'; -$name = 'ZapfDingbats'; -$up = -100; -$ut = 50; -$cw = array( - chr(0)=>0,chr(1)=>0,chr(2)=>0,chr(3)=>0,chr(4)=>0,chr(5)=>0,chr(6)=>0,chr(7)=>0,chr(8)=>0,chr(9)=>0,chr(10)=>0,chr(11)=>0,chr(12)=>0,chr(13)=>0,chr(14)=>0,chr(15)=>0,chr(16)=>0,chr(17)=>0,chr(18)=>0,chr(19)=>0,chr(20)=>0,chr(21)=>0, - chr(22)=>0,chr(23)=>0,chr(24)=>0,chr(25)=>0,chr(26)=>0,chr(27)=>0,chr(28)=>0,chr(29)=>0,chr(30)=>0,chr(31)=>0,' '=>278,'!'=>974,'"'=>961,'#'=>974,'$'=>980,'%'=>719,'&'=>789,'\''=>790,'('=>791,')'=>690,'*'=>960,'+'=>939, - ','=>549,'-'=>855,'.'=>911,'/'=>933,'0'=>911,'1'=>945,'2'=>974,'3'=>755,'4'=>846,'5'=>762,'6'=>761,'7'=>571,'8'=>677,'9'=>763,':'=>760,';'=>759,'<'=>754,'='=>494,'>'=>552,'?'=>537,'@'=>577,'A'=>692, - 
'B'=>786,'C'=>788,'D'=>788,'E'=>790,'F'=>793,'G'=>794,'H'=>816,'I'=>823,'J'=>789,'K'=>841,'L'=>823,'M'=>833,'N'=>816,'O'=>831,'P'=>923,'Q'=>744,'R'=>723,'S'=>749,'T'=>790,'U'=>792,'V'=>695,'W'=>776, - 'X'=>768,'Y'=>792,'Z'=>759,'['=>707,'\\'=>708,']'=>682,'^'=>701,'_'=>826,'`'=>815,'a'=>789,'b'=>789,'c'=>707,'d'=>687,'e'=>696,'f'=>689,'g'=>786,'h'=>787,'i'=>713,'j'=>791,'k'=>785,'l'=>791,'m'=>873, - 'n'=>761,'o'=>762,'p'=>762,'q'=>759,'r'=>759,'s'=>892,'t'=>892,'u'=>788,'v'=>784,'w'=>438,'x'=>138,'y'=>277,'z'=>415,'{'=>392,'|'=>392,'}'=>668,'~'=>668,chr(127)=>0,chr(128)=>390,chr(129)=>390,chr(130)=>317,chr(131)=>317, - chr(132)=>276,chr(133)=>276,chr(134)=>509,chr(135)=>509,chr(136)=>410,chr(137)=>410,chr(138)=>234,chr(139)=>234,chr(140)=>334,chr(141)=>334,chr(142)=>0,chr(143)=>0,chr(144)=>0,chr(145)=>0,chr(146)=>0,chr(147)=>0,chr(148)=>0,chr(149)=>0,chr(150)=>0,chr(151)=>0,chr(152)=>0,chr(153)=>0, - chr(154)=>0,chr(155)=>0,chr(156)=>0,chr(157)=>0,chr(158)=>0,chr(159)=>0,chr(160)=>0,chr(161)=>732,chr(162)=>544,chr(163)=>544,chr(164)=>910,chr(165)=>667,chr(166)=>760,chr(167)=>760,chr(168)=>776,chr(169)=>595,chr(170)=>694,chr(171)=>626,chr(172)=>788,chr(173)=>788,chr(174)=>788,chr(175)=>788, - chr(176)=>788,chr(177)=>788,chr(178)=>788,chr(179)=>788,chr(180)=>788,chr(181)=>788,chr(182)=>788,chr(183)=>788,chr(184)=>788,chr(185)=>788,chr(186)=>788,chr(187)=>788,chr(188)=>788,chr(189)=>788,chr(190)=>788,chr(191)=>788,chr(192)=>788,chr(193)=>788,chr(194)=>788,chr(195)=>788,chr(196)=>788,chr(197)=>788, - chr(198)=>788,chr(199)=>788,chr(200)=>788,chr(201)=>788,chr(202)=>788,chr(203)=>788,chr(204)=>788,chr(205)=>788,chr(206)=>788,chr(207)=>788,chr(208)=>788,chr(209)=>788,chr(210)=>788,chr(211)=>788,chr(212)=>894,chr(213)=>838,chr(214)=>1016,chr(215)=>458,chr(216)=>748,chr(217)=>924,chr(218)=>748,chr(219)=>918, - chr(220)=>927,chr(221)=>928,chr(222)=>928,chr(223)=>834,chr(224)=>873,chr(225)=>828,chr(226)=>924,chr(227)=>924,chr(228)=>917,chr(229)=>930,chr(230)=>931,chr(231)=>463,chr(232)=>883,chr(233)=>836,chr(234)=>836,chr(235)=>867,chr(236)=>867,chr(237)=>696,chr(238)=>696,chr(239)=>874,chr(240)=>0,chr(241)=>874, - chr(242)=>760,chr(243)=>946,chr(244)=>771,chr(245)=>865,chr(246)=>771,chr(247)=>888,chr(248)=>967,chr(249)=>888,chr(250)=>831,chr(251)=>873,chr(252)=>927,chr(253)=>970,chr(254)=>918,chr(255)=>0); -$uv = array(32=>32,33=>array(9985,4),37=>9742,38=>array(9990,4),42=>9755,43=>9758,44=>array(9996,28),72=>9733,73=>array(10025,35),108=>9679,109=>10061,110=>9632,111=>array(10063,4),115=>9650,116=>9660,117=>9670,118=>10070,119=>9687,120=>array(10072,7),128=>array(10088,14),161=>array(10081,7),168=>9827,169=>9830,170=>9829,171=>9824,172=>array(9312,10),182=>array(10102,31),213=>8594,214=>array(8596,2),216=>array(10136,24),241=>array(10161,14)); -?> diff --git a/fpdf/fpdf.css b/fpdf/fpdf.css deleted file mode 100644 index dd2c540029a3010d9c857f32ea9aba47606f118a..0000000000000000000000000000000000000000 --- a/fpdf/fpdf.css +++ /dev/null @@ -1,21 +0,0 @@ -body {font-family:"Times New Roman",serif} -h1 {font:bold 135% Arial,sans-serif; color:#4000A0; margin-bottom:0.9em} -h2 {font:bold 95% Arial,sans-serif; color:#900000; margin-top:1.5em; margin-bottom:1em} -dl.param dt {text-decoration:underline} -dl.param dd {margin-top:1em; margin-bottom:1em} -dl.param ul {margin-top:1em; margin-bottom:1em} -tt, code, kbd {font-family:"Courier New",Courier,monospace; font-size:82%} -div.source {margin-top:1.4em; margin-bottom:1.3em} -div.source pre {display:table; border:1px solid #24246A; width:100%; margin:0em; 
font-family:inherit; font-size:100%} -div.source code {display:block; border:1px solid #C5C5EC; background-color:#F0F5FF; padding:6px; color:#000000} -div.doc-source {margin-top:1.4em; margin-bottom:1.3em} -div.doc-source pre {display:table; width:100%; margin:0em; font-family:inherit; font-size:100%} -div.doc-source code {display:block; background-color:#E0E0E0; padding:4px} -.kw {color:#000080; font-weight:bold} -.str {color:#CC0000} -.cmt {color:#008000} -p.demo {text-align:center; margin-top:-0.9em} -a.demo {text-decoration:none; font-weight:bold; color:#0000CC} -a.demo:link {text-decoration:none; font-weight:bold; color:#0000CC} -a.demo:hover {text-decoration:none; font-weight:bold; color:#0000FF} -a.demo:active {text-decoration:none; font-weight:bold; color:#0000FF} diff --git a/fpdf/fpdf.php b/fpdf/fpdf.php deleted file mode 100644 index ebee95810b33a117ef36e35e9da261c74f619b1e..0000000000000000000000000000000000000000 --- a/fpdf/fpdf.php +++ /dev/null @@ -1,1934 +0,0 @@ -<?php -/******************************************************************************* -* FPDF * -* * -* Version: 1.86 * -* Date: 2023-06-25 * -* Author: Olivier PLATHEY * -*******************************************************************************/ - -class FPDF -{ -const VERSION = '1.86'; -protected $page; // current page number -protected $n; // current object number -protected $offsets; // array of object offsets -protected $buffer; // buffer holding in-memory PDF -protected $pages; // array containing pages -protected $state; // current document state -protected $compress; // compression flag -protected $iconv; // whether iconv is available -protected $k; // scale factor (number of points in user unit) -protected $DefOrientation; // default orientation -protected $CurOrientation; // current orientation -protected $StdPageSizes; // standard page sizes -protected $DefPageSize; // default page size -protected $CurPageSize; // current page size -protected $CurRotation; // current page rotation -protected $PageInfo; // page-related data -protected $wPt, $hPt; // dimensions of current page in points -protected $w, $h; // dimensions of current page in user unit -protected $lMargin; // left margin -protected $tMargin; // top margin -protected $rMargin; // right margin -protected $bMargin; // page break margin -protected $cMargin; // cell margin -protected $x, $y; // current position in user unit -protected $lasth; // height of last printed cell -protected $LineWidth; // line width in user unit -protected $fontpath; // directory containing fonts -protected $CoreFonts; // array of core font names -protected $fonts; // array of used fonts -protected $FontFiles; // array of font files -protected $encodings; // array of encodings -protected $cmaps; // array of ToUnicode CMaps -protected $FontFamily; // current font family -protected $FontStyle; // current font style -protected $underline; // underlining flag -protected $CurrentFont; // current font info -protected $FontSizePt; // current font size in points -protected $FontSize; // current font size in user unit -protected $DrawColor; // commands for drawing color -protected $FillColor; // commands for filling color -protected $TextColor; // commands for text color -protected $ColorFlag; // indicates whether fill and text colors are different -protected $WithAlpha; // indicates whether alpha channel is used -protected $ws; // word spacing -protected $images; // array of used images -protected $PageLinks; // array of links in pages -protected $links; // array of 
internal links -protected $AutoPageBreak; // automatic page breaking -protected $PageBreakTrigger; // threshold used to trigger page breaks -protected $InHeader; // flag set when processing header -protected $InFooter; // flag set when processing footer -protected $AliasNbPages; // alias for total number of pages -protected $ZoomMode; // zoom display mode -protected $LayoutMode; // layout display mode -protected $metadata; // document properties -protected $CreationDate; // document creation date -protected $PDFVersion; // PDF version number - -/******************************************************************************* -* Public methods * -*******************************************************************************/ - -function __construct($orientation='P', $unit='mm', $size='A4') -{ - // Initialization of properties - $this->state = 0; - $this->page = 0; - $this->n = 2; - $this->buffer = ''; - $this->pages = array(); - $this->PageInfo = array(); - $this->fonts = array(); - $this->FontFiles = array(); - $this->encodings = array(); - $this->cmaps = array(); - $this->images = array(); - $this->links = array(); - $this->InHeader = false; - $this->InFooter = false; - $this->lasth = 0; - $this->FontFamily = ''; - $this->FontStyle = ''; - $this->FontSizePt = 12; - $this->underline = false; - $this->DrawColor = '0 G'; - $this->FillColor = '0 g'; - $this->TextColor = '0 g'; - $this->ColorFlag = false; - $this->WithAlpha = false; - $this->ws = 0; - $this->iconv = function_exists('iconv'); - // Font path - if(defined('FPDF_FONTPATH')) - $this->fontpath = FPDF_FONTPATH; - else - $this->fontpath = dirname(__FILE__).'/font/'; - // Core fonts - $this->CoreFonts = array('courier', 'helvetica', 'times', 'symbol', 'zapfdingbats'); - // Scale factor - if($unit=='pt') - $this->k = 1; - elseif($unit=='mm') - $this->k = 72/25.4; - elseif($unit=='cm') - $this->k = 72/2.54; - elseif($unit=='in') - $this->k = 72; - else - $this->Error('Incorrect unit: '.$unit); - // Page sizes - $this->StdPageSizes = array('a3'=>array(841.89,1190.55), 'a4'=>array(595.28,841.89), 'a5'=>array(420.94,595.28), - 'letter'=>array(612,792), 'legal'=>array(612,1008)); - $size = $this->_getpagesize($size); - $this->DefPageSize = $size; - $this->CurPageSize = $size; - // Page orientation - $orientation = strtolower($orientation); - if($orientation=='p' || $orientation=='portrait') - { - $this->DefOrientation = 'P'; - $this->w = $size[0]; - $this->h = $size[1]; - } - elseif($orientation=='l' || $orientation=='landscape') - { - $this->DefOrientation = 'L'; - $this->w = $size[1]; - $this->h = $size[0]; - } - else - $this->Error('Incorrect orientation: '.$orientation); - $this->CurOrientation = $this->DefOrientation; - $this->wPt = $this->w*$this->k; - $this->hPt = $this->h*$this->k; - // Page rotation - $this->CurRotation = 0; - // Page margins (1 cm) - $margin = 28.35/$this->k; - $this->SetMargins($margin,$margin); - // Interior cell margin (1 mm) - $this->cMargin = $margin/10; - // Line width (0.2 mm) - $this->LineWidth = .567/$this->k; - // Automatic page break - $this->SetAutoPageBreak(true,2*$margin); - // Default display mode - $this->SetDisplayMode('default'); - // Enable compression - $this->SetCompression(true); - // Metadata - $this->metadata = array('Producer'=>'FPDF '.self::VERSION); - // Set default PDF version number - $this->PDFVersion = '1.3'; -} - -function SetMargins($left, $top, $right=null) -{ - // Set left, top and right margins - $this->lMargin = $left; - $this->tMargin = $top; - if($right===null) - $right = 
$left; - $this->rMargin = $right; -} - -function SetLeftMargin($margin) -{ - // Set left margin - $this->lMargin = $margin; - if($this->page>0 && $this->x<$margin) - $this->x = $margin; -} - -function SetTopMargin($margin) -{ - // Set top margin - $this->tMargin = $margin; -} - -function SetRightMargin($margin) -{ - // Set right margin - $this->rMargin = $margin; -} - -function SetAutoPageBreak($auto, $margin=0) -{ - // Set auto page break mode and triggering margin - $this->AutoPageBreak = $auto; - $this->bMargin = $margin; - $this->PageBreakTrigger = $this->h-$margin; -} - -function SetDisplayMode($zoom, $layout='default') -{ - // Set display mode in viewer - if($zoom=='fullpage' || $zoom=='fullwidth' || $zoom=='real' || $zoom=='default' || !is_string($zoom)) - $this->ZoomMode = $zoom; - else - $this->Error('Incorrect zoom display mode: '.$zoom); - if($layout=='single' || $layout=='continuous' || $layout=='two' || $layout=='default') - $this->LayoutMode = $layout; - else - $this->Error('Incorrect layout display mode: '.$layout); -} - -function SetCompression($compress) -{ - // Set page compression - if(function_exists('gzcompress')) - $this->compress = $compress; - else - $this->compress = false; -} - -function SetTitle($title, $isUTF8=false) -{ - // Title of document - $this->metadata['Title'] = $isUTF8 ? $title : $this->_UTF8encode($title); -} - -function SetAuthor($author, $isUTF8=false) -{ - // Author of document - $this->metadata['Author'] = $isUTF8 ? $author : $this->_UTF8encode($author); -} - -function SetSubject($subject, $isUTF8=false) -{ - // Subject of document - $this->metadata['Subject'] = $isUTF8 ? $subject : $this->_UTF8encode($subject); -} - -function SetKeywords($keywords, $isUTF8=false) -{ - // Keywords of document - $this->metadata['Keywords'] = $isUTF8 ? $keywords : $this->_UTF8encode($keywords); -} - -function SetCreator($creator, $isUTF8=false) -{ - // Creator of document - $this->metadata['Creator'] = $isUTF8 ? $creator : $this->_UTF8encode($creator); -} - -function AliasNbPages($alias='{nb}') -{ - // Define an alias for total number of pages - $this->AliasNbPages = $alias; -} - -function Error($msg) -{ - // Fatal error - throw new Exception('FPDF error: '.$msg); -} - -function Close() -{ - // Terminate document - if($this->state==3) - return; - if($this->page==0) - $this->AddPage(); - // Page footer - $this->InFooter = true; - $this->Footer(); - $this->InFooter = false; - // Close page - $this->_endpage(); - // Close document - $this->_enddoc(); -} - -function AddPage($orientation='', $size='', $rotation=0) -{ - // Start a new page - if($this->state==3) - $this->Error('The document is closed'); - $family = $this->FontFamily; - $style = $this->FontStyle.($this->underline ? 
'U' : ''); - $fontsize = $this->FontSizePt; - $lw = $this->LineWidth; - $dc = $this->DrawColor; - $fc = $this->FillColor; - $tc = $this->TextColor; - $cf = $this->ColorFlag; - if($this->page>0) - { - // Page footer - $this->InFooter = true; - $this->Footer(); - $this->InFooter = false; - // Close page - $this->_endpage(); - } - // Start new page - $this->_beginpage($orientation,$size,$rotation); - // Set line cap style to square - $this->_out('2 J'); - // Set line width - $this->LineWidth = $lw; - $this->_out(sprintf('%.2F w',$lw*$this->k)); - // Set font - if($family) - $this->SetFont($family,$style,$fontsize); - // Set colors - $this->DrawColor = $dc; - if($dc!='0 G') - $this->_out($dc); - $this->FillColor = $fc; - if($fc!='0 g') - $this->_out($fc); - $this->TextColor = $tc; - $this->ColorFlag = $cf; - // Page header - $this->InHeader = true; - $this->Header(); - $this->InHeader = false; - // Restore line width - if($this->LineWidth!=$lw) - { - $this->LineWidth = $lw; - $this->_out(sprintf('%.2F w',$lw*$this->k)); - } - // Restore font - if($family) - $this->SetFont($family,$style,$fontsize); - // Restore colors - if($this->DrawColor!=$dc) - { - $this->DrawColor = $dc; - $this->_out($dc); - } - if($this->FillColor!=$fc) - { - $this->FillColor = $fc; - $this->_out($fc); - } - $this->TextColor = $tc; - $this->ColorFlag = $cf; -} - -function Header() -{ - // To be implemented in your own inherited class -} - -function Footer() -{ - // To be implemented in your own inherited class -} - -function PageNo() -{ - // Get current page number - return $this->page; -} - -function SetDrawColor($r, $g=null, $b=null) -{ - // Set color for all stroking operations - if(($r==0 && $g==0 && $b==0) || $g===null) - $this->DrawColor = sprintf('%.3F G',$r/255); - else - $this->DrawColor = sprintf('%.3F %.3F %.3F RG',$r/255,$g/255,$b/255); - if($this->page>0) - $this->_out($this->DrawColor); -} - -function SetFillColor($r, $g=null, $b=null) -{ - // Set color for all filling operations - if(($r==0 && $g==0 && $b==0) || $g===null) - $this->FillColor = sprintf('%.3F g',$r/255); - else - $this->FillColor = sprintf('%.3F %.3F %.3F rg',$r/255,$g/255,$b/255); - $this->ColorFlag = ($this->FillColor!=$this->TextColor); - if($this->page>0) - $this->_out($this->FillColor); -} - -function SetTextColor($r, $g=null, $b=null) -{ - // Set color for text - if(($r==0 && $g==0 && $b==0) || $g===null) - $this->TextColor = sprintf('%.3F g',$r/255); - else - $this->TextColor = sprintf('%.3F %.3F %.3F rg',$r/255,$g/255,$b/255); - $this->ColorFlag = ($this->FillColor!=$this->TextColor); -} - -function GetStringWidth($s) -{ - // Get width of a string in the current font - $cw = $this->CurrentFont['cw']; - $w = 0; - $s = (string)$s; - $l = strlen($s); - for($i=0;$i<$l;$i++) - $w += $cw[$s[$i]]; - return $w*$this->FontSize/1000; -} - -function SetLineWidth($width) -{ - // Set line width - $this->LineWidth = $width; - if($this->page>0) - $this->_out(sprintf('%.2F w',$width*$this->k)); -} - -function Line($x1, $y1, $x2, $y2) -{ - // Draw a line - $this->_out(sprintf('%.2F %.2F m %.2F %.2F l S',$x1*$this->k,($this->h-$y1)*$this->k,$x2*$this->k,($this->h-$y2)*$this->k)); -} - -function Rect($x, $y, $w, $h, $style='') -{ - // Draw a rectangle - if($style=='F') - $op = 'f'; - elseif($style=='FD' || $style=='DF') - $op = 'B'; - else - $op = 'S'; - $this->_out(sprintf('%.2F %.2F %.2F %.2F re %s',$x*$this->k,($this->h-$y)*$this->k,$w*$this->k,-$h*$this->k,$op)); -} - -function AddFont($family, $style='', $file='', $dir='') -{ - // Add a TrueType, 
OpenType or Type1 font - $family = strtolower($family); - if($file=='') - $file = str_replace(' ','',$family).strtolower($style).'.php'; - $style = strtoupper($style); - if($style=='IB') - $style = 'BI'; - $fontkey = $family.$style; - if(isset($this->fonts[$fontkey])) - return; - if(strpos($file,'/')!==false || strpos($file,"\\")!==false) - $this->Error('Incorrect font definition file name: '.$file); - if($dir=='') - $dir = $this->fontpath; - if(substr($dir,-1)!='/' && substr($dir,-1)!='\\') - $dir .= '/'; - $info = $this->_loadfont($dir.$file); - $info['i'] = count($this->fonts)+1; - if(!empty($info['file'])) - { - // Embedded font - $info['file'] = $dir.$info['file']; - if($info['type']=='TrueType') - $this->FontFiles[$info['file']] = array('length1'=>$info['originalsize']); - else - $this->FontFiles[$info['file']] = array('length1'=>$info['size1'], 'length2'=>$info['size2']); - } - $this->fonts[$fontkey] = $info; -} - -function SetFont($family, $style='', $size=0) -{ - // Select a font; size given in points - if($family=='') - $family = $this->FontFamily; - else - $family = strtolower($family); - $style = strtoupper($style); - if(strpos($style,'U')!==false) - { - $this->underline = true; - $style = str_replace('U','',$style); - } - else - $this->underline = false; - if($style=='IB') - $style = 'BI'; - if($size==0) - $size = $this->FontSizePt; - // Test if font is already selected - if($this->FontFamily==$family && $this->FontStyle==$style && $this->FontSizePt==$size) - return; - // Test if font is already loaded - $fontkey = $family.$style; - if(!isset($this->fonts[$fontkey])) - { - // Test if one of the core fonts - if($family=='arial') - $family = 'helvetica'; - if(in_array($family,$this->CoreFonts)) - { - if($family=='symbol' || $family=='zapfdingbats') - $style = ''; - $fontkey = $family.$style; - if(!isset($this->fonts[$fontkey])) - $this->AddFont($family,$style); - } - else - $this->Error('Undefined font: '.$family.' '.$style); - } - // Select it - $this->FontFamily = $family; - $this->FontStyle = $style; - $this->FontSizePt = $size; - $this->FontSize = $size/$this->k; - $this->CurrentFont = $this->fonts[$fontkey]; - if($this->page>0) - $this->_out(sprintf('BT /F%d %.2F Tf ET',$this->CurrentFont['i'],$this->FontSizePt)); -} - -function SetFontSize($size) -{ - // Set font size in points - if($this->FontSizePt==$size) - return; - $this->FontSizePt = $size; - $this->FontSize = $size/$this->k; - if($this->page>0 && isset($this->CurrentFont)) - $this->_out(sprintf('BT /F%d %.2F Tf ET',$this->CurrentFont['i'],$this->FontSizePt)); -} - -function AddLink() -{ - // Create a new internal link - $n = count($this->links)+1; - $this->links[$n] = array(0, 0); - return $n; -} - -function SetLink($link, $y=0, $page=-1) -{ - // Set destination of internal link - if($y==-1) - $y = $this->y; - if($page==-1) - $page = $this->page; - $this->links[$link] = array($page, $y); -} - -function Link($x, $y, $w, $h, $link) -{ - // Put a link on the page - $this->PageLinks[$this->page][] = array($x*$this->k, $this->hPt-$y*$this->k, $w*$this->k, $h*$this->k, $link); -} - -function Text($x, $y, $txt) -{ - // Output a string - if(!isset($this->CurrentFont)) - $this->Error('No font has been set'); - $txt = (string)$txt; - $s = sprintf('BT %.2F %.2F Td (%s) Tj ET',$x*$this->k,($this->h-$y)*$this->k,$this->_escape($txt)); - if($this->underline && $txt!=='') - $s .= ' '.$this->_dounderline($x,$y,$txt); - if($this->ColorFlag) - $s = 'q '.$this->TextColor.' '.$s.' 
Q'; - $this->_out($s); -} - -function AcceptPageBreak() -{ - // Accept automatic page break or not - return $this->AutoPageBreak; -} - -function Cell($w, $h=0, $txt='', $border=0, $ln=0, $align='', $fill=false, $link='') -{ - // Output a cell - $k = $this->k; - if($this->y+$h>$this->PageBreakTrigger && !$this->InHeader && !$this->InFooter && $this->AcceptPageBreak()) - { - // Automatic page break - $x = $this->x; - $ws = $this->ws; - if($ws>0) - { - $this->ws = 0; - $this->_out('0 Tw'); - } - $this->AddPage($this->CurOrientation,$this->CurPageSize,$this->CurRotation); - $this->x = $x; - if($ws>0) - { - $this->ws = $ws; - $this->_out(sprintf('%.3F Tw',$ws*$k)); - } - } - if($w==0) - $w = $this->w-$this->rMargin-$this->x; - $s = ''; - if($fill || $border==1) - { - if($fill) - $op = ($border==1) ? 'B' : 'f'; - else - $op = 'S'; - $s = sprintf('%.2F %.2F %.2F %.2F re %s ',$this->x*$k,($this->h-$this->y)*$k,$w*$k,-$h*$k,$op); - } - if(is_string($border)) - { - $x = $this->x; - $y = $this->y; - if(strpos($border,'L')!==false) - $s .= sprintf('%.2F %.2F m %.2F %.2F l S ',$x*$k,($this->h-$y)*$k,$x*$k,($this->h-($y+$h))*$k); - if(strpos($border,'T')!==false) - $s .= sprintf('%.2F %.2F m %.2F %.2F l S ',$x*$k,($this->h-$y)*$k,($x+$w)*$k,($this->h-$y)*$k); - if(strpos($border,'R')!==false) - $s .= sprintf('%.2F %.2F m %.2F %.2F l S ',($x+$w)*$k,($this->h-$y)*$k,($x+$w)*$k,($this->h-($y+$h))*$k); - if(strpos($border,'B')!==false) - $s .= sprintf('%.2F %.2F m %.2F %.2F l S ',$x*$k,($this->h-($y+$h))*$k,($x+$w)*$k,($this->h-($y+$h))*$k); - } - $txt = (string)$txt; - if($txt!=='') - { - if(!isset($this->CurrentFont)) - $this->Error('No font has been set'); - if($align=='R') - $dx = $w-$this->cMargin-$this->GetStringWidth($txt); - elseif($align=='C') - $dx = ($w-$this->GetStringWidth($txt))/2; - else - $dx = $this->cMargin; - if($this->ColorFlag) - $s .= 'q '.$this->TextColor.' '; - $s .= sprintf('BT %.2F %.2F Td (%s) Tj ET',($this->x+$dx)*$k,($this->h-($this->y+.5*$h+.3*$this->FontSize))*$k,$this->_escape($txt)); - if($this->underline) - $s .= ' '.$this->_dounderline($this->x+$dx,$this->y+.5*$h+.3*$this->FontSize,$txt); - if($this->ColorFlag) - $s .= ' Q'; - if($link) - $this->Link($this->x+$dx,$this->y+.5*$h-.5*$this->FontSize,$this->GetStringWidth($txt),$this->FontSize,$link); - } - if($s) - $this->_out($s); - $this->lasth = $h; - if($ln>0) - { - // Go to next line - $this->y += $h; - if($ln==1) - $this->x = $this->lMargin; - } - else - $this->x += $w; -} - -function MultiCell($w, $h, $txt, $border=0, $align='J', $fill=false) -{ - // Output text with automatic or explicit line breaks - if(!isset($this->CurrentFont)) - $this->Error('No font has been set'); - $cw = $this->CurrentFont['cw']; - if($w==0) - $w = $this->w-$this->rMargin-$this->x; - $wmax = ($w-2*$this->cMargin)*1000/$this->FontSize; - $s = str_replace("\r",'',(string)$txt); - $nb = strlen($s); - if($nb>0 && $s[$nb-1]=="\n") - $nb--; - $b = 0; - if($border) - { - if($border==1) - { - $border = 'LTRB'; - $b = 'LRT'; - $b2 = 'LR'; - } - else - { - $b2 = ''; - if(strpos($border,'L')!==false) - $b2 .= 'L'; - if(strpos($border,'R')!==false) - $b2 .= 'R'; - $b = (strpos($border,'T')!==false) ? 
$b2.'T' : $b2; - } - } - $sep = -1; - $i = 0; - $j = 0; - $l = 0; - $ns = 0; - $nl = 1; - while($i<$nb) - { - // Get next character - $c = $s[$i]; - if($c=="\n") - { - // Explicit line break - if($this->ws>0) - { - $this->ws = 0; - $this->_out('0 Tw'); - } - $this->Cell($w,$h,substr($s,$j,$i-$j),$b,2,$align,$fill); - $i++; - $sep = -1; - $j = $i; - $l = 0; - $ns = 0; - $nl++; - if($border && $nl==2) - $b = $b2; - continue; - } - if($c==' ') - { - $sep = $i; - $ls = $l; - $ns++; - } - $l += $cw[$c]; - if($l>$wmax) - { - // Automatic line break - if($sep==-1) - { - if($i==$j) - $i++; - if($this->ws>0) - { - $this->ws = 0; - $this->_out('0 Tw'); - } - $this->Cell($w,$h,substr($s,$j,$i-$j),$b,2,$align,$fill); - } - else - { - if($align=='J') - { - $this->ws = ($ns>1) ? ($wmax-$ls)/1000*$this->FontSize/($ns-1) : 0; - $this->_out(sprintf('%.3F Tw',$this->ws*$this->k)); - } - $this->Cell($w,$h,substr($s,$j,$sep-$j),$b,2,$align,$fill); - $i = $sep+1; - } - $sep = -1; - $j = $i; - $l = 0; - $ns = 0; - $nl++; - if($border && $nl==2) - $b = $b2; - } - else - $i++; - } - // Last chunk - if($this->ws>0) - { - $this->ws = 0; - $this->_out('0 Tw'); - } - if($border && strpos($border,'B')!==false) - $b .= 'B'; - $this->Cell($w,$h,substr($s,$j,$i-$j),$b,2,$align,$fill); - $this->x = $this->lMargin; -} - -function Write($h, $txt, $link='') -{ - // Output text in flowing mode - if(!isset($this->CurrentFont)) - $this->Error('No font has been set'); - $cw = $this->CurrentFont['cw']; - $w = $this->w-$this->rMargin-$this->x; - $wmax = ($w-2*$this->cMargin)*1000/$this->FontSize; - $s = str_replace("\r",'',(string)$txt); - $nb = strlen($s); - $sep = -1; - $i = 0; - $j = 0; - $l = 0; - $nl = 1; - while($i<$nb) - { - // Get next character - $c = $s[$i]; - if($c=="\n") - { - // Explicit line break - $this->Cell($w,$h,substr($s,$j,$i-$j),0,2,'',false,$link); - $i++; - $sep = -1; - $j = $i; - $l = 0; - if($nl==1) - { - $this->x = $this->lMargin; - $w = $this->w-$this->rMargin-$this->x; - $wmax = ($w-2*$this->cMargin)*1000/$this->FontSize; - } - $nl++; - continue; - } - if($c==' ') - $sep = $i; - $l += $cw[$c]; - if($l>$wmax) - { - // Automatic line break - if($sep==-1) - { - if($this->x>$this->lMargin) - { - // Move to next line - $this->x = $this->lMargin; - $this->y += $h; - $w = $this->w-$this->rMargin-$this->x; - $wmax = ($w-2*$this->cMargin)*1000/$this->FontSize; - $i++; - $nl++; - continue; - } - if($i==$j) - $i++; - $this->Cell($w,$h,substr($s,$j,$i-$j),0,2,'',false,$link); - } - else - { - $this->Cell($w,$h,substr($s,$j,$sep-$j),0,2,'',false,$link); - $i = $sep+1; - } - $sep = -1; - $j = $i; - $l = 0; - if($nl==1) - { - $this->x = $this->lMargin; - $w = $this->w-$this->rMargin-$this->x; - $wmax = ($w-2*$this->cMargin)*1000/$this->FontSize; - } - $nl++; - } - else - $i++; - } - // Last chunk - if($i!=$j) - $this->Cell($l/1000*$this->FontSize,$h,substr($s,$j),0,0,'',false,$link); -} - -function Ln($h=null) -{ - // Line feed; default value is the last cell height - $this->x = $this->lMargin; - if($h===null) - $this->y += $this->lasth; - else - $this->y += $h; -} - -function Image($file, $x=null, $y=null, $w=0, $h=0, $type='', $link='') -{ - // Put an image on the page - if($file=='') - $this->Error('Image file name is empty'); - if(!isset($this->images[$file])) - { - // First use of this image, get info - if($type=='') - { - $pos = strrpos($file,'.'); - if(!$pos) - $this->Error('Image file has no extension and no type was specified: '.$file); - $type = substr($file,$pos+1); - } - $type = strtolower($type); - 
if($type=='jpeg') - $type = 'jpg'; - $mtd = '_parse'.$type; - if(!method_exists($this,$mtd)) - $this->Error('Unsupported image type: '.$type); - $info = $this->$mtd($file); - $info['i'] = count($this->images)+1; - $this->images[$file] = $info; - } - else - $info = $this->images[$file]; - - // Automatic width and height calculation if needed - if($w==0 && $h==0) - { - // Put image at 96 dpi - $w = -96; - $h = -96; - } - if($w<0) - $w = -$info['w']*72/$w/$this->k; - if($h<0) - $h = -$info['h']*72/$h/$this->k; - if($w==0) - $w = $h*$info['w']/$info['h']; - if($h==0) - $h = $w*$info['h']/$info['w']; - - // Flowing mode - if($y===null) - { - if($this->y+$h>$this->PageBreakTrigger && !$this->InHeader && !$this->InFooter && $this->AcceptPageBreak()) - { - // Automatic page break - $x2 = $this->x; - $this->AddPage($this->CurOrientation,$this->CurPageSize,$this->CurRotation); - $this->x = $x2; - } - $y = $this->y; - $this->y += $h; - } - - if($x===null) - $x = $this->x; - $this->_out(sprintf('q %.2F 0 0 %.2F %.2F %.2F cm /I%d Do Q',$w*$this->k,$h*$this->k,$x*$this->k,($this->h-($y+$h))*$this->k,$info['i'])); - if($link) - $this->Link($x,$y,$w,$h,$link); -} - -function GetPageWidth() -{ - // Get current page width - return $this->w; -} - -function GetPageHeight() -{ - // Get current page height - return $this->h; -} - -function GetX() -{ - // Get x position - return $this->x; -} - -function SetX($x) -{ - // Set x position - if($x>=0) - $this->x = $x; - else - $this->x = $this->w+$x; -} - -function GetY() -{ - // Get y position - return $this->y; -} - -function SetY($y, $resetX=true) -{ - // Set y position and optionally reset x - if($y>=0) - $this->y = $y; - else - $this->y = $this->h+$y; - if($resetX) - $this->x = $this->lMargin; -} - -function SetXY($x, $y) -{ - // Set x and y positions - $this->SetX($x); - $this->SetY($y,false); -} - -function Output($dest='', $name='', $isUTF8=false) -{ - // Output PDF to some destination - $this->Close(); - if(strlen($name)==1 && strlen($dest)!=1) - { - // Fix parameter order - $tmp = $dest; - $dest = $name; - $name = $tmp; - } - if($dest=='') - $dest = 'I'; - if($name=='') - $name = 'doc.pdf'; - switch(strtoupper($dest)) - { - case 'I': - // Send to standard output - $this->_checkoutput(); - if(PHP_SAPI!='cli') - { - // We send to a browser - header('Content-Type: application/pdf'); - header('Content-Disposition: inline; '.$this->_httpencode('filename',$name,$isUTF8)); - header('Cache-Control: private, max-age=0, must-revalidate'); - header('Pragma: public'); - } - echo $this->buffer; - break; - case 'D': - // Download file - $this->_checkoutput(); - header('Content-Type: application/pdf'); - header('Content-Disposition: attachment; '.$this->_httpencode('filename',$name,$isUTF8)); - header('Cache-Control: private, max-age=0, must-revalidate'); - header('Pragma: public'); - echo $this->buffer; - break; - case 'F': - // Save to local file - if(!file_put_contents($name,$this->buffer)) - $this->Error('Unable to create output file: '.$name); - break; - case 'S': - // Return as a string - return $this->buffer; - default: - $this->Error('Incorrect output destination: '.$dest); - } - return ''; -} - -/******************************************************************************* -* Protected methods * -*******************************************************************************/ - -protected function _checkoutput() -{ - if(PHP_SAPI!='cli') - { - if(headers_sent($file,$line)) - $this->Error("Some data has already been output, can't send PDF file (output started at 
$file:$line)"); - } - if(ob_get_length()) - { - // The output buffer is not empty - if(preg_match('/^(\xEF\xBB\xBF)?\s*$/',ob_get_contents())) - { - // It contains only a UTF-8 BOM and/or whitespace, let's clean it - ob_clean(); - } - else - $this->Error("Some data has already been output, can't send PDF file"); - } -} - -protected function _getpagesize($size) -{ - if(is_string($size)) - { - $size = strtolower($size); - if(!isset($this->StdPageSizes[$size])) - $this->Error('Unknown page size: '.$size); - $a = $this->StdPageSizes[$size]; - return array($a[0]/$this->k, $a[1]/$this->k); - } - else - { - if($size[0]>$size[1]) - return array($size[1], $size[0]); - else - return $size; - } -} - -protected function _beginpage($orientation, $size, $rotation) -{ - $this->page++; - $this->pages[$this->page] = ''; - $this->PageLinks[$this->page] = array(); - $this->state = 2; - $this->x = $this->lMargin; - $this->y = $this->tMargin; - $this->FontFamily = ''; - // Check page size and orientation - if($orientation=='') - $orientation = $this->DefOrientation; - else - $orientation = strtoupper($orientation[0]); - if($size=='') - $size = $this->DefPageSize; - else - $size = $this->_getpagesize($size); - if($orientation!=$this->CurOrientation || $size[0]!=$this->CurPageSize[0] || $size[1]!=$this->CurPageSize[1]) - { - // New size or orientation - if($orientation=='P') - { - $this->w = $size[0]; - $this->h = $size[1]; - } - else - { - $this->w = $size[1]; - $this->h = $size[0]; - } - $this->wPt = $this->w*$this->k; - $this->hPt = $this->h*$this->k; - $this->PageBreakTrigger = $this->h-$this->bMargin; - $this->CurOrientation = $orientation; - $this->CurPageSize = $size; - } - if($orientation!=$this->DefOrientation || $size[0]!=$this->DefPageSize[0] || $size[1]!=$this->DefPageSize[1]) - $this->PageInfo[$this->page]['size'] = array($this->wPt, $this->hPt); - if($rotation!=0) - { - if($rotation%90!=0) - $this->Error('Incorrect rotation value: '.$rotation); - $this->PageInfo[$this->page]['rotation'] = $rotation; - } - $this->CurRotation = $rotation; -} - -protected function _endpage() -{ - $this->state = 1; -} - -protected function _loadfont($path) -{ - // Load a font definition file - include($path); - if(!isset($name)) - $this->Error('Could not include font definition file: '.$path); - if(isset($enc)) - $enc = strtolower($enc); - if(!isset($subsetted)) - $subsetted = false; - return get_defined_vars(); -} - -protected function _isascii($s) -{ - // Test if string is ASCII - $nb = strlen($s); - for($i=0;$i<$nb;$i++) - { - if(ord($s[$i])>127) - return false; - } - return true; -} - -protected function _httpencode($param, $value, $isUTF8) -{ - // Encode HTTP header field parameter - if($this->_isascii($value)) - return $param.'="'.$value.'"'; - if(!$isUTF8) - $value = $this->_UTF8encode($value); - return $param."*=UTF-8''".rawurlencode($value); -} - -protected function _UTF8encode($s) -{ - // Convert ISO-8859-1 to UTF-8 - if($this->iconv) - return iconv('ISO-8859-1','UTF-8',$s); - $res = ''; - $nb = strlen($s); - for($i=0;$i<$nb;$i++) - { - $c = $s[$i]; - $v = ord($c); - if($v>=128) - { - $res .= chr(0xC0 | ($v >> 6)); - $res .= chr(0x80 | ($v & 0x3F)); - } - else - $res .= $c; - } - return $res; -} - -protected function _UTF8toUTF16($s) -{ - // Convert UTF-8 to UTF-16BE with BOM - $res = "\xFE\xFF"; - if($this->iconv) - return $res.iconv('UTF-8','UTF-16BE',$s); - $nb = strlen($s); - $i = 0; - while($i<$nb) - { - $c1 = ord($s[$i++]); - if($c1>=224) - { - // 3-byte character - $c2 = ord($s[$i++]); - $c3 = 
ord($s[$i++]); - $res .= chr((($c1 & 0x0F)<<4) + (($c2 & 0x3C)>>2)); - $res .= chr((($c2 & 0x03)<<6) + ($c3 & 0x3F)); - } - elseif($c1>=192) - { - // 2-byte character - $c2 = ord($s[$i++]); - $res .= chr(($c1 & 0x1C)>>2); - $res .= chr((($c1 & 0x03)<<6) + ($c2 & 0x3F)); - } - else - { - // Single-byte character - $res .= "\0".chr($c1); - } - } - return $res; -} - -protected function _escape($s) -{ - // Escape special characters - if(strpos($s,'(')!==false || strpos($s,')')!==false || strpos($s,'\\')!==false || strpos($s,"\r")!==false) - return str_replace(array('\\','(',')',"\r"), array('\\\\','\\(','\\)','\\r'), $s); - else - return $s; -} - -protected function _textstring($s) -{ - // Format a text string - if(!$this->_isascii($s)) - $s = $this->_UTF8toUTF16($s); - return '('.$this->_escape($s).')'; -} - -protected function _dounderline($x, $y, $txt) -{ - // Underline text - $up = $this->CurrentFont['up']; - $ut = $this->CurrentFont['ut']; - $w = $this->GetStringWidth($txt)+$this->ws*substr_count($txt,' '); - return sprintf('%.2F %.2F %.2F %.2F re f',$x*$this->k,($this->h-($y-$up/1000*$this->FontSize))*$this->k,$w*$this->k,-$ut/1000*$this->FontSizePt); -} - -protected function _parsejpg($file) -{ - // Extract info from a JPEG file - $a = getimagesize($file); - if(!$a) - $this->Error('Missing or incorrect image file: '.$file); - if($a[2]!=2) - $this->Error('Not a JPEG file: '.$file); - if(!isset($a['channels']) || $a['channels']==3) - $colspace = 'DeviceRGB'; - elseif($a['channels']==4) - $colspace = 'DeviceCMYK'; - else - $colspace = 'DeviceGray'; - $bpc = isset($a['bits']) ? $a['bits'] : 8; - $data = file_get_contents($file); - return array('w'=>$a[0], 'h'=>$a[1], 'cs'=>$colspace, 'bpc'=>$bpc, 'f'=>'DCTDecode', 'data'=>$data); -} - -protected function _parsepng($file) -{ - // Extract info from a PNG file - $f = fopen($file,'rb'); - if(!$f) - $this->Error('Can\'t open image file: '.$file); - $info = $this->_parsepngstream($f,$file); - fclose($f); - return $info; -} - -protected function _parsepngstream($f, $file) -{ - // Check signature - if($this->_readstream($f,8)!=chr(137).'PNG'.chr(13).chr(10).chr(26).chr(10)) - $this->Error('Not a PNG file: '.$file); - - // Read header chunk - $this->_readstream($f,4); - if($this->_readstream($f,4)!='IHDR') - $this->Error('Incorrect PNG file: '.$file); - $w = $this->_readint($f); - $h = $this->_readint($f); - $bpc = ord($this->_readstream($f,1)); - if($bpc>8) - $this->Error('16-bit depth not supported: '.$file); - $ct = ord($this->_readstream($f,1)); - if($ct==0 || $ct==4) - $colspace = 'DeviceGray'; - elseif($ct==2 || $ct==6) - $colspace = 'DeviceRGB'; - elseif($ct==3) - $colspace = 'Indexed'; - else - $this->Error('Unknown color type: '.$file); - if(ord($this->_readstream($f,1))!=0) - $this->Error('Unknown compression method: '.$file); - if(ord($this->_readstream($f,1))!=0) - $this->Error('Unknown filter method: '.$file); - if(ord($this->_readstream($f,1))!=0) - $this->Error('Interlacing not supported: '.$file); - $this->_readstream($f,4); - $dp = '/Predictor 15 /Colors '.($colspace=='DeviceRGB' ? 3 : 1).' /BitsPerComponent '.$bpc.' 
/Columns '.$w; - - // Scan chunks looking for palette, transparency and image data - $pal = ''; - $trns = ''; - $data = ''; - do - { - $n = $this->_readint($f); - $type = $this->_readstream($f,4); - if($type=='PLTE') - { - // Read palette - $pal = $this->_readstream($f,$n); - $this->_readstream($f,4); - } - elseif($type=='tRNS') - { - // Read transparency info - $t = $this->_readstream($f,$n); - if($ct==0) - $trns = array(ord(substr($t,1,1))); - elseif($ct==2) - $trns = array(ord(substr($t,1,1)), ord(substr($t,3,1)), ord(substr($t,5,1))); - else - { - $pos = strpos($t,chr(0)); - if($pos!==false) - $trns = array($pos); - } - $this->_readstream($f,4); - } - elseif($type=='IDAT') - { - // Read image data block - $data .= $this->_readstream($f,$n); - $this->_readstream($f,4); - } - elseif($type=='IEND') - break; - else - $this->_readstream($f,$n+4); - } - while($n); - - if($colspace=='Indexed' && empty($pal)) - $this->Error('Missing palette in '.$file); - $info = array('w'=>$w, 'h'=>$h, 'cs'=>$colspace, 'bpc'=>$bpc, 'f'=>'FlateDecode', 'dp'=>$dp, 'pal'=>$pal, 'trns'=>$trns); - if($ct>=4) - { - // Extract alpha channel - if(!function_exists('gzuncompress')) - $this->Error('Zlib not available, can\'t handle alpha channel: '.$file); - $data = gzuncompress($data); - $color = ''; - $alpha = ''; - if($ct==4) - { - // Gray image - $len = 2*$w; - for($i=0;$i<$h;$i++) - { - $pos = (1+$len)*$i; - $color .= $data[$pos]; - $alpha .= $data[$pos]; - $line = substr($data,$pos+1,$len); - $color .= preg_replace('/(.)./s','$1',$line); - $alpha .= preg_replace('/.(.)/s','$1',$line); - } - } - else - { - // RGB image - $len = 4*$w; - for($i=0;$i<$h;$i++) - { - $pos = (1+$len)*$i; - $color .= $data[$pos]; - $alpha .= $data[$pos]; - $line = substr($data,$pos+1,$len); - $color .= preg_replace('/(.{3})./s','$1',$line); - $alpha .= preg_replace('/.{3}(.)/s','$1',$line); - } - } - unset($data); - $data = gzcompress($color); - $info['smask'] = gzcompress($alpha); - $this->WithAlpha = true; - if($this->PDFVersion<'1.4') - $this->PDFVersion = '1.4'; - } - $info['data'] = $data; - return $info; -} - -protected function _readstream($f, $n) -{ - // Read n bytes from stream - $res = ''; - while($n>0 && !feof($f)) - { - $s = fread($f,$n); - if($s===false) - $this->Error('Error while reading stream'); - $n -= strlen($s); - $res .= $s; - } - if($n>0) - $this->Error('Unexpected end of stream'); - return $res; -} - -protected function _readint($f) -{ - // Read a 4-byte integer from stream - $a = unpack('Ni',$this->_readstream($f,4)); - return $a['i']; -} - -protected function _parsegif($file) -{ - // Extract info from a GIF file (via PNG conversion) - if(!function_exists('imagepng')) - $this->Error('GD extension is required for GIF support'); - if(!function_exists('imagecreatefromgif')) - $this->Error('GD has no GIF read support'); - $im = imagecreatefromgif($file); - if(!$im) - $this->Error('Missing or incorrect image file: '.$file); - imageinterlace($im,0); - ob_start(); - imagepng($im); - $data = ob_get_clean(); - imagedestroy($im); - $f = fopen('php://temp','rb+'); - if(!$f) - $this->Error('Unable to create memory stream'); - fwrite($f,$data); - rewind($f); - $info = $this->_parsepngstream($f,$file); - fclose($f); - return $info; -} - -protected function _out($s) -{ - // Add a line to the current page - if($this->state==2) - $this->pages[$this->page] .= $s."\n"; - elseif($this->state==0) - $this->Error('No page has been added yet'); - elseif($this->state==1) - $this->Error('Invalid call'); - elseif($this->state==3) - 
$this->Error('The document is closed'); -} - -protected function _put($s) -{ - // Add a line to the document - $this->buffer .= $s."\n"; -} - -protected function _getoffset() -{ - return strlen($this->buffer); -} - -protected function _newobj($n=null) -{ - // Begin a new object - if($n===null) - $n = ++$this->n; - $this->offsets[$n] = $this->_getoffset(); - $this->_put($n.' 0 obj'); -} - -protected function _putstream($data) -{ - $this->_put('stream'); - $this->_put($data); - $this->_put('endstream'); -} - -protected function _putstreamobject($data) -{ - if($this->compress) - { - $entries = '/Filter /FlateDecode '; - $data = gzcompress($data); - } - else - $entries = ''; - $entries .= '/Length '.strlen($data); - $this->_newobj(); - $this->_put('<<'.$entries.'>>'); - $this->_putstream($data); - $this->_put('endobj'); -} - -protected function _putlinks($n) -{ - foreach($this->PageLinks[$n] as $pl) - { - $this->_newobj(); - $rect = sprintf('%.2F %.2F %.2F %.2F',$pl[0],$pl[1],$pl[0]+$pl[2],$pl[1]-$pl[3]); - $s = '<</Type /Annot /Subtype /Link /Rect ['.$rect.'] /Border [0 0 0] '; - if(is_string($pl[4])) - $s .= '/A <</S /URI /URI '.$this->_textstring($pl[4]).'>>>>'; - else - { - $l = $this->links[$pl[4]]; - if(isset($this->PageInfo[$l[0]]['size'])) - $h = $this->PageInfo[$l[0]]['size'][1]; - else - $h = ($this->DefOrientation=='P') ? $this->DefPageSize[1]*$this->k : $this->DefPageSize[0]*$this->k; - $s .= sprintf('/Dest [%d 0 R /XYZ 0 %.2F null]>>',$this->PageInfo[$l[0]]['n'],$h-$l[1]*$this->k); - } - $this->_put($s); - $this->_put('endobj'); - } -} - -protected function _putpage($n) -{ - $this->_newobj(); - $this->_put('<</Type /Page'); - $this->_put('/Parent 1 0 R'); - if(isset($this->PageInfo[$n]['size'])) - $this->_put(sprintf('/MediaBox [0 0 %.2F %.2F]',$this->PageInfo[$n]['size'][0],$this->PageInfo[$n]['size'][1])); - if(isset($this->PageInfo[$n]['rotation'])) - $this->_put('/Rotate '.$this->PageInfo[$n]['rotation']); - $this->_put('/Resources 2 0 R'); - if(!empty($this->PageLinks[$n])) - { - $s = '/Annots ['; - foreach($this->PageLinks[$n] as $pl) - $s .= $pl[5].' 0 R '; - $s .= ']'; - $this->_put($s); - } - if($this->WithAlpha) - $this->_put('/Group <</Type /Group /S /Transparency /CS /DeviceRGB>>'); - $this->_put('/Contents '.($this->n+1).' 0 R>>'); - $this->_put('endobj'); - // Page content - if(!empty($this->AliasNbPages)) - $this->pages[$n] = str_replace($this->AliasNbPages,$this->page,$this->pages[$n]); - $this->_putstreamobject($this->pages[$n]); - // Link annotations - $this->_putlinks($n); -} - -protected function _putpages() -{ - $nb = $this->page; - $n = $this->n; - for($i=1;$i<=$nb;$i++) - { - $this->PageInfo[$i]['n'] = ++$n; - $n++; - foreach($this->PageLinks[$i] as &$pl) - $pl[5] = ++$n; - unset($pl); - } - for($i=1;$i<=$nb;$i++) - $this->_putpage($i); - // Pages root - $this->_newobj(1); - $this->_put('<</Type /Pages'); - $kids = '/Kids ['; - for($i=1;$i<=$nb;$i++) - $kids .= $this->PageInfo[$i]['n'].' 
0 R '; - $kids .= ']'; - $this->_put($kids); - $this->_put('/Count '.$nb); - if($this->DefOrientation=='P') - { - $w = $this->DefPageSize[0]; - $h = $this->DefPageSize[1]; - } - else - { - $w = $this->DefPageSize[1]; - $h = $this->DefPageSize[0]; - } - $this->_put(sprintf('/MediaBox [0 0 %.2F %.2F]',$w*$this->k,$h*$this->k)); - $this->_put('>>'); - $this->_put('endobj'); -} - -protected function _putfonts() -{ - foreach($this->FontFiles as $file=>$info) - { - // Font file embedding - $this->_newobj(); - $this->FontFiles[$file]['n'] = $this->n; - $font = file_get_contents($file); - if(!$font) - $this->Error('Font file not found: '.$file); - $compressed = (substr($file,-2)=='.z'); - if(!$compressed && isset($info['length2'])) - $font = substr($font,6,$info['length1']).substr($font,6+$info['length1']+6,$info['length2']); - $this->_put('<</Length '.strlen($font)); - if($compressed) - $this->_put('/Filter /FlateDecode'); - $this->_put('/Length1 '.$info['length1']); - if(isset($info['length2'])) - $this->_put('/Length2 '.$info['length2'].' /Length3 0'); - $this->_put('>>'); - $this->_putstream($font); - $this->_put('endobj'); - } - foreach($this->fonts as $k=>$font) - { - // Encoding - if(isset($font['diff'])) - { - if(!isset($this->encodings[$font['enc']])) - { - $this->_newobj(); - $this->_put('<</Type /Encoding /BaseEncoding /WinAnsiEncoding /Differences ['.$font['diff'].']>>'); - $this->_put('endobj'); - $this->encodings[$font['enc']] = $this->n; - } - } - // ToUnicode CMap - if(isset($font['uv'])) - { - if(isset($font['enc'])) - $cmapkey = $font['enc']; - else - $cmapkey = $font['name']; - if(!isset($this->cmaps[$cmapkey])) - { - $cmap = $this->_tounicodecmap($font['uv']); - $this->_putstreamobject($cmap); - $this->cmaps[$cmapkey] = $this->n; - } - } - // Font object - $this->fonts[$k]['n'] = $this->n+1; - $type = $font['type']; - $name = $font['name']; - if($font['subsetted']) - $name = 'AAAAAA+'.$name; - if($type=='Core') - { - // Core font - $this->_newobj(); - $this->_put('<</Type /Font'); - $this->_put('/BaseFont /'.$name); - $this->_put('/Subtype /Type1'); - if($name!='Symbol' && $name!='ZapfDingbats') - $this->_put('/Encoding /WinAnsiEncoding'); - if(isset($font['uv'])) - $this->_put('/ToUnicode '.$this->cmaps[$cmapkey].' 0 R'); - $this->_put('>>'); - $this->_put('endobj'); - } - elseif($type=='Type1' || $type=='TrueType') - { - // Additional Type1 or TrueType/OpenType font - $this->_newobj(); - $this->_put('<</Type /Font'); - $this->_put('/BaseFont /'.$name); - $this->_put('/Subtype /'.$type); - $this->_put('/FirstChar 32 /LastChar 255'); - $this->_put('/Widths '.($this->n+1).' 0 R'); - $this->_put('/FontDescriptor '.($this->n+2).' 0 R'); - if(isset($font['diff'])) - $this->_put('/Encoding '.$this->encodings[$font['enc']].' 0 R'); - else - $this->_put('/Encoding /WinAnsiEncoding'); - if(isset($font['uv'])) - $this->_put('/ToUnicode '.$this->cmaps[$cmapkey].' 0 R'); - $this->_put('>>'); - $this->_put('endobj'); - // Widths - $this->_newobj(); - $cw = $font['cw']; - $s = '['; - for($i=32;$i<=255;$i++) - $s .= $cw[chr($i)].' '; - $this->_put($s.']'); - $this->_put('endobj'); - // Descriptor - $this->_newobj(); - $s = '<</Type /FontDescriptor /FontName /'.$name; - foreach($font['desc'] as $k=>$v) - $s .= ' /'.$k.' '.$v; - if(!empty($font['file'])) - $s .= ' /FontFile'.($type=='Type1' ? '' : '2').' '.$this->FontFiles[$font['file']]['n'].' 
0 R'; - $this->_put($s.'>>'); - $this->_put('endobj'); - } - else - { - // Allow for additional types - $mtd = '_put'.strtolower($type); - if(!method_exists($this,$mtd)) - $this->Error('Unsupported font type: '.$type); - $this->$mtd($font); - } - } -} - -protected function _tounicodecmap($uv) -{ - $ranges = ''; - $nbr = 0; - $chars = ''; - $nbc = 0; - foreach($uv as $c=>$v) - { - if(is_array($v)) - { - $ranges .= sprintf("<%02X> <%02X> <%04X>\n",$c,$c+$v[1]-1,$v[0]); - $nbr++; - } - else - { - $chars .= sprintf("<%02X> <%04X>\n",$c,$v); - $nbc++; - } - } - $s = "/CIDInit /ProcSet findresource begin\n"; - $s .= "12 dict begin\n"; - $s .= "begincmap\n"; - $s .= "/CIDSystemInfo\n"; - $s .= "<</Registry (Adobe)\n"; - $s .= "/Ordering (UCS)\n"; - $s .= "/Supplement 0\n"; - $s .= ">> def\n"; - $s .= "/CMapName /Adobe-Identity-UCS def\n"; - $s .= "/CMapType 2 def\n"; - $s .= "1 begincodespacerange\n"; - $s .= "<00> <FF>\n"; - $s .= "endcodespacerange\n"; - if($nbr>0) - { - $s .= "$nbr beginbfrange\n"; - $s .= $ranges; - $s .= "endbfrange\n"; - } - if($nbc>0) - { - $s .= "$nbc beginbfchar\n"; - $s .= $chars; - $s .= "endbfchar\n"; - } - $s .= "endcmap\n"; - $s .= "CMapName currentdict /CMap defineresource pop\n"; - $s .= "end\n"; - $s .= "end"; - return $s; -} - -protected function _putimages() -{ - foreach(array_keys($this->images) as $file) - { - $this->_putimage($this->images[$file]); - unset($this->images[$file]['data']); - unset($this->images[$file]['smask']); - } -} - -protected function _putimage(&$info) -{ - $this->_newobj(); - $info['n'] = $this->n; - $this->_put('<</Type /XObject'); - $this->_put('/Subtype /Image'); - $this->_put('/Width '.$info['w']); - $this->_put('/Height '.$info['h']); - if($info['cs']=='Indexed') - $this->_put('/ColorSpace [/Indexed /DeviceRGB '.(strlen($info['pal'])/3-1).' '.($this->n+1).' 0 R]'); - else - { - $this->_put('/ColorSpace /'.$info['cs']); - if($info['cs']=='DeviceCMYK') - $this->_put('/Decode [1 0 1 0 1 0 1 0]'); - } - $this->_put('/BitsPerComponent '.$info['bpc']); - if(isset($info['f'])) - $this->_put('/Filter /'.$info['f']); - if(isset($info['dp'])) - $this->_put('/DecodeParms <<'.$info['dp'].'>>'); - if(isset($info['trns']) && is_array($info['trns'])) - { - $trns = ''; - for($i=0;$i<count($info['trns']);$i++) - $trns .= $info['trns'][$i].' '.$info['trns'][$i].' '; - $this->_put('/Mask ['.$trns.']'); - } - if(isset($info['smask'])) - $this->_put('/SMask '.($this->n+1).' 0 R'); - $this->_put('/Length '.strlen($info['data']).'>>'); - $this->_putstream($info['data']); - $this->_put('endobj'); - // Soft mask - if(isset($info['smask'])) - { - $dp = '/Predictor 15 /Colors 1 /BitsPerComponent 8 /Columns '.$info['w']; - $smask = array('w'=>$info['w'], 'h'=>$info['h'], 'cs'=>'DeviceGray', 'bpc'=>8, 'f'=>$info['f'], 'dp'=>$dp, 'data'=>$info['smask']); - $this->_putimage($smask); - } - // Palette - if($info['cs']=='Indexed') - $this->_putstreamobject($info['pal']); -} - -protected function _putxobjectdict() -{ - foreach($this->images as $image) - $this->_put('/I'.$image['i'].' '.$image['n'].' 0 R'); -} - -protected function _putresourcedict() -{ - $this->_put('/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]'); - $this->_put('/Font <<'); - foreach($this->fonts as $font) - $this->_put('/F'.$font['i'].' '.$font['n'].' 
0 R'); - $this->_put('>>'); - $this->_put('/XObject <<'); - $this->_putxobjectdict(); - $this->_put('>>'); -} - -protected function _putresources() -{ - $this->_putfonts(); - $this->_putimages(); - // Resource dictionary - $this->_newobj(2); - $this->_put('<<'); - $this->_putresourcedict(); - $this->_put('>>'); - $this->_put('endobj'); -} - -protected function _putinfo() -{ - $date = @date('YmdHisO',$this->CreationDate); - $this->metadata['CreationDate'] = 'D:'.substr($date,0,-2)."'".substr($date,-2)."'"; - foreach($this->metadata as $key=>$value) - $this->_put('/'.$key.' '.$this->_textstring($value)); -} - -protected function _putcatalog() -{ - $n = $this->PageInfo[1]['n']; - $this->_put('/Type /Catalog'); - $this->_put('/Pages 1 0 R'); - if($this->ZoomMode=='fullpage') - $this->_put('/OpenAction ['.$n.' 0 R /Fit]'); - elseif($this->ZoomMode=='fullwidth') - $this->_put('/OpenAction ['.$n.' 0 R /FitH null]'); - elseif($this->ZoomMode=='real') - $this->_put('/OpenAction ['.$n.' 0 R /XYZ null null 1]'); - elseif(!is_string($this->ZoomMode)) - $this->_put('/OpenAction ['.$n.' 0 R /XYZ null null '.sprintf('%.2F',$this->ZoomMode/100).']'); - if($this->LayoutMode=='single') - $this->_put('/PageLayout /SinglePage'); - elseif($this->LayoutMode=='continuous') - $this->_put('/PageLayout /OneColumn'); - elseif($this->LayoutMode=='two') - $this->_put('/PageLayout /TwoColumnLeft'); -} - -protected function _putheader() -{ - $this->_put('%PDF-'.$this->PDFVersion); -} - -protected function _puttrailer() -{ - $this->_put('/Size '.($this->n+1)); - $this->_put('/Root '.$this->n.' 0 R'); - $this->_put('/Info '.($this->n-1).' 0 R'); -} - -protected function _enddoc() -{ - $this->CreationDate = time(); - $this->_putheader(); - $this->_putpages(); - $this->_putresources(); - // Info - $this->_newobj(); - $this->_put('<<'); - $this->_putinfo(); - $this->_put('>>'); - $this->_put('endobj'); - // Catalog - $this->_newobj(); - $this->_put('<<'); - $this->_putcatalog(); - $this->_put('>>'); - $this->_put('endobj'); - // Cross-ref - $offset = $this->_getoffset(); - $this->_put('xref'); - $this->_put('0 '.($this->n+1)); - $this->_put('0000000000 65535 f '); - for($i=1;$i<=$this->n;$i++) - $this->_put(sprintf('%010d 00000 n ',$this->offsets[$i])); - // Trailer - $this->_put('trailer'); - $this->_put('<<'); - $this->_puttrailer(); - $this->_put('>>'); - $this->_put('startxref'); - $this->_put($offset); - $this->_put('%%EOF'); - $this->state = 3; -} -} -?> diff --git a/fpdf/license.txt b/fpdf/license.txt deleted file mode 100644 index fd811c6f56d9f0bb27d6ee98587dcb1928aa6db5..0000000000000000000000000000000000000000 --- a/fpdf/license.txt +++ /dev/null @@ -1,6 +0,0 @@ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software to use, copy, modify, distribute, sublicense, and/or sell -copies of the software, and to permit persons to whom the software is furnished -to do so. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. 
\ No newline at end of file diff --git a/fpdf/makefont/cp1250.map b/fpdf/makefont/cp1250.map deleted file mode 100644 index ec110af06108ab961c9eafd5fc45a7488ca6cce0..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1250.map +++ /dev/null @@ -1,251 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!89 U+2030 perthousand -!8A U+0160 Scaron -!8B U+2039 guilsinglleft -!8C U+015A Sacute -!8D U+0164 Tcaron -!8E U+017D Zcaron -!8F U+0179 Zacute -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!99 U+2122 trademark -!9A U+0161 scaron -!9B U+203A guilsinglright -!9C U+015B sacute -!9D U+0165 tcaron -!9E U+017E zcaron -!9F U+017A zacute -!A0 U+00A0 space -!A1 U+02C7 caron -!A2 U+02D8 breve -!A3 U+0141 Lslash -!A4 U+00A4 currency -!A5 U+0104 Aogonek -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+015E Scedilla -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+017B Zdotaccent -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+02DB ogonek -!B3 U+0142 lslash -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 
periodcentered -!B8 U+00B8 cedilla -!B9 U+0105 aogonek -!BA U+015F scedilla -!BB U+00BB guillemotright -!BC U+013D Lcaron -!BD U+02DD hungarumlaut -!BE U+013E lcaron -!BF U+017C zdotaccent -!C0 U+0154 Racute -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+0102 Abreve -!C4 U+00C4 Adieresis -!C5 U+0139 Lacute -!C6 U+0106 Cacute -!C7 U+00C7 Ccedilla -!C8 U+010C Ccaron -!C9 U+00C9 Eacute -!CA U+0118 Eogonek -!CB U+00CB Edieresis -!CC U+011A Ecaron -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+010E Dcaron -!D0 U+0110 Dcroat -!D1 U+0143 Nacute -!D2 U+0147 Ncaron -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+0150 Ohungarumlaut -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+0158 Rcaron -!D9 U+016E Uring -!DA U+00DA Uacute -!DB U+0170 Uhungarumlaut -!DC U+00DC Udieresis -!DD U+00DD Yacute -!DE U+0162 Tcommaaccent -!DF U+00DF germandbls -!E0 U+0155 racute -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+0103 abreve -!E4 U+00E4 adieresis -!E5 U+013A lacute -!E6 U+0107 cacute -!E7 U+00E7 ccedilla -!E8 U+010D ccaron -!E9 U+00E9 eacute -!EA U+0119 eogonek -!EB U+00EB edieresis -!EC U+011B ecaron -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+010F dcaron -!F0 U+0111 dcroat -!F1 U+0144 nacute -!F2 U+0148 ncaron -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+0151 ohungarumlaut -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+0159 rcaron -!F9 U+016F uring -!FA U+00FA uacute -!FB U+0171 uhungarumlaut -!FC U+00FC udieresis -!FD U+00FD yacute -!FE U+0163 tcommaaccent -!FF U+02D9 dotaccent diff --git a/fpdf/makefont/cp1251.map b/fpdf/makefont/cp1251.map deleted file mode 100644 index de6a198d99d9d17db29f02633e3b0e66c9a60e98..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1251.map +++ /dev/null @@ -1,255 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 
U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0402 afii10051 -!81 U+0403 afii10052 -!82 U+201A quotesinglbase -!83 U+0453 afii10100 -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!88 U+20AC Euro -!89 U+2030 perthousand -!8A U+0409 afii10058 -!8B U+2039 guilsinglleft -!8C U+040A afii10059 -!8D U+040C afii10061 -!8E U+040B afii10060 -!8F U+040F afii10145 -!90 U+0452 afii10099 -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!99 U+2122 trademark -!9A U+0459 afii10106 -!9B U+203A guilsinglright -!9C U+045A afii10107 -!9D U+045C afii10109 -!9E U+045B afii10108 -!9F U+045F afii10193 -!A0 U+00A0 space -!A1 U+040E afii10062 -!A2 U+045E afii10110 -!A3 U+0408 afii10057 -!A4 U+00A4 currency -!A5 U+0490 afii10050 -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+0401 afii10023 -!A9 U+00A9 copyright -!AA U+0404 afii10053 -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+0407 afii10056 -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+0406 afii10055 -!B3 U+0456 afii10103 -!B4 U+0491 afii10098 -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+0451 afii10071 -!B9 U+2116 afii61352 -!BA U+0454 afii10101 -!BB U+00BB guillemotright -!BC U+0458 afii10105 -!BD U+0405 afii10054 -!BE U+0455 afii10102 -!BF U+0457 afii10104 -!C0 U+0410 afii10017 -!C1 U+0411 afii10018 -!C2 U+0412 afii10019 -!C3 U+0413 afii10020 -!C4 U+0414 afii10021 -!C5 U+0415 afii10022 -!C6 U+0416 afii10024 -!C7 U+0417 afii10025 -!C8 U+0418 afii10026 -!C9 U+0419 afii10027 -!CA U+041A afii10028 -!CB U+041B afii10029 -!CC U+041C afii10030 -!CD U+041D afii10031 -!CE U+041E afii10032 -!CF U+041F afii10033 -!D0 U+0420 afii10034 -!D1 U+0421 afii10035 -!D2 U+0422 afii10036 -!D3 U+0423 afii10037 -!D4 U+0424 afii10038 -!D5 U+0425 afii10039 -!D6 U+0426 afii10040 -!D7 U+0427 afii10041 -!D8 U+0428 afii10042 -!D9 U+0429 afii10043 -!DA U+042A afii10044 -!DB U+042B afii10045 -!DC U+042C afii10046 -!DD U+042D afii10047 -!DE U+042E afii10048 -!DF U+042F afii10049 -!E0 U+0430 afii10065 -!E1 U+0431 afii10066 -!E2 U+0432 afii10067 -!E3 U+0433 afii10068 -!E4 U+0434 afii10069 -!E5 U+0435 afii10070 -!E6 U+0436 afii10072 -!E7 U+0437 afii10073 -!E8 U+0438 afii10074 -!E9 U+0439 afii10075 -!EA U+043A afii10076 -!EB U+043B afii10077 -!EC U+043C afii10078 -!ED U+043D afii10079 -!EE U+043E afii10080 -!EF U+043F afii10081 -!F0 U+0440 afii10082 -!F1 U+0441 afii10083 -!F2 U+0442 afii10084 -!F3 U+0443 afii10085 -!F4 U+0444 afii10086 -!F5 U+0445 afii10087 -!F6 U+0446 afii10088 -!F7 U+0447 afii10089 -!F8 U+0448 afii10090 -!F9 U+0449 afii10091 -!FA U+044A afii10092 -!FB U+044B afii10093 -!FC U+044C afii10094 -!FD U+044D afii10095 -!FE U+044E afii10096 -!FF U+044F afii10097 diff --git a/fpdf/makefont/cp1252.map b/fpdf/makefont/cp1252.map deleted file mode 100644 index dd490e5961485ea47e527508691007e31e376fe9..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1252.map +++ /dev/null @@ -1,251 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 
.notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!83 U+0192 florin -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!88 U+02C6 circumflex -!89 U+2030 perthousand -!8A U+0160 Scaron -!8B U+2039 guilsinglleft -!8C U+0152 OE -!8E U+017D Zcaron -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!98 U+02DC tilde -!99 U+2122 trademark -!9A U+0161 scaron -!9B U+203A guilsinglright -!9C U+0153 oe -!9E U+017E zcaron -!9F U+0178 Ydieresis -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+00AA ordfeminine -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+00B8 cedilla -!B9 U+00B9 onesuperior -!BA U+00BA ordmasculine -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00BF questiondown -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+00C3 Atilde -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 
U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute -!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+00CC Igrave -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+00D0 Eth -!D1 U+00D1 Ntilde -!D2 U+00D2 Ograve -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+00DD Yacute -!DE U+00DE Thorn -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+00E3 atilde -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+00EC igrave -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+00F0 eth -!F1 U+00F1 ntilde -!F2 U+00F2 ograve -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+00FD yacute -!FE U+00FE thorn -!FF U+00FF ydieresis diff --git a/fpdf/makefont/cp1253.map b/fpdf/makefont/cp1253.map deleted file mode 100644 index 4bd826fb2652c285e2d5ada788827e5d0085c31f..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1253.map +++ /dev/null @@ -1,239 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B 
braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!83 U+0192 florin -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!89 U+2030 perthousand -!8B U+2039 guilsinglleft -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!99 U+2122 trademark -!9B U+203A guilsinglright -!A0 U+00A0 space -!A1 U+0385 dieresistonos -!A2 U+0386 Alphatonos -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+2015 afii00208 -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+0384 tonos -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+0388 Epsilontonos -!B9 U+0389 Etatonos -!BA U+038A Iotatonos -!BB U+00BB guillemotright -!BC U+038C Omicrontonos -!BD U+00BD onehalf -!BE U+038E Upsilontonos -!BF U+038F Omegatonos -!C0 U+0390 iotadieresistonos -!C1 U+0391 Alpha -!C2 U+0392 Beta -!C3 U+0393 Gamma -!C4 U+0394 Delta -!C5 U+0395 Epsilon -!C6 U+0396 Zeta -!C7 U+0397 Eta -!C8 U+0398 Theta -!C9 U+0399 Iota -!CA U+039A Kappa -!CB U+039B Lambda -!CC U+039C Mu -!CD U+039D Nu -!CE U+039E Xi -!CF U+039F Omicron -!D0 U+03A0 Pi -!D1 U+03A1 Rho -!D3 U+03A3 Sigma -!D4 U+03A4 Tau -!D5 U+03A5 Upsilon -!D6 U+03A6 Phi -!D7 U+03A7 Chi -!D8 U+03A8 Psi -!D9 U+03A9 Omega -!DA U+03AA Iotadieresis -!DB U+03AB Upsilondieresis -!DC U+03AC alphatonos -!DD U+03AD epsilontonos -!DE U+03AE etatonos -!DF U+03AF iotatonos -!E0 U+03B0 upsilondieresistonos -!E1 U+03B1 alpha -!E2 U+03B2 beta -!E3 U+03B3 gamma -!E4 U+03B4 delta -!E5 U+03B5 epsilon -!E6 U+03B6 zeta -!E7 U+03B7 eta -!E8 U+03B8 theta -!E9 U+03B9 iota -!EA U+03BA kappa -!EB U+03BB lambda -!EC U+03BC mu -!ED U+03BD nu -!EE U+03BE xi -!EF U+03BF omicron -!F0 U+03C0 pi -!F1 U+03C1 rho -!F2 U+03C2 sigma1 -!F3 U+03C3 sigma -!F4 U+03C4 tau -!F5 U+03C5 upsilon -!F6 U+03C6 phi -!F7 U+03C7 chi -!F8 U+03C8 psi -!F9 U+03C9 omega -!FA U+03CA iotadieresis -!FB U+03CB upsilondieresis -!FC U+03CC omicrontonos -!FD U+03CD upsilontonos -!FE U+03CE omegatonos diff --git a/fpdf/makefont/cp1254.map b/fpdf/makefont/cp1254.map deleted file mode 100644 index 829473b28c5e53c7f89c68808151f7e45d5dc89e..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1254.map +++ /dev/null @@ -1,249 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F 
slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!83 U+0192 florin -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!88 U+02C6 circumflex -!89 U+2030 perthousand -!8A U+0160 Scaron -!8B U+2039 guilsinglleft -!8C U+0152 OE -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!98 U+02DC tilde -!99 U+2122 trademark -!9A U+0161 scaron -!9B U+203A guilsinglright -!9C U+0153 oe -!9F U+0178 Ydieresis -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+00AA ordfeminine -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+00B8 cedilla -!B9 U+00B9 onesuperior -!BA U+00BA ordmasculine -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00BF questiondown -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+00C3 Atilde -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute -!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+00CC Igrave -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+011E Gbreve -!D1 U+00D1 Ntilde -!D2 U+00D2 Ograve -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+0130 Idotaccent -!DE U+015E Scedilla -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+00E3 atilde -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+00EC igrave -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+011F gbreve -!F1 U+00F1 ntilde -!F2 U+00F2 ograve -!F3 
U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+0131 dotlessi -!FE U+015F scedilla -!FF U+00FF ydieresis diff --git a/fpdf/makefont/cp1255.map b/fpdf/makefont/cp1255.map deleted file mode 100644 index 079e10c61cd8e6360bb266cd95cca7672d3872f0..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1255.map +++ /dev/null @@ -1,233 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!83 U+0192 florin -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!88 U+02C6 circumflex -!89 U+2030 perthousand -!8B U+2039 guilsinglleft -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!98 U+02DC tilde -!99 U+2122 trademark -!9B U+203A guilsinglright -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+20AA afii57636 -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+00D7 multiply -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD sfthyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 
plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 middot -!B8 U+00B8 cedilla -!B9 U+00B9 onesuperior -!BA U+00F7 divide -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00BF questiondown -!C0 U+05B0 afii57799 -!C1 U+05B1 afii57801 -!C2 U+05B2 afii57800 -!C3 U+05B3 afii57802 -!C4 U+05B4 afii57793 -!C5 U+05B5 afii57794 -!C6 U+05B6 afii57795 -!C7 U+05B7 afii57798 -!C8 U+05B8 afii57797 -!C9 U+05B9 afii57806 -!CB U+05BB afii57796 -!CC U+05BC afii57807 -!CD U+05BD afii57839 -!CE U+05BE afii57645 -!CF U+05BF afii57841 -!D0 U+05C0 afii57842 -!D1 U+05C1 afii57804 -!D2 U+05C2 afii57803 -!D3 U+05C3 afii57658 -!D4 U+05F0 afii57716 -!D5 U+05F1 afii57717 -!D6 U+05F2 afii57718 -!D7 U+05F3 gereshhebrew -!D8 U+05F4 gershayimhebrew -!E0 U+05D0 afii57664 -!E1 U+05D1 afii57665 -!E2 U+05D2 afii57666 -!E3 U+05D3 afii57667 -!E4 U+05D4 afii57668 -!E5 U+05D5 afii57669 -!E6 U+05D6 afii57670 -!E7 U+05D7 afii57671 -!E8 U+05D8 afii57672 -!E9 U+05D9 afii57673 -!EA U+05DA afii57674 -!EB U+05DB afii57675 -!EC U+05DC afii57676 -!ED U+05DD afii57677 -!EE U+05DE afii57678 -!EF U+05DF afii57679 -!F0 U+05E0 afii57680 -!F1 U+05E1 afii57681 -!F2 U+05E2 afii57682 -!F3 U+05E3 afii57683 -!F4 U+05E4 afii57684 -!F5 U+05E5 afii57685 -!F6 U+05E6 afii57686 -!F7 U+05E7 afii57687 -!F8 U+05E8 afii57688 -!F9 U+05E9 afii57689 -!FA U+05EA afii57690 -!FD U+200E afii299 -!FE U+200F afii300 diff --git a/fpdf/makefont/cp1257.map b/fpdf/makefont/cp1257.map deleted file mode 100644 index 2f2ecfa21dabe90c8cfa15e1738f2cd3c149d2a2..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1257.map +++ /dev/null @@ -1,244 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d 
-!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!89 U+2030 perthousand -!8B U+2039 guilsinglleft -!8D U+00A8 dieresis -!8E U+02C7 caron -!8F U+00B8 cedilla -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!99 U+2122 trademark -!9B U+203A guilsinglright -!9D U+00AF macron -!9E U+02DB ogonek -!A0 U+00A0 space -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00D8 Oslash -!A9 U+00A9 copyright -!AA U+0156 Rcommaaccent -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00C6 AE -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+00F8 oslash -!B9 U+00B9 onesuperior -!BA U+0157 rcommaaccent -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00E6 ae -!C0 U+0104 Aogonek -!C1 U+012E Iogonek -!C2 U+0100 Amacron -!C3 U+0106 Cacute -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+0118 Eogonek -!C7 U+0112 Emacron -!C8 U+010C Ccaron -!C9 U+00C9 Eacute -!CA U+0179 Zacute -!CB U+0116 Edotaccent -!CC U+0122 Gcommaaccent -!CD U+0136 Kcommaaccent -!CE U+012A Imacron -!CF U+013B Lcommaaccent -!D0 U+0160 Scaron -!D1 U+0143 Nacute -!D2 U+0145 Ncommaaccent -!D3 U+00D3 Oacute -!D4 U+014C Omacron -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+0172 Uogonek -!D9 U+0141 Lslash -!DA U+015A Sacute -!DB U+016A Umacron -!DC U+00DC Udieresis -!DD U+017B Zdotaccent -!DE U+017D Zcaron -!DF U+00DF germandbls -!E0 U+0105 aogonek -!E1 U+012F iogonek -!E2 U+0101 amacron -!E3 U+0107 cacute -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+0119 eogonek -!E7 U+0113 emacron -!E8 U+010D ccaron -!E9 U+00E9 eacute -!EA U+017A zacute -!EB U+0117 edotaccent -!EC U+0123 gcommaaccent -!ED U+0137 kcommaaccent -!EE U+012B imacron -!EF U+013C lcommaaccent -!F0 U+0161 scaron -!F1 U+0144 nacute -!F2 U+0146 ncommaaccent -!F3 U+00F3 oacute -!F4 U+014D omacron -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+0173 uogonek -!F9 U+0142 lslash -!FA U+015B sacute -!FB U+016B umacron -!FC U+00FC udieresis -!FD U+017C zdotaccent -!FE U+017E zcaron -!FF U+02D9 dotaccent diff --git a/fpdf/makefont/cp1258.map b/fpdf/makefont/cp1258.map deleted file mode 100644 index fed915f7152ca24e30fb33d1922de45177d84428..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp1258.map +++ /dev/null @@ -1,247 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 
.notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!82 U+201A quotesinglbase -!83 U+0192 florin -!84 U+201E quotedblbase -!85 U+2026 ellipsis -!86 U+2020 dagger -!87 U+2021 daggerdbl -!88 U+02C6 circumflex -!89 U+2030 perthousand -!8B U+2039 guilsinglleft -!8C U+0152 OE -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!98 U+02DC tilde -!99 U+2122 trademark -!9B U+203A guilsinglright -!9C U+0153 oe -!9F U+0178 Ydieresis -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+00AA ordfeminine -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+00B8 cedilla -!B9 U+00B9 onesuperior -!BA U+00BA ordmasculine -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00BF questiondown -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+0102 Abreve -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute -!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+0300 gravecomb -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+0110 Dcroat -!D1 U+00D1 Ntilde -!D2 U+0309 hookabovecomb -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+01A0 Ohorn -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC 
U+00DC Udieresis -!DD U+01AF Uhorn -!DE U+0303 tildecomb -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+0103 abreve -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+0301 acutecomb -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+0111 dcroat -!F1 U+00F1 ntilde -!F2 U+0323 dotbelowcomb -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+01A1 ohorn -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+01B0 uhorn -!FE U+20AB dong -!FF U+00FF ydieresis diff --git a/fpdf/makefont/cp874.map b/fpdf/makefont/cp874.map deleted file mode 100644 index 1006e6b17f2a9d3cbbd8fc4fadd1c944c562cc1c..0000000000000000000000000000000000000000 --- a/fpdf/makefont/cp874.map +++ /dev/null @@ -1,225 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+20AC Euro -!85 U+2026 ellipsis -!91 U+2018 quoteleft -!92 U+2019 quoteright -!93 U+201C quotedblleft -!94 U+201D quotedblright -!95 U+2022 bullet -!96 U+2013 endash -!97 U+2014 emdash -!A0 U+00A0 space -!A1 U+0E01 kokaithai -!A2 U+0E02 khokhaithai -!A3 U+0E03 khokhuatthai -!A4 U+0E04 khokhwaithai -!A5 U+0E05 khokhonthai -!A6 U+0E06 
khorakhangthai -!A7 U+0E07 ngonguthai -!A8 U+0E08 chochanthai -!A9 U+0E09 chochingthai -!AA U+0E0A chochangthai -!AB U+0E0B sosothai -!AC U+0E0C chochoethai -!AD U+0E0D yoyingthai -!AE U+0E0E dochadathai -!AF U+0E0F topatakthai -!B0 U+0E10 thothanthai -!B1 U+0E11 thonangmonthothai -!B2 U+0E12 thophuthaothai -!B3 U+0E13 nonenthai -!B4 U+0E14 dodekthai -!B5 U+0E15 totaothai -!B6 U+0E16 thothungthai -!B7 U+0E17 thothahanthai -!B8 U+0E18 thothongthai -!B9 U+0E19 nonuthai -!BA U+0E1A bobaimaithai -!BB U+0E1B poplathai -!BC U+0E1C phophungthai -!BD U+0E1D fofathai -!BE U+0E1E phophanthai -!BF U+0E1F fofanthai -!C0 U+0E20 phosamphaothai -!C1 U+0E21 momathai -!C2 U+0E22 yoyakthai -!C3 U+0E23 roruathai -!C4 U+0E24 ruthai -!C5 U+0E25 lolingthai -!C6 U+0E26 luthai -!C7 U+0E27 wowaenthai -!C8 U+0E28 sosalathai -!C9 U+0E29 sorusithai -!CA U+0E2A sosuathai -!CB U+0E2B hohipthai -!CC U+0E2C lochulathai -!CD U+0E2D oangthai -!CE U+0E2E honokhukthai -!CF U+0E2F paiyannoithai -!D0 U+0E30 saraathai -!D1 U+0E31 maihanakatthai -!D2 U+0E32 saraaathai -!D3 U+0E33 saraamthai -!D4 U+0E34 saraithai -!D5 U+0E35 saraiithai -!D6 U+0E36 sarauethai -!D7 U+0E37 saraueethai -!D8 U+0E38 sarauthai -!D9 U+0E39 sarauuthai -!DA U+0E3A phinthuthai -!DF U+0E3F bahtthai -!E0 U+0E40 saraethai -!E1 U+0E41 saraaethai -!E2 U+0E42 saraothai -!E3 U+0E43 saraaimaimuanthai -!E4 U+0E44 saraaimaimalaithai -!E5 U+0E45 lakkhangyaothai -!E6 U+0E46 maiyamokthai -!E7 U+0E47 maitaikhuthai -!E8 U+0E48 maiekthai -!E9 U+0E49 maithothai -!EA U+0E4A maitrithai -!EB U+0E4B maichattawathai -!EC U+0E4C thanthakhatthai -!ED U+0E4D nikhahitthai -!EE U+0E4E yamakkanthai -!EF U+0E4F fongmanthai -!F0 U+0E50 zerothai -!F1 U+0E51 onethai -!F2 U+0E52 twothai -!F3 U+0E53 threethai -!F4 U+0E54 fourthai -!F5 U+0E55 fivethai -!F6 U+0E56 sixthai -!F7 U+0E57 seventhai -!F8 U+0E58 eightthai -!F9 U+0E59 ninethai -!FA U+0E5A angkhankhuthai -!FB U+0E5B khomutthai diff --git a/fpdf/makefont/iso-8859-1.map b/fpdf/makefont/iso-8859-1.map deleted file mode 100644 index 61740a38fa3faa456159466766a92581b976d565..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-1.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 
U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+00AA ordfeminine -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+00B8 cedilla -!B9 U+00B9 onesuperior -!BA U+00BA ordmasculine -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00BF questiondown -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+00C3 Atilde -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute -!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+00CC Igrave -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+00D0 Eth -!D1 U+00D1 Ntilde -!D2 U+00D2 Ograve -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+00DD Yacute -!DE U+00DE Thorn -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+00E3 atilde -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+00EC igrave -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+00F0 eth -!F1 U+00F1 ntilde -!F2 U+00F2 ograve -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+00FD yacute -!FE U+00FE thorn -!FF U+00FF ydieresis diff --git a/fpdf/makefont/iso-8859-11.map b/fpdf/makefont/iso-8859-11.map 
deleted file mode 100644 index 91688120667161d4acf8066f456d67d31a2bc0d9..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-11.map +++ /dev/null @@ -1,248 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+0E01 kokaithai -!A2 U+0E02 khokhaithai -!A3 U+0E03 khokhuatthai -!A4 U+0E04 khokhwaithai -!A5 U+0E05 khokhonthai -!A6 U+0E06 khorakhangthai -!A7 U+0E07 ngonguthai -!A8 U+0E08 chochanthai -!A9 U+0E09 chochingthai -!AA U+0E0A chochangthai -!AB U+0E0B sosothai -!AC U+0E0C chochoethai -!AD U+0E0D yoyingthai -!AE U+0E0E dochadathai -!AF U+0E0F topatakthai -!B0 U+0E10 thothanthai -!B1 U+0E11 thonangmonthothai -!B2 U+0E12 thophuthaothai -!B3 U+0E13 nonenthai -!B4 U+0E14 
dodekthai -!B5 U+0E15 totaothai -!B6 U+0E16 thothungthai -!B7 U+0E17 thothahanthai -!B8 U+0E18 thothongthai -!B9 U+0E19 nonuthai -!BA U+0E1A bobaimaithai -!BB U+0E1B poplathai -!BC U+0E1C phophungthai -!BD U+0E1D fofathai -!BE U+0E1E phophanthai -!BF U+0E1F fofanthai -!C0 U+0E20 phosamphaothai -!C1 U+0E21 momathai -!C2 U+0E22 yoyakthai -!C3 U+0E23 roruathai -!C4 U+0E24 ruthai -!C5 U+0E25 lolingthai -!C6 U+0E26 luthai -!C7 U+0E27 wowaenthai -!C8 U+0E28 sosalathai -!C9 U+0E29 sorusithai -!CA U+0E2A sosuathai -!CB U+0E2B hohipthai -!CC U+0E2C lochulathai -!CD U+0E2D oangthai -!CE U+0E2E honokhukthai -!CF U+0E2F paiyannoithai -!D0 U+0E30 saraathai -!D1 U+0E31 maihanakatthai -!D2 U+0E32 saraaathai -!D3 U+0E33 saraamthai -!D4 U+0E34 saraithai -!D5 U+0E35 saraiithai -!D6 U+0E36 sarauethai -!D7 U+0E37 saraueethai -!D8 U+0E38 sarauthai -!D9 U+0E39 sarauuthai -!DA U+0E3A phinthuthai -!DF U+0E3F bahtthai -!E0 U+0E40 saraethai -!E1 U+0E41 saraaethai -!E2 U+0E42 saraothai -!E3 U+0E43 saraaimaimuanthai -!E4 U+0E44 saraaimaimalaithai -!E5 U+0E45 lakkhangyaothai -!E6 U+0E46 maiyamokthai -!E7 U+0E47 maitaikhuthai -!E8 U+0E48 maiekthai -!E9 U+0E49 maithothai -!EA U+0E4A maitrithai -!EB U+0E4B maichattawathai -!EC U+0E4C thanthakhatthai -!ED U+0E4D nikhahitthai -!EE U+0E4E yamakkanthai -!EF U+0E4F fongmanthai -!F0 U+0E50 zerothai -!F1 U+0E51 onethai -!F2 U+0E52 twothai -!F3 U+0E53 threethai -!F4 U+0E54 fourthai -!F5 U+0E55 fivethai -!F6 U+0E56 sixthai -!F7 U+0E57 seventhai -!F8 U+0E58 eightthai -!F9 U+0E59 ninethai -!FA U+0E5A angkhankhuthai -!FB U+0E5B khomutthai diff --git a/fpdf/makefont/iso-8859-15.map b/fpdf/makefont/iso-8859-15.map deleted file mode 100644 index 6c2b5712793d7eed6fec0f72e80ee3cd2ccf79ea..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-15.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum 
-!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+20AC Euro -!A5 U+00A5 yen -!A6 U+0160 Scaron -!A7 U+00A7 section -!A8 U+0161 scaron -!A9 U+00A9 copyright -!AA U+00AA ordfeminine -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+017D Zcaron -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+017E zcaron -!B9 U+00B9 onesuperior -!BA U+00BA ordmasculine -!BB U+00BB guillemotright -!BC U+0152 OE -!BD U+0153 oe -!BE U+0178 Ydieresis -!BF U+00BF questiondown -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+00C3 Atilde -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute -!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+00CC Igrave -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+00D0 Eth -!D1 U+00D1 Ntilde -!D2 U+00D2 Ograve -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+00DD Yacute -!DE U+00DE Thorn -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+00E3 atilde -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+00EC igrave -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+00F0 eth -!F1 U+00F1 ntilde -!F2 U+00F2 ograve -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+00FD yacute -!FE U+00FE thorn -!FF U+00FF ydieresis diff --git a/fpdf/makefont/iso-8859-16.map b/fpdf/makefont/iso-8859-16.map deleted file mode 100644 index 202c8fe594186cf762126b1265d7e2f73f7f92ac..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-16.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 
U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+0104 Aogonek -!A2 U+0105 aogonek -!A3 U+0141 Lslash -!A4 U+20AC Euro -!A5 U+201E quotedblbase -!A6 U+0160 Scaron -!A7 U+00A7 section -!A8 U+0161 scaron -!A9 U+00A9 copyright -!AA U+0218 Scommaaccent -!AB U+00AB guillemotleft -!AC U+0179 Zacute -!AD U+00AD hyphen -!AE U+017A zacute -!AF U+017B Zdotaccent -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+010C Ccaron -!B3 U+0142 lslash -!B4 U+017D Zcaron -!B5 U+201D quotedblright -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+017E zcaron -!B9 U+010D ccaron -!BA U+0219 scommaaccent -!BB U+00BB guillemotright -!BC U+0152 OE -!BD U+0153 oe -!BE U+0178 Ydieresis -!BF U+017C zdotaccent -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+0102 Abreve -!C4 U+00C4 Adieresis -!C5 U+0106 Cacute -!C6 U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute 
-!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+00CC Igrave -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+0110 Dcroat -!D1 U+0143 Nacute -!D2 U+00D2 Ograve -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+0150 Ohungarumlaut -!D6 U+00D6 Odieresis -!D7 U+015A Sacute -!D8 U+0170 Uhungarumlaut -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+0118 Eogonek -!DE U+021A Tcommaaccent -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+0103 abreve -!E4 U+00E4 adieresis -!E5 U+0107 cacute -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+00EC igrave -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+0111 dcroat -!F1 U+0144 nacute -!F2 U+00F2 ograve -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+0151 ohungarumlaut -!F6 U+00F6 odieresis -!F7 U+015B sacute -!F8 U+0171 uhungarumlaut -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+0119 eogonek -!FE U+021B tcommaaccent -!FF U+00FF ydieresis diff --git a/fpdf/makefont/iso-8859-2.map b/fpdf/makefont/iso-8859-2.map deleted file mode 100644 index 65ae09f95819ca5841b87ffe81e0e9326318cd75..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-2.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C 
U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+0104 Aogonek -!A2 U+02D8 breve -!A3 U+0141 Lslash -!A4 U+00A4 currency -!A5 U+013D Lcaron -!A6 U+015A Sacute -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+0160 Scaron -!AA U+015E Scedilla -!AB U+0164 Tcaron -!AC U+0179 Zacute -!AD U+00AD hyphen -!AE U+017D Zcaron -!AF U+017B Zdotaccent -!B0 U+00B0 degree -!B1 U+0105 aogonek -!B2 U+02DB ogonek -!B3 U+0142 lslash -!B4 U+00B4 acute -!B5 U+013E lcaron -!B6 U+015B sacute -!B7 U+02C7 caron -!B8 U+00B8 cedilla -!B9 U+0161 scaron -!BA U+015F scedilla -!BB U+0165 tcaron -!BC U+017A zacute -!BD U+02DD hungarumlaut -!BE U+017E zcaron -!BF U+017C zdotaccent -!C0 U+0154 Racute -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+0102 Abreve -!C4 U+00C4 Adieresis -!C5 U+0139 Lacute -!C6 U+0106 Cacute -!C7 U+00C7 Ccedilla -!C8 U+010C Ccaron -!C9 U+00C9 Eacute -!CA U+0118 Eogonek -!CB U+00CB Edieresis -!CC U+011A Ecaron -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+010E Dcaron -!D0 U+0110 Dcroat -!D1 U+0143 Nacute -!D2 U+0147 Ncaron -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+0150 Ohungarumlaut -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+0158 Rcaron -!D9 U+016E Uring -!DA U+00DA Uacute -!DB U+0170 Uhungarumlaut -!DC U+00DC Udieresis -!DD U+00DD Yacute -!DE U+0162 Tcommaaccent -!DF U+00DF germandbls -!E0 U+0155 racute -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+0103 abreve -!E4 U+00E4 adieresis -!E5 U+013A lacute -!E6 U+0107 cacute -!E7 U+00E7 ccedilla -!E8 U+010D ccaron -!E9 U+00E9 eacute -!EA U+0119 eogonek -!EB U+00EB edieresis -!EC U+011B ecaron -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+010F dcaron -!F0 U+0111 dcroat -!F1 U+0144 nacute -!F2 U+0148 ncaron -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+0151 ohungarumlaut -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+0159 rcaron -!F9 U+016F uring -!FA U+00FA uacute -!FB U+0171 uhungarumlaut -!FC U+00FC udieresis -!FD U+00FD yacute -!FE U+0163 tcommaaccent -!FF U+02D9 dotaccent diff --git a/fpdf/makefont/iso-8859-4.map b/fpdf/makefont/iso-8859-4.map deleted file mode 100644 index a7d87bf3ef2a97e84de2aa4e1b46c4dbb9fec239..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-4.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F 
U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+0104 Aogonek -!A2 U+0138 kgreenlandic -!A3 U+0156 Rcommaaccent -!A4 U+00A4 currency -!A5 U+0128 Itilde -!A6 U+013B Lcommaaccent -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+0160 Scaron -!AA U+0112 Emacron -!AB U+0122 Gcommaaccent -!AC U+0166 Tbar -!AD U+00AD hyphen -!AE U+017D Zcaron -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+0105 aogonek -!B2 U+02DB ogonek -!B3 U+0157 rcommaaccent -!B4 U+00B4 acute -!B5 U+0129 itilde -!B6 U+013C lcommaaccent -!B7 U+02C7 caron -!B8 U+00B8 cedilla -!B9 U+0161 scaron -!BA U+0113 emacron -!BB U+0123 gcommaaccent -!BC U+0167 tbar -!BD U+014A Eng -!BE U+017E zcaron -!BF U+014B eng -!C0 U+0100 Amacron -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+00C3 Atilde -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+00C6 AE -!C7 U+012E Iogonek -!C8 U+010C Ccaron -!C9 U+00C9 Eacute -!CA U+0118 Eogonek -!CB U+00CB Edieresis -!CC U+0116 Edotaccent -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+012A Imacron -!D0 U+0110 Dcroat -!D1 U+0145 Ncommaaccent -!D2 U+014C Omacron -!D3 U+0136 Kcommaaccent -!D4 U+00D4 Ocircumflex -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+0172 Uogonek -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+0168 Utilde -!DE U+016A Umacron -!DF U+00DF 
germandbls -!E0 U+0101 amacron -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+00E3 atilde -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+012F iogonek -!E8 U+010D ccaron -!E9 U+00E9 eacute -!EA U+0119 eogonek -!EB U+00EB edieresis -!EC U+0117 edotaccent -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+012B imacron -!F0 U+0111 dcroat -!F1 U+0146 ncommaaccent -!F2 U+014D omacron -!F3 U+0137 kcommaaccent -!F4 U+00F4 ocircumflex -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+0173 uogonek -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+0169 utilde -!FE U+016B umacron -!FF U+02D9 dotaccent diff --git a/fpdf/makefont/iso-8859-5.map b/fpdf/makefont/iso-8859-5.map deleted file mode 100644 index f9cd4edcf85de8e6206ff0ad32d64356101ce723..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-5.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 
.notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+0401 afii10023 -!A2 U+0402 afii10051 -!A3 U+0403 afii10052 -!A4 U+0404 afii10053 -!A5 U+0405 afii10054 -!A6 U+0406 afii10055 -!A7 U+0407 afii10056 -!A8 U+0408 afii10057 -!A9 U+0409 afii10058 -!AA U+040A afii10059 -!AB U+040B afii10060 -!AC U+040C afii10061 -!AD U+00AD hyphen -!AE U+040E afii10062 -!AF U+040F afii10145 -!B0 U+0410 afii10017 -!B1 U+0411 afii10018 -!B2 U+0412 afii10019 -!B3 U+0413 afii10020 -!B4 U+0414 afii10021 -!B5 U+0415 afii10022 -!B6 U+0416 afii10024 -!B7 U+0417 afii10025 -!B8 U+0418 afii10026 -!B9 U+0419 afii10027 -!BA U+041A afii10028 -!BB U+041B afii10029 -!BC U+041C afii10030 -!BD U+041D afii10031 -!BE U+041E afii10032 -!BF U+041F afii10033 -!C0 U+0420 afii10034 -!C1 U+0421 afii10035 -!C2 U+0422 afii10036 -!C3 U+0423 afii10037 -!C4 U+0424 afii10038 -!C5 U+0425 afii10039 -!C6 U+0426 afii10040 -!C7 U+0427 afii10041 -!C8 U+0428 afii10042 -!C9 U+0429 afii10043 -!CA U+042A afii10044 -!CB U+042B afii10045 -!CC U+042C afii10046 -!CD U+042D afii10047 -!CE U+042E afii10048 -!CF U+042F afii10049 -!D0 U+0430 afii10065 -!D1 U+0431 afii10066 -!D2 U+0432 afii10067 -!D3 U+0433 afii10068 -!D4 U+0434 afii10069 -!D5 U+0435 afii10070 -!D6 U+0436 afii10072 -!D7 U+0437 afii10073 -!D8 U+0438 afii10074 -!D9 U+0439 afii10075 -!DA U+043A afii10076 -!DB U+043B afii10077 -!DC U+043C afii10078 -!DD U+043D afii10079 -!DE U+043E afii10080 -!DF U+043F afii10081 -!E0 U+0440 afii10082 -!E1 U+0441 afii10083 -!E2 U+0442 afii10084 -!E3 U+0443 afii10085 -!E4 U+0444 afii10086 -!E5 U+0445 afii10087 -!E6 U+0446 afii10088 -!E7 U+0447 afii10089 -!E8 U+0448 afii10090 -!E9 U+0449 afii10091 -!EA U+044A afii10092 -!EB U+044B afii10093 -!EC U+044C afii10094 -!ED U+044D afii10095 -!EE U+044E afii10096 -!EF U+044F afii10097 -!F0 U+2116 afii61352 -!F1 U+0451 afii10071 -!F2 U+0452 afii10099 -!F3 U+0453 afii10100 -!F4 U+0454 afii10101 -!F5 U+0455 afii10102 -!F6 U+0456 afii10103 -!F7 U+0457 afii10104 -!F8 U+0458 afii10105 -!F9 U+0459 afii10106 -!FA U+045A afii10107 -!FB U+045B afii10108 -!FC U+045C afii10109 -!FD U+00A7 section -!FE U+045E afii10110 -!FF U+045F afii10193 diff --git a/fpdf/makefont/iso-8859-7.map b/fpdf/makefont/iso-8859-7.map deleted file mode 100644 index e163796b1cad3004dc8f80315217c838a6df77aa..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-7.map +++ /dev/null @@ -1,250 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F 
U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+2018 quoteleft -!A2 U+2019 quoteright -!A3 U+00A3 sterling -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AB U+00AB guillemotleft -!AC U+00AC logicalnot -!AD U+00AD hyphen -!AF U+2015 afii00208 -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+0384 tonos -!B5 U+0385 dieresistonos -!B6 U+0386 Alphatonos -!B7 U+00B7 periodcentered -!B8 U+0388 Epsilontonos -!B9 U+0389 Etatonos -!BA U+038A Iotatonos -!BB U+00BB guillemotright -!BC U+038C Omicrontonos -!BD U+00BD onehalf -!BE U+038E Upsilontonos -!BF U+038F Omegatonos -!C0 U+0390 iotadieresistonos -!C1 U+0391 Alpha -!C2 U+0392 Beta -!C3 U+0393 Gamma -!C4 U+0394 Delta -!C5 U+0395 Epsilon -!C6 U+0396 Zeta -!C7 U+0397 Eta -!C8 U+0398 Theta -!C9 U+0399 Iota -!CA U+039A Kappa -!CB U+039B Lambda -!CC U+039C Mu -!CD U+039D Nu -!CE U+039E Xi -!CF U+039F Omicron -!D0 U+03A0 Pi -!D1 U+03A1 Rho -!D3 U+03A3 Sigma -!D4 U+03A4 Tau -!D5 U+03A5 Upsilon -!D6 U+03A6 Phi -!D7 U+03A7 Chi -!D8 U+03A8 Psi -!D9 U+03A9 Omega -!DA U+03AA Iotadieresis -!DB U+03AB Upsilondieresis -!DC U+03AC alphatonos -!DD U+03AD epsilontonos -!DE U+03AE etatonos -!DF U+03AF iotatonos -!E0 U+03B0 upsilondieresistonos -!E1 U+03B1 alpha -!E2 U+03B2 beta -!E3 U+03B3 gamma -!E4 U+03B4 delta -!E5 U+03B5 epsilon -!E6 U+03B6 zeta -!E7 U+03B7 eta -!E8 U+03B8 theta -!E9 U+03B9 iota -!EA U+03BA kappa -!EB U+03BB lambda -!EC U+03BC mu -!ED U+03BD nu -!EE U+03BE xi -!EF U+03BF omicron -!F0 U+03C0 pi -!F1 U+03C1 rho -!F2 U+03C2 sigma1 -!F3 U+03C3 sigma -!F4 U+03C4 tau -!F5 U+03C5 upsilon -!F6 U+03C6 phi 
-!F7 U+03C7 chi -!F8 U+03C8 psi -!F9 U+03C9 omega -!FA U+03CA iotadieresis -!FB U+03CB upsilondieresis -!FC U+03CC omicrontonos -!FD U+03CD upsilontonos -!FE U+03CE omegatonos diff --git a/fpdf/makefont/iso-8859-9.map b/fpdf/makefont/iso-8859-9.map deleted file mode 100644 index 48c123ae6f6b6bee1186517e7d6557fb2fee8055..0000000000000000000000000000000000000000 --- a/fpdf/makefont/iso-8859-9.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+0080 .notdef -!81 U+0081 .notdef -!82 U+0082 .notdef -!83 U+0083 .notdef -!84 U+0084 .notdef -!85 U+0085 .notdef -!86 U+0086 .notdef -!87 U+0087 .notdef -!88 U+0088 .notdef -!89 U+0089 .notdef -!8A U+008A .notdef -!8B U+008B .notdef -!8C U+008C .notdef -!8D U+008D .notdef -!8E U+008E .notdef -!8F U+008F .notdef -!90 U+0090 .notdef -!91 U+0091 .notdef -!92 U+0092 .notdef -!93 U+0093 .notdef -!94 U+0094 .notdef -!95 U+0095 .notdef -!96 U+0096 .notdef -!97 U+0097 .notdef -!98 U+0098 .notdef -!99 U+0099 .notdef -!9A U+009A .notdef -!9B U+009B .notdef -!9C U+009C .notdef -!9D U+009D .notdef -!9E U+009E .notdef -!9F U+009F .notdef -!A0 U+00A0 space -!A1 U+00A1 exclamdown -!A2 U+00A2 cent -!A3 U+00A3 sterling -!A4 U+00A4 currency -!A5 U+00A5 yen -!A6 U+00A6 brokenbar -!A7 U+00A7 section -!A8 U+00A8 dieresis -!A9 U+00A9 copyright -!AA U+00AA ordfeminine -!AB U+00AB guillemotleft 
-!AC U+00AC logicalnot -!AD U+00AD hyphen -!AE U+00AE registered -!AF U+00AF macron -!B0 U+00B0 degree -!B1 U+00B1 plusminus -!B2 U+00B2 twosuperior -!B3 U+00B3 threesuperior -!B4 U+00B4 acute -!B5 U+00B5 mu -!B6 U+00B6 paragraph -!B7 U+00B7 periodcentered -!B8 U+00B8 cedilla -!B9 U+00B9 onesuperior -!BA U+00BA ordmasculine -!BB U+00BB guillemotright -!BC U+00BC onequarter -!BD U+00BD onehalf -!BE U+00BE threequarters -!BF U+00BF questiondown -!C0 U+00C0 Agrave -!C1 U+00C1 Aacute -!C2 U+00C2 Acircumflex -!C3 U+00C3 Atilde -!C4 U+00C4 Adieresis -!C5 U+00C5 Aring -!C6 U+00C6 AE -!C7 U+00C7 Ccedilla -!C8 U+00C8 Egrave -!C9 U+00C9 Eacute -!CA U+00CA Ecircumflex -!CB U+00CB Edieresis -!CC U+00CC Igrave -!CD U+00CD Iacute -!CE U+00CE Icircumflex -!CF U+00CF Idieresis -!D0 U+011E Gbreve -!D1 U+00D1 Ntilde -!D2 U+00D2 Ograve -!D3 U+00D3 Oacute -!D4 U+00D4 Ocircumflex -!D5 U+00D5 Otilde -!D6 U+00D6 Odieresis -!D7 U+00D7 multiply -!D8 U+00D8 Oslash -!D9 U+00D9 Ugrave -!DA U+00DA Uacute -!DB U+00DB Ucircumflex -!DC U+00DC Udieresis -!DD U+0130 Idotaccent -!DE U+015E Scedilla -!DF U+00DF germandbls -!E0 U+00E0 agrave -!E1 U+00E1 aacute -!E2 U+00E2 acircumflex -!E3 U+00E3 atilde -!E4 U+00E4 adieresis -!E5 U+00E5 aring -!E6 U+00E6 ae -!E7 U+00E7 ccedilla -!E8 U+00E8 egrave -!E9 U+00E9 eacute -!EA U+00EA ecircumflex -!EB U+00EB edieresis -!EC U+00EC igrave -!ED U+00ED iacute -!EE U+00EE icircumflex -!EF U+00EF idieresis -!F0 U+011F gbreve -!F1 U+00F1 ntilde -!F2 U+00F2 ograve -!F3 U+00F3 oacute -!F4 U+00F4 ocircumflex -!F5 U+00F5 otilde -!F6 U+00F6 odieresis -!F7 U+00F7 divide -!F8 U+00F8 oslash -!F9 U+00F9 ugrave -!FA U+00FA uacute -!FB U+00FB ucircumflex -!FC U+00FC udieresis -!FD U+0131 dotlessi -!FE U+015F scedilla -!FF U+00FF ydieresis diff --git a/fpdf/makefont/koi8-r.map b/fpdf/makefont/koi8-r.map deleted file mode 100644 index 6ad5d05d0dacf74138044384c23f319f830482ae..0000000000000000000000000000000000000000 --- a/fpdf/makefont/koi8-r.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 
U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+2500 SF100000 -!81 U+2502 SF110000 -!82 U+250C SF010000 -!83 U+2510 SF030000 -!84 U+2514 SF020000 -!85 U+2518 SF040000 -!86 U+251C SF080000 -!87 U+2524 SF090000 -!88 U+252C SF060000 -!89 U+2534 SF070000 -!8A U+253C SF050000 -!8B U+2580 upblock -!8C U+2584 dnblock -!8D U+2588 block -!8E U+258C lfblock -!8F U+2590 rtblock -!90 U+2591 ltshade -!91 U+2592 shade -!92 U+2593 dkshade -!93 U+2320 integraltp -!94 U+25A0 filledbox -!95 U+2219 periodcentered -!96 U+221A radical -!97 U+2248 approxequal -!98 U+2264 lessequal -!99 U+2265 greaterequal -!9A U+00A0 space -!9B U+2321 integralbt -!9C U+00B0 degree -!9D U+00B2 twosuperior -!9E U+00B7 periodcentered -!9F U+00F7 divide -!A0 U+2550 SF430000 -!A1 U+2551 SF240000 -!A2 U+2552 SF510000 -!A3 U+0451 afii10071 -!A4 U+2553 SF520000 -!A5 U+2554 SF390000 -!A6 U+2555 SF220000 -!A7 U+2556 SF210000 -!A8 U+2557 SF250000 -!A9 U+2558 SF500000 -!AA U+2559 SF490000 -!AB U+255A SF380000 -!AC U+255B SF280000 -!AD U+255C SF270000 -!AE U+255D SF260000 -!AF U+255E SF360000 -!B0 U+255F SF370000 -!B1 U+2560 SF420000 -!B2 U+2561 SF190000 -!B3 U+0401 afii10023 -!B4 U+2562 SF200000 -!B5 U+2563 SF230000 -!B6 U+2564 SF470000 -!B7 U+2565 SF480000 -!B8 U+2566 SF410000 -!B9 U+2567 SF450000 -!BA U+2568 SF460000 -!BB U+2569 SF400000 -!BC U+256A SF540000 -!BD U+256B SF530000 -!BE U+256C SF440000 -!BF U+00A9 copyright -!C0 U+044E afii10096 -!C1 U+0430 afii10065 -!C2 U+0431 afii10066 -!C3 U+0446 afii10088 -!C4 U+0434 afii10069 -!C5 U+0435 afii10070 -!C6 U+0444 afii10086 -!C7 U+0433 afii10068 -!C8 U+0445 afii10087 -!C9 U+0438 afii10074 -!CA U+0439 afii10075 -!CB U+043A afii10076 -!CC U+043B afii10077 -!CD U+043C afii10078 -!CE U+043D afii10079 -!CF U+043E afii10080 -!D0 U+043F afii10081 -!D1 U+044F afii10097 -!D2 U+0440 afii10082 -!D3 U+0441 afii10083 -!D4 U+0442 afii10084 -!D5 U+0443 afii10085 -!D6 U+0436 afii10072 -!D7 U+0432 afii10067 -!D8 U+044C afii10094 -!D9 U+044B afii10093 -!DA U+0437 afii10073 -!DB U+0448 afii10090 -!DC U+044D afii10095 -!DD U+0449 afii10091 -!DE U+0447 afii10089 -!DF U+044A afii10092 -!E0 U+042E afii10048 -!E1 U+0410 afii10017 -!E2 U+0411 afii10018 -!E3 U+0426 afii10040 -!E4 U+0414 afii10021 -!E5 U+0415 afii10022 -!E6 U+0424 afii10038 -!E7 U+0413 afii10020 -!E8 U+0425 afii10039 -!E9 U+0418 afii10026 -!EA U+0419 afii10027 -!EB U+041A afii10028 -!EC U+041B afii10029 -!ED U+041C afii10030 -!EE U+041D afii10031 -!EF U+041E afii10032 -!F0 U+041F afii10033 -!F1 U+042F afii10049 -!F2 U+0420 afii10034 -!F3 U+0421 afii10035 -!F4 U+0422 afii10036 -!F5 U+0423 afii10037 -!F6 U+0416 afii10024 -!F7 U+0412 afii10019 -!F8 U+042C afii10046 -!F9 U+042B afii10045 -!FA U+0417 afii10025 -!FB U+0428 afii10042 -!FC U+042D afii10047 -!FD U+0429 afii10043 -!FE U+0427 afii10041 -!FF U+042A afii10044 diff --git a/fpdf/makefont/koi8-u.map b/fpdf/makefont/koi8-u.map deleted file mode 100644 index 
40a7e4fd7e52a0433e42b5502cf4d9a23cf11e2e..0000000000000000000000000000000000000000 --- a/fpdf/makefont/koi8-u.map +++ /dev/null @@ -1,256 +0,0 @@ -!00 U+0000 .notdef -!01 U+0001 .notdef -!02 U+0002 .notdef -!03 U+0003 .notdef -!04 U+0004 .notdef -!05 U+0005 .notdef -!06 U+0006 .notdef -!07 U+0007 .notdef -!08 U+0008 .notdef -!09 U+0009 .notdef -!0A U+000A .notdef -!0B U+000B .notdef -!0C U+000C .notdef -!0D U+000D .notdef -!0E U+000E .notdef -!0F U+000F .notdef -!10 U+0010 .notdef -!11 U+0011 .notdef -!12 U+0012 .notdef -!13 U+0013 .notdef -!14 U+0014 .notdef -!15 U+0015 .notdef -!16 U+0016 .notdef -!17 U+0017 .notdef -!18 U+0018 .notdef -!19 U+0019 .notdef -!1A U+001A .notdef -!1B U+001B .notdef -!1C U+001C .notdef -!1D U+001D .notdef -!1E U+001E .notdef -!1F U+001F .notdef -!20 U+0020 space -!21 U+0021 exclam -!22 U+0022 quotedbl -!23 U+0023 numbersign -!24 U+0024 dollar -!25 U+0025 percent -!26 U+0026 ampersand -!27 U+0027 quotesingle -!28 U+0028 parenleft -!29 U+0029 parenright -!2A U+002A asterisk -!2B U+002B plus -!2C U+002C comma -!2D U+002D hyphen -!2E U+002E period -!2F U+002F slash -!30 U+0030 zero -!31 U+0031 one -!32 U+0032 two -!33 U+0033 three -!34 U+0034 four -!35 U+0035 five -!36 U+0036 six -!37 U+0037 seven -!38 U+0038 eight -!39 U+0039 nine -!3A U+003A colon -!3B U+003B semicolon -!3C U+003C less -!3D U+003D equal -!3E U+003E greater -!3F U+003F question -!40 U+0040 at -!41 U+0041 A -!42 U+0042 B -!43 U+0043 C -!44 U+0044 D -!45 U+0045 E -!46 U+0046 F -!47 U+0047 G -!48 U+0048 H -!49 U+0049 I -!4A U+004A J -!4B U+004B K -!4C U+004C L -!4D U+004D M -!4E U+004E N -!4F U+004F O -!50 U+0050 P -!51 U+0051 Q -!52 U+0052 R -!53 U+0053 S -!54 U+0054 T -!55 U+0055 U -!56 U+0056 V -!57 U+0057 W -!58 U+0058 X -!59 U+0059 Y -!5A U+005A Z -!5B U+005B bracketleft -!5C U+005C backslash -!5D U+005D bracketright -!5E U+005E asciicircum -!5F U+005F underscore -!60 U+0060 grave -!61 U+0061 a -!62 U+0062 b -!63 U+0063 c -!64 U+0064 d -!65 U+0065 e -!66 U+0066 f -!67 U+0067 g -!68 U+0068 h -!69 U+0069 i -!6A U+006A j -!6B U+006B k -!6C U+006C l -!6D U+006D m -!6E U+006E n -!6F U+006F o -!70 U+0070 p -!71 U+0071 q -!72 U+0072 r -!73 U+0073 s -!74 U+0074 t -!75 U+0075 u -!76 U+0076 v -!77 U+0077 w -!78 U+0078 x -!79 U+0079 y -!7A U+007A z -!7B U+007B braceleft -!7C U+007C bar -!7D U+007D braceright -!7E U+007E asciitilde -!7F U+007F .notdef -!80 U+2500 SF100000 -!81 U+2502 SF110000 -!82 U+250C SF010000 -!83 U+2510 SF030000 -!84 U+2514 SF020000 -!85 U+2518 SF040000 -!86 U+251C SF080000 -!87 U+2524 SF090000 -!88 U+252C SF060000 -!89 U+2534 SF070000 -!8A U+253C SF050000 -!8B U+2580 upblock -!8C U+2584 dnblock -!8D U+2588 block -!8E U+258C lfblock -!8F U+2590 rtblock -!90 U+2591 ltshade -!91 U+2592 shade -!92 U+2593 dkshade -!93 U+2320 integraltp -!94 U+25A0 filledbox -!95 U+2022 bullet -!96 U+221A radical -!97 U+2248 approxequal -!98 U+2264 lessequal -!99 U+2265 greaterequal -!9A U+00A0 space -!9B U+2321 integralbt -!9C U+00B0 degree -!9D U+00B2 twosuperior -!9E U+00B7 periodcentered -!9F U+00F7 divide -!A0 U+2550 SF430000 -!A1 U+2551 SF240000 -!A2 U+2552 SF510000 -!A3 U+0451 afii10071 -!A4 U+0454 afii10101 -!A5 U+2554 SF390000 -!A6 U+0456 afii10103 -!A7 U+0457 afii10104 -!A8 U+2557 SF250000 -!A9 U+2558 SF500000 -!AA U+2559 SF490000 -!AB U+255A SF380000 -!AC U+255B SF280000 -!AD U+0491 afii10098 -!AE U+255D SF260000 -!AF U+255E SF360000 -!B0 U+255F SF370000 -!B1 U+2560 SF420000 -!B2 U+2561 SF190000 -!B3 U+0401 afii10023 -!B4 U+0404 afii10053 -!B5 U+2563 SF230000 -!B6 U+0406 afii10055 -!B7 U+0407 
afii10056 -!B8 U+2566 SF410000 -!B9 U+2567 SF450000 -!BA U+2568 SF460000 -!BB U+2569 SF400000 -!BC U+256A SF540000 -!BD U+0490 afii10050 -!BE U+256C SF440000 -!BF U+00A9 copyright -!C0 U+044E afii10096 -!C1 U+0430 afii10065 -!C2 U+0431 afii10066 -!C3 U+0446 afii10088 -!C4 U+0434 afii10069 -!C5 U+0435 afii10070 -!C6 U+0444 afii10086 -!C7 U+0433 afii10068 -!C8 U+0445 afii10087 -!C9 U+0438 afii10074 -!CA U+0439 afii10075 -!CB U+043A afii10076 -!CC U+043B afii10077 -!CD U+043C afii10078 -!CE U+043D afii10079 -!CF U+043E afii10080 -!D0 U+043F afii10081 -!D1 U+044F afii10097 -!D2 U+0440 afii10082 -!D3 U+0441 afii10083 -!D4 U+0442 afii10084 -!D5 U+0443 afii10085 -!D6 U+0436 afii10072 -!D7 U+0432 afii10067 -!D8 U+044C afii10094 -!D9 U+044B afii10093 -!DA U+0437 afii10073 -!DB U+0448 afii10090 -!DC U+044D afii10095 -!DD U+0449 afii10091 -!DE U+0447 afii10089 -!DF U+044A afii10092 -!E0 U+042E afii10048 -!E1 U+0410 afii10017 -!E2 U+0411 afii10018 -!E3 U+0426 afii10040 -!E4 U+0414 afii10021 -!E5 U+0415 afii10022 -!E6 U+0424 afii10038 -!E7 U+0413 afii10020 -!E8 U+0425 afii10039 -!E9 U+0418 afii10026 -!EA U+0419 afii10027 -!EB U+041A afii10028 -!EC U+041B afii10029 -!ED U+041C afii10030 -!EE U+041D afii10031 -!EF U+041E afii10032 -!F0 U+041F afii10033 -!F1 U+042F afii10049 -!F2 U+0420 afii10034 -!F3 U+0421 afii10035 -!F4 U+0422 afii10036 -!F5 U+0423 afii10037 -!F6 U+0416 afii10024 -!F7 U+0412 afii10019 -!F8 U+042C afii10046 -!F9 U+042B afii10045 -!FA U+0417 afii10025 -!FB U+0428 afii10042 -!FC U+042D afii10047 -!FD U+0429 afii10043 -!FE U+0427 afii10041 -!FF U+042A afii10044 diff --git a/fpdf/makefont/makefont.php b/fpdf/makefont/makefont.php deleted file mode 100644 index fbe8dcf6f217fee688ebd576af6452a8fb8b1e99..0000000000000000000000000000000000000000 --- a/fpdf/makefont/makefont.php +++ /dev/null @@ -1,447 +0,0 @@ -<?php -/******************************************************************************* -* Utility to generate font definition files * -* * -* Version: 1.31 * -* Date: 2019-12-07 * -* Author: Olivier PLATHEY * -*******************************************************************************/ - -require('ttfparser.php'); - -function Message($txt, $severity='') -{ - if(PHP_SAPI=='cli') - { - if($severity) - echo "$severity: "; - echo "$txt\n"; - } - else - { - if($severity) - echo "<b>$severity</b>: "; - echo "$txt<br>"; - } -} - -function Notice($txt) -{ - Message($txt, 'Notice'); -} - -function Warning($txt) -{ - Message($txt, 'Warning'); -} - -function Error($txt) -{ - Message($txt, 'Error'); - exit; -} - -function LoadMap($enc) -{ - $file = dirname(__FILE__).'/'.strtolower($enc).'.map'; - $a = file($file); - if(empty($a)) - Error('Encoding not found: '.$enc); - $map = array_fill(0, 256, array('uv'=>-1, 'name'=>'.notdef')); - foreach($a as $line) - { - $e = explode(' ', rtrim($line)); - $c = hexdec(substr($e[0],1)); - $uv = hexdec(substr($e[1],2)); - $name = $e[2]; - $map[$c] = array('uv'=>$uv, 'name'=>$name); - } - return $map; -} - -function GetInfoFromTrueType($file, $embed, $subset, $map) -{ - // Return information from a TrueType font - try - { - $ttf = new TTFParser($file); - $ttf->Parse(); - } - catch(Exception $e) - { - Error($e->getMessage()); - } - if($embed) - { - if(!$ttf->embeddable) - Error('Font license does not allow embedding'); - if($subset) - { - $chars = array(); - foreach($map as $v) - { - if($v['name']!='.notdef') - $chars[] = $v['uv']; - } - $ttf->Subset($chars); - $info['Data'] = $ttf->Build(); - } - else - $info['Data'] = file_get_contents($file); - 
$info['OriginalSize'] = strlen($info['Data']); - } - $k = 1000/$ttf->unitsPerEm; - $info['FontName'] = $ttf->postScriptName; - $info['Bold'] = $ttf->bold; - $info['ItalicAngle'] = $ttf->italicAngle; - $info['IsFixedPitch'] = $ttf->isFixedPitch; - $info['Ascender'] = round($k*$ttf->typoAscender); - $info['Descender'] = round($k*$ttf->typoDescender); - $info['UnderlineThickness'] = round($k*$ttf->underlineThickness); - $info['UnderlinePosition'] = round($k*$ttf->underlinePosition); - $info['FontBBox'] = array(round($k*$ttf->xMin), round($k*$ttf->yMin), round($k*$ttf->xMax), round($k*$ttf->yMax)); - $info['CapHeight'] = round($k*$ttf->capHeight); - $info['MissingWidth'] = round($k*$ttf->glyphs[0]['w']); - $widths = array_fill(0, 256, $info['MissingWidth']); - foreach($map as $c=>$v) - { - if($v['name']!='.notdef') - { - if(isset($ttf->chars[$v['uv']])) - { - $id = $ttf->chars[$v['uv']]; - $w = $ttf->glyphs[$id]['w']; - $widths[$c] = round($k*$w); - } - else - Warning('Character '.$v['name'].' is missing'); - } - } - $info['Widths'] = $widths; - return $info; -} - -function GetInfoFromType1($file, $embed, $map) -{ - // Return information from a Type1 font - if($embed) - { - $f = fopen($file, 'rb'); - if(!$f) - Error('Can\'t open font file'); - // Read first segment - $a = unpack('Cmarker/Ctype/Vsize', fread($f,6)); - if($a['marker']!=128) - Error('Font file is not a valid binary Type1'); - $size1 = $a['size']; - $data = fread($f, $size1); - // Read second segment - $a = unpack('Cmarker/Ctype/Vsize', fread($f,6)); - if($a['marker']!=128) - Error('Font file is not a valid binary Type1'); - $size2 = $a['size']; - $data .= fread($f, $size2); - fclose($f); - $info['Data'] = $data; - $info['Size1'] = $size1; - $info['Size2'] = $size2; - } - - $afm = substr($file, 0, -3).'afm'; - if(!file_exists($afm)) - Error('AFM font file not found: '.$afm); - $a = file($afm); - if(empty($a)) - Error('AFM file empty or not readable'); - foreach($a as $line) - { - $e = explode(' ', rtrim($line)); - if(count($e)<2) - continue; - $entry = $e[0]; - if($entry=='C') - { - $w = $e[4]; - $name = $e[7]; - $cw[$name] = $w; - } - elseif($entry=='FontName') - $info['FontName'] = $e[1]; - elseif($entry=='Weight') - $info['Weight'] = $e[1]; - elseif($entry=='ItalicAngle') - $info['ItalicAngle'] = (int)$e[1]; - elseif($entry=='Ascender') - $info['Ascender'] = (int)$e[1]; - elseif($entry=='Descender') - $info['Descender'] = (int)$e[1]; - elseif($entry=='UnderlineThickness') - $info['UnderlineThickness'] = (int)$e[1]; - elseif($entry=='UnderlinePosition') - $info['UnderlinePosition'] = (int)$e[1]; - elseif($entry=='IsFixedPitch') - $info['IsFixedPitch'] = ($e[1]=='true'); - elseif($entry=='FontBBox') - $info['FontBBox'] = array((int)$e[1], (int)$e[2], (int)$e[3], (int)$e[4]); - elseif($entry=='CapHeight') - $info['CapHeight'] = (int)$e[1]; - elseif($entry=='StdVW') - $info['StdVW'] = (int)$e[1]; - } - - if(!isset($info['FontName'])) - Error('FontName missing in AFM file'); - if(!isset($info['Ascender'])) - $info['Ascender'] = $info['FontBBox'][3]; - if(!isset($info['Descender'])) - $info['Descender'] = $info['FontBBox'][1]; - $info['Bold'] = isset($info['Weight']) && preg_match('/bold|black/i', $info['Weight']); - if(isset($cw['.notdef'])) - $info['MissingWidth'] = $cw['.notdef']; - else - $info['MissingWidth'] = 0; - $widths = array_fill(0, 256, $info['MissingWidth']); - foreach($map as $c=>$v) - { - if($v['name']!='.notdef') - { - if(isset($cw[$v['name']])) - $widths[$c] = $cw[$v['name']]; - else - Warning('Character 
'.$v['name'].' is missing'); - } - } - $info['Widths'] = $widths; - return $info; -} - -function MakeFontDescriptor($info) -{ - // Ascent - $fd = "array('Ascent'=>".$info['Ascender']; - // Descent - $fd .= ",'Descent'=>".$info['Descender']; - // CapHeight - if(!empty($info['CapHeight'])) - $fd .= ",'CapHeight'=>".$info['CapHeight']; - else - $fd .= ",'CapHeight'=>".$info['Ascender']; - // Flags - $flags = 0; - if($info['IsFixedPitch']) - $flags += 1<<0; - $flags += 1<<5; - if($info['ItalicAngle']!=0) - $flags += 1<<6; - $fd .= ",'Flags'=>".$flags; - // FontBBox - $fbb = $info['FontBBox']; - $fd .= ",'FontBBox'=>'[".$fbb[0].' '.$fbb[1].' '.$fbb[2].' '.$fbb[3]."]'"; - // ItalicAngle - $fd .= ",'ItalicAngle'=>".$info['ItalicAngle']; - // StemV - if(isset($info['StdVW'])) - $stemv = $info['StdVW']; - elseif($info['Bold']) - $stemv = 120; - else - $stemv = 70; - $fd .= ",'StemV'=>".$stemv; - // MissingWidth - $fd .= ",'MissingWidth'=>".$info['MissingWidth'].')'; - return $fd; -} - -function MakeWidthArray($widths) -{ - $s = "array(\n\t"; - for($c=0;$c<=255;$c++) - { - if(chr($c)=="'") - $s .= "'\\''"; - elseif(chr($c)=="\\") - $s .= "'\\\\'"; - elseif($c>=32 && $c<=126) - $s .= "'".chr($c)."'"; - else - $s .= "chr($c)"; - $s .= '=>'.$widths[$c]; - if($c<255) - $s .= ','; - if(($c+1)%22==0) - $s .= "\n\t"; - } - $s .= ')'; - return $s; -} - -function MakeFontEncoding($map) -{ - // Build differences from reference encoding - $ref = LoadMap('cp1252'); - $s = ''; - $last = 0; - for($c=32;$c<=255;$c++) - { - if($map[$c]['name']!=$ref[$c]['name']) - { - if($c!=$last+1) - $s .= $c.' '; - $last = $c; - $s .= '/'.$map[$c]['name'].' '; - } - } - return rtrim($s); -} - -function MakeUnicodeArray($map) -{ - // Build mapping to Unicode values - $ranges = array(); - foreach($map as $c=>$v) - { - $uv = $v['uv']; - if($uv!=-1) - { - if(isset($range)) - { - if($c==$range[1]+1 && $uv==$range[3]+1) - { - $range[1]++; - $range[3]++; - } - else - { - $ranges[] = $range; - $range = array($c, $c, $uv, $uv); - } - } - else - $range = array($c, $c, $uv, $uv); - } - } - $ranges[] = $range; - - foreach($ranges as $range) - { - if(isset($s)) - $s .= ','; - else - $s = 'array('; - $s .= $range[0].'=>'; - $nb = $range[1]-$range[0]+1; - if($nb>1) - $s .= 'array('.$range[2].','.$nb.')'; - else - $s .= $range[2]; - } - $s .= ')'; - return $s; -} - -function SaveToFile($file, $s, $mode) -{ - $f = fopen($file, 'w'.$mode); - if(!$f) - Error('Can\'t write to file '.$file); - fwrite($f, $s); - fclose($f); -} - -function MakeDefinitionFile($file, $type, $enc, $embed, $subset, $map, $info) -{ - $s = "<?php\n"; - $s .= '$type = \''.$type."';\n"; - $s .= '$name = \''.$info['FontName']."';\n"; - $s .= '$desc = '.MakeFontDescriptor($info).";\n"; - $s .= '$up = '.$info['UnderlinePosition'].";\n"; - $s .= '$ut = '.$info['UnderlineThickness'].";\n"; - $s .= '$cw = '.MakeWidthArray($info['Widths']).";\n"; - $s .= '$enc = \''.$enc."';\n"; - $diff = MakeFontEncoding($map); - if($diff) - $s .= '$diff = \''.$diff."';\n"; - $s .= '$uv = '.MakeUnicodeArray($map).";\n"; - if($embed) - { - $s .= '$file = \''.$info['File']."';\n"; - if($type=='Type1') - { - $s .= '$size1 = '.$info['Size1'].";\n"; - $s .= '$size2 = '.$info['Size2'].";\n"; - } - else - { - $s .= '$originalsize = '.$info['OriginalSize'].";\n"; - if($subset) - $s .= "\$subsetted = true;\n"; - } - } - $s .= "?>\n"; - SaveToFile($file, $s, 't'); -} - -function MakeFont($fontfile, $enc='cp1252', $embed=true, $subset=true) -{ - // Generate a font definition file - if(!file_exists($fontfile)) 
- Error('Font file not found: '.$fontfile); - $ext = strtolower(substr($fontfile,-3)); - if($ext=='ttf' || $ext=='otf') - $type = 'TrueType'; - elseif($ext=='pfb') - $type = 'Type1'; - else - Error('Unrecognized font file extension: '.$ext); - - $map = LoadMap($enc); - - if($type=='TrueType') - $info = GetInfoFromTrueType($fontfile, $embed, $subset, $map); - else - $info = GetInfoFromType1($fontfile, $embed, $map); - - $basename = substr(basename($fontfile), 0, -4); - if($embed) - { - if(function_exists('gzcompress')) - { - $file = $basename.'.z'; - SaveToFile($file, gzcompress($info['Data']), 'b'); - $info['File'] = $file; - Message('Font file compressed: '.$file); - } - else - { - $info['File'] = basename($fontfile); - $subset = false; - Notice('Font file could not be compressed (zlib extension not available)'); - } - } - - MakeDefinitionFile($basename.'.php', $type, $enc, $embed, $subset, $map, $info); - Message('Font definition file generated: '.$basename.'.php'); -} - -if(PHP_SAPI=='cli') -{ - // Command-line interface - ini_set('log_errors', '0'); - if($argc==1) - die("Usage: php makefont.php fontfile [encoding] [embed] [subset]\n"); - $fontfile = $argv[1]; - if($argc>=3) - $enc = $argv[2]; - else - $enc = 'cp1252'; - if($argc>=4) - $embed = ($argv[3]=='true' || $argv[3]=='1'); - else - $embed = true; - if($argc>=5) - $subset = ($argv[4]=='true' || $argv[4]=='1'); - else - $subset = true; - MakeFont($fontfile, $enc, $embed, $subset); -} -?> diff --git a/fpdf/makefont/ttfparser.php b/fpdf/makefont/ttfparser.php deleted file mode 100644 index b5acf29b14f57bca4330725c7c6b649f08fde5d9..0000000000000000000000000000000000000000 --- a/fpdf/makefont/ttfparser.php +++ /dev/null @@ -1,714 +0,0 @@ -<?php -/******************************************************************************* -* Class to parse and subset TrueType fonts * -* * -* Version: 1.11 * -* Date: 2021-04-18 * -* Author: Olivier PLATHEY * -*******************************************************************************/ - -class TTFParser -{ - protected $f; - protected $tables; - protected $numberOfHMetrics; - protected $numGlyphs; - protected $glyphNames; - protected $indexToLocFormat; - protected $subsettedChars; - protected $subsettedGlyphs; - public $chars; - public $glyphs; - public $unitsPerEm; - public $xMin, $yMin, $xMax, $yMax; - public $postScriptName; - public $embeddable; - public $bold; - public $typoAscender; - public $typoDescender; - public $capHeight; - public $italicAngle; - public $underlinePosition; - public $underlineThickness; - public $isFixedPitch; - - function __construct($file) - { - $this->f = fopen($file, 'rb'); - if(!$this->f) - $this->Error('Can\'t open file: '.$file); - } - - function __destruct() - { - if(is_resource($this->f)) - fclose($this->f); - } - - function Parse() - { - $this->ParseOffsetTable(); - $this->ParseHead(); - $this->ParseHhea(); - $this->ParseMaxp(); - $this->ParseHmtx(); - $this->ParseLoca(); - $this->ParseGlyf(); - $this->ParseCmap(); - $this->ParseName(); - $this->ParseOS2(); - $this->ParsePost(); - } - - function ParseOffsetTable() - { - $version = $this->Read(4); - if($version=='OTTO') - $this->Error('OpenType fonts based on PostScript outlines are not supported'); - if($version!="\x00\x01\x00\x00") - $this->Error('Unrecognized file format'); - $numTables = $this->ReadUShort(); - $this->Skip(3*2); // searchRange, entrySelector, rangeShift - $this->tables = array(); - for($i=0;$i<$numTables;$i++) - { - $tag = $this->Read(4); - $checkSum = $this->Read(4); - $offset = 
$this->ReadULong(); - $length = $this->ReadULong(); - $this->tables[$tag] = array('offset'=>$offset, 'length'=>$length, 'checkSum'=>$checkSum); - } - } - - function ParseHead() - { - $this->Seek('head'); - $this->Skip(3*4); // version, fontRevision, checkSumAdjustment - $magicNumber = $this->ReadULong(); - if($magicNumber!=0x5F0F3CF5) - $this->Error('Incorrect magic number'); - $this->Skip(2); // flags - $this->unitsPerEm = $this->ReadUShort(); - $this->Skip(2*8); // created, modified - $this->xMin = $this->ReadShort(); - $this->yMin = $this->ReadShort(); - $this->xMax = $this->ReadShort(); - $this->yMax = $this->ReadShort(); - $this->Skip(3*2); // macStyle, lowestRecPPEM, fontDirectionHint - $this->indexToLocFormat = $this->ReadShort(); - } - - function ParseHhea() - { - $this->Seek('hhea'); - $this->Skip(4+15*2); - $this->numberOfHMetrics = $this->ReadUShort(); - } - - function ParseMaxp() - { - $this->Seek('maxp'); - $this->Skip(4); - $this->numGlyphs = $this->ReadUShort(); - } - - function ParseHmtx() - { - $this->Seek('hmtx'); - $this->glyphs = array(); - for($i=0;$i<$this->numberOfHMetrics;$i++) - { - $advanceWidth = $this->ReadUShort(); - $lsb = $this->ReadShort(); - $this->glyphs[$i] = array('w'=>$advanceWidth, 'lsb'=>$lsb); - } - for($i=$this->numberOfHMetrics;$i<$this->numGlyphs;$i++) - { - $lsb = $this->ReadShort(); - $this->glyphs[$i] = array('w'=>$advanceWidth, 'lsb'=>$lsb); - } - } - - function ParseLoca() - { - $this->Seek('loca'); - $offsets = array(); - if($this->indexToLocFormat==0) - { - // Short format - for($i=0;$i<=$this->numGlyphs;$i++) - $offsets[] = 2*$this->ReadUShort(); - } - else - { - // Long format - for($i=0;$i<=$this->numGlyphs;$i++) - $offsets[] = $this->ReadULong(); - } - for($i=0;$i<$this->numGlyphs;$i++) - { - $this->glyphs[$i]['offset'] = $offsets[$i]; - $this->glyphs[$i]['length'] = $offsets[$i+1] - $offsets[$i]; - } - } - - function ParseGlyf() - { - $tableOffset = $this->tables['glyf']['offset']; - foreach($this->glyphs as &$glyph) - { - if($glyph['length']>0) - { - fseek($this->f, $tableOffset+$glyph['offset'], SEEK_SET); - if($this->ReadShort()<0) - { - // Composite glyph - $this->Skip(4*2); // xMin, yMin, xMax, yMax - $offset = 5*2; - $a = array(); - do - { - $flags = $this->ReadUShort(); - $index = $this->ReadUShort(); - $a[$offset+2] = $index; - if($flags & 1) // ARG_1_AND_2_ARE_WORDS - $skip = 2*2; - else - $skip = 2; - if($flags & 8) // WE_HAVE_A_SCALE - $skip += 2; - elseif($flags & 64) // WE_HAVE_AN_X_AND_Y_SCALE - $skip += 2*2; - elseif($flags & 128) // WE_HAVE_A_TWO_BY_TWO - $skip += 4*2; - $this->Skip($skip); - $offset += 2*2 + $skip; - } - while($flags & 32); // MORE_COMPONENTS - $glyph['components'] = $a; - } - } - } - } - - function ParseCmap() - { - $this->Seek('cmap'); - $this->Skip(2); // version - $numTables = $this->ReadUShort(); - $offset31 = 0; - for($i=0;$i<$numTables;$i++) - { - $platformID = $this->ReadUShort(); - $encodingID = $this->ReadUShort(); - $offset = $this->ReadULong(); - if($platformID==3 && $encodingID==1) - $offset31 = $offset; - } - if($offset31==0) - $this->Error('No Unicode encoding found'); - - $startCount = array(); - $endCount = array(); - $idDelta = array(); - $idRangeOffset = array(); - $this->chars = array(); - fseek($this->f, $this->tables['cmap']['offset']+$offset31, SEEK_SET); - $format = $this->ReadUShort(); - if($format!=4) - $this->Error('Unexpected subtable format: '.$format); - $this->Skip(2*2); // length, language - $segCount = $this->ReadUShort()/2; - $this->Skip(3*2); // searchRange, 
entrySelector, rangeShift - for($i=0;$i<$segCount;$i++) - $endCount[$i] = $this->ReadUShort(); - $this->Skip(2); // reservedPad - for($i=0;$i<$segCount;$i++) - $startCount[$i] = $this->ReadUShort(); - for($i=0;$i<$segCount;$i++) - $idDelta[$i] = $this->ReadShort(); - $offset = ftell($this->f); - for($i=0;$i<$segCount;$i++) - $idRangeOffset[$i] = $this->ReadUShort(); - - for($i=0;$i<$segCount;$i++) - { - $c1 = $startCount[$i]; - $c2 = $endCount[$i]; - $d = $idDelta[$i]; - $ro = $idRangeOffset[$i]; - if($ro>0) - fseek($this->f, $offset+2*$i+$ro, SEEK_SET); - for($c=$c1;$c<=$c2;$c++) - { - if($c==0xFFFF) - break; - if($ro>0) - { - $gid = $this->ReadUShort(); - if($gid>0) - $gid += $d; - } - else - $gid = $c+$d; - if($gid>=65536) - $gid -= 65536; - if($gid>0) - $this->chars[$c] = $gid; - } - } - } - - function ParseName() - { - $this->Seek('name'); - $tableOffset = $this->tables['name']['offset']; - $this->postScriptName = ''; - $this->Skip(2); // format - $count = $this->ReadUShort(); - $stringOffset = $this->ReadUShort(); - for($i=0;$i<$count;$i++) - { - $this->Skip(3*2); // platformID, encodingID, languageID - $nameID = $this->ReadUShort(); - $length = $this->ReadUShort(); - $offset = $this->ReadUShort(); - if($nameID==6) - { - // PostScript name - fseek($this->f, $tableOffset+$stringOffset+$offset, SEEK_SET); - $s = $this->Read($length); - $s = str_replace(chr(0), '', $s); - $s = preg_replace('|[ \[\](){}<>/%]|', '', $s); - $this->postScriptName = $s; - break; - } - } - if($this->postScriptName=='') - $this->Error('PostScript name not found'); - } - - function ParseOS2() - { - $this->Seek('OS/2'); - $version = $this->ReadUShort(); - $this->Skip(3*2); // xAvgCharWidth, usWeightClass, usWidthClass - $fsType = $this->ReadUShort(); - $this->embeddable = ($fsType!=2) && ($fsType & 0x200)==0; - $this->Skip(11*2+10+4*4+4); - $fsSelection = $this->ReadUShort(); - $this->bold = ($fsSelection & 32)!=0; - $this->Skip(2*2); // usFirstCharIndex, usLastCharIndex - $this->typoAscender = $this->ReadShort(); - $this->typoDescender = $this->ReadShort(); - if($version>=2) - { - $this->Skip(3*2+2*4+2); - $this->capHeight = $this->ReadShort(); - } - else - $this->capHeight = 0; - } - - function ParsePost() - { - $this->Seek('post'); - $version = $this->ReadULong(); - $this->italicAngle = $this->ReadShort(); - $this->Skip(2); // Skip decimal part - $this->underlinePosition = $this->ReadShort(); - $this->underlineThickness = $this->ReadShort(); - $this->isFixedPitch = ($this->ReadULong()!=0); - if($version==0x20000) - { - // Extract glyph names - $this->Skip(4*4); // min/max usage - $this->Skip(2); // numberOfGlyphs - $glyphNameIndex = array(); - $names = array(); - $numNames = 0; - for($i=0;$i<$this->numGlyphs;$i++) - { - $index = $this->ReadUShort(); - $glyphNameIndex[] = $index; - if($index>=258 && $index-257>$numNames) - $numNames = $index-257; - } - for($i=0;$i<$numNames;$i++) - { - $len = ord($this->Read(1)); - $names[] = $this->Read($len); - } - foreach($glyphNameIndex as $i=>$index) - { - if($index>=258) - $this->glyphs[$i]['name'] = $names[$index-258]; - else - $this->glyphs[$i]['name'] = $index; - } - $this->glyphNames = true; - } - else - $this->glyphNames = false; - } - - function Subset($chars) - { - $this->subsettedGlyphs = array(); - $this->AddGlyph(0); - $this->subsettedChars = array(); - foreach($chars as $char) - { - if(isset($this->chars[$char])) - { - $this->subsettedChars[] = $char; - $this->AddGlyph($this->chars[$char]); - } - } - } - - function AddGlyph($id) - { - 
if(!isset($this->glyphs[$id]['ssid'])) - { - $this->glyphs[$id]['ssid'] = count($this->subsettedGlyphs); - $this->subsettedGlyphs[] = $id; - if(isset($this->glyphs[$id]['components'])) - { - foreach($this->glyphs[$id]['components'] as $cid) - $this->AddGlyph($cid); - } - } - } - - function Build() - { - $this->BuildCmap(); - $this->BuildHhea(); - $this->BuildHmtx(); - $this->BuildLoca(); - $this->BuildGlyf(); - $this->BuildMaxp(); - $this->BuildPost(); - return $this->BuildFont(); - } - - function BuildCmap() - { - if(!isset($this->subsettedChars)) - return; - - // Divide charset in contiguous segments - $chars = $this->subsettedChars; - sort($chars); - $segments = array(); - $segment = array($chars[0], $chars[0]); - for($i=1;$i<count($chars);$i++) - { - if($chars[$i]>$segment[1]+1) - { - $segments[] = $segment; - $segment = array($chars[$i], $chars[$i]); - } - else - $segment[1]++; - } - $segments[] = $segment; - $segments[] = array(0xFFFF, 0xFFFF); - $segCount = count($segments); - - // Build a Format 4 subtable - $startCount = array(); - $endCount = array(); - $idDelta = array(); - $idRangeOffset = array(); - $glyphIdArray = ''; - for($i=0;$i<$segCount;$i++) - { - list($start, $end) = $segments[$i]; - $startCount[] = $start; - $endCount[] = $end; - if($start!=$end) - { - // Segment with multiple chars - $idDelta[] = 0; - $idRangeOffset[] = strlen($glyphIdArray) + ($segCount-$i)*2; - for($c=$start;$c<=$end;$c++) - { - $ssid = $this->glyphs[$this->chars[$c]]['ssid']; - $glyphIdArray .= pack('n', $ssid); - } - } - else - { - // Segment with a single char - if($start<0xFFFF) - $ssid = $this->glyphs[$this->chars[$start]]['ssid']; - else - $ssid = 0; - $idDelta[] = $ssid - $start; - $idRangeOffset[] = 0; - } - } - $entrySelector = 0; - $n = $segCount; - while($n!=1) - { - $n = $n>>1; - $entrySelector++; - } - $searchRange = (1<<$entrySelector)*2; - $rangeShift = 2*$segCount - $searchRange; - $cmap = pack('nnnn', 2*$segCount, $searchRange, $entrySelector, $rangeShift); - foreach($endCount as $val) - $cmap .= pack('n', $val); - $cmap .= pack('n', 0); // reservedPad - foreach($startCount as $val) - $cmap .= pack('n', $val); - foreach($idDelta as $val) - $cmap .= pack('n', $val); - foreach($idRangeOffset as $val) - $cmap .= pack('n', $val); - $cmap .= $glyphIdArray; - - $data = pack('nn', 0, 1); // version, numTables - $data .= pack('nnN', 3, 1, 12); // platformID, encodingID, offset - $data .= pack('nnn', 4, 6+strlen($cmap), 0); // format, length, language - $data .= $cmap; - $this->SetTable('cmap', $data); - } - - function BuildHhea() - { - $this->LoadTable('hhea'); - $numberOfHMetrics = count($this->subsettedGlyphs); - $data = substr_replace($this->tables['hhea']['data'], pack('n',$numberOfHMetrics), 4+15*2, 2); - $this->SetTable('hhea', $data); - } - - function BuildHmtx() - { - $data = ''; - foreach($this->subsettedGlyphs as $id) - { - $glyph = $this->glyphs[$id]; - $data .= pack('nn', $glyph['w'], $glyph['lsb']); - } - $this->SetTable('hmtx', $data); - } - - function BuildLoca() - { - $data = ''; - $offset = 0; - foreach($this->subsettedGlyphs as $id) - { - if($this->indexToLocFormat==0) - $data .= pack('n', $offset/2); - else - $data .= pack('N', $offset); - $offset += $this->glyphs[$id]['length']; - } - if($this->indexToLocFormat==0) - $data .= pack('n', $offset/2); - else - $data .= pack('N', $offset); - $this->SetTable('loca', $data); - } - - function BuildGlyf() - { - $tableOffset = $this->tables['glyf']['offset']; - $data = ''; - foreach($this->subsettedGlyphs as $id) - { - $glyph = 
$this->glyphs[$id]; - fseek($this->f, $tableOffset+$glyph['offset'], SEEK_SET); - $glyph_data = $this->Read($glyph['length']); - if(isset($glyph['components'])) - { - // Composite glyph - foreach($glyph['components'] as $offset=>$cid) - { - $ssid = $this->glyphs[$cid]['ssid']; - $glyph_data = substr_replace($glyph_data, pack('n',$ssid), $offset, 2); - } - } - $data .= $glyph_data; - } - $this->SetTable('glyf', $data); - } - - function BuildMaxp() - { - $this->LoadTable('maxp'); - $numGlyphs = count($this->subsettedGlyphs); - $data = substr_replace($this->tables['maxp']['data'], pack('n',$numGlyphs), 4, 2); - $this->SetTable('maxp', $data); - } - - function BuildPost() - { - $this->Seek('post'); - if($this->glyphNames) - { - // Version 2.0 - $numberOfGlyphs = count($this->subsettedGlyphs); - $numNames = 0; - $names = ''; - $data = $this->Read(2*4+2*2+5*4); - $data .= pack('n', $numberOfGlyphs); - foreach($this->subsettedGlyphs as $id) - { - $name = $this->glyphs[$id]['name']; - if(is_string($name)) - { - $data .= pack('n', 258+$numNames); - $names .= chr(strlen($name)).$name; - $numNames++; - } - else - $data .= pack('n', $name); - } - $data .= $names; - } - else - { - // Version 3.0 - $this->Skip(4); - $data = "\x00\x03\x00\x00"; - $data .= $this->Read(4+2*2+5*4); - } - $this->SetTable('post', $data); - } - - function BuildFont() - { - $tags = array(); - foreach(array('cmap', 'cvt ', 'fpgm', 'glyf', 'head', 'hhea', 'hmtx', 'loca', 'maxp', 'name', 'post', 'prep') as $tag) - { - if(isset($this->tables[$tag])) - $tags[] = $tag; - } - $numTables = count($tags); - $offset = 12 + 16*$numTables; - foreach($tags as $tag) - { - if(!isset($this->tables[$tag]['data'])) - $this->LoadTable($tag); - $this->tables[$tag]['offset'] = $offset; - $offset += strlen($this->tables[$tag]['data']); - } - - // Build offset table - $entrySelector = 0; - $n = $numTables; - while($n!=1) - { - $n = $n>>1; - $entrySelector++; - } - $searchRange = 16*(1<<$entrySelector); - $rangeShift = 16*$numTables - $searchRange; - $offsetTable = pack('nnnnnn', 1, 0, $numTables, $searchRange, $entrySelector, $rangeShift); - foreach($tags as $tag) - { - $table = $this->tables[$tag]; - $offsetTable .= $tag.$table['checkSum'].pack('NN', $table['offset'], $table['length']); - } - - // Compute checkSumAdjustment (0xB1B0AFBA - font checkSum) - $s = $this->CheckSum($offsetTable); - foreach($tags as $tag) - $s .= $this->tables[$tag]['checkSum']; - $a = unpack('n2', $this->CheckSum($s)); - $high = 0xB1B0 + ($a[1]^0xFFFF); - $low = 0xAFBA + ($a[2]^0xFFFF) + 1; - $checkSumAdjustment = pack('nn', $high+($low>>16), $low); - $this->tables['head']['data'] = substr_replace($this->tables['head']['data'], $checkSumAdjustment, 8, 4); - - $font = $offsetTable; - foreach($tags as $tag) - $font .= $this->tables[$tag]['data']; - - return $font; - } - - function LoadTable($tag) - { - $this->Seek($tag); - $length = $this->tables[$tag]['length']; - $n = $length % 4; - if($n>0) - $length += 4 - $n; - $this->tables[$tag]['data'] = $this->Read($length); - } - - function SetTable($tag, $data) - { - $length = strlen($data); - $n = $length % 4; - if($n>0) - $data = str_pad($data, $length+4-$n, "\x00"); - $this->tables[$tag]['data'] = $data; - $this->tables[$tag]['length'] = $length; - $this->tables[$tag]['checkSum'] = $this->CheckSum($data); - } - - function Seek($tag) - { - if(!isset($this->tables[$tag])) - $this->Error('Table not found: '.$tag); - fseek($this->f, $this->tables[$tag]['offset'], SEEK_SET); - } - - function Skip($n) - { - fseek($this->f, $n, 
SEEK_CUR); - } - - function Read($n) - { - return $n>0 ? fread($this->f, $n) : ''; - } - - function ReadUShort() - { - $a = unpack('nn', fread($this->f,2)); - return $a['n']; - } - - function ReadShort() - { - $a = unpack('nn', fread($this->f,2)); - $v = $a['n']; - if($v>=0x8000) - $v -= 65536; - return $v; - } - - function ReadULong() - { - $a = unpack('NN', fread($this->f,4)); - return $a['N']; - } - - function CheckSum($s) - { - $n = strlen($s); - $high = 0; - $low = 0; - for($i=0;$i<$n;$i+=4) - { - $high += (ord($s[$i])<<8) + ord($s[$i+1]); - $low += (ord($s[$i+2])<<8) + ord($s[$i+3]); - } - return pack('nn', $high+($low>>16), $low); - } - - function Error($msg) - { - throw new Exception($msg); - } -} -?> diff --git a/pyteomics/__init__.py b/pyteomics/__init__.py deleted file mode 100644 index fd278bfdfaf3fe2212d01b66df913df511692aa7..0000000000000000000000000000000000000000 --- a/pyteomics/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -Copyright 2012 Anton Goloborodko, Lev Levitsky - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" -__import__('pkg_resources').declare_namespace(__name__) diff --git a/pyteomics/_schema_defaults.py b/pyteomics/_schema_defaults.py deleted file mode 100644 index e31701c7d6fc0eb48838478da0831037b3d15c42..0000000000000000000000000000000000000000 --- a/pyteomics/_schema_defaults.py +++ /dev/null @@ -1,635 +0,0 @@ -_protxml_schema_defaults = {'bools': set(), - 'charlists': set(), - 'floatlists': set(), - 'floats': {('ASAPRatio', 'heavy2light_ratio_mean'), - ('ASAPRatio', 'heavy2light_ratio_standard_dev'), - ('ASAPRatio', 'ratio_mean'), - ('ASAPRatio', 'ratio_standard_dev'), - ('ASAPRatio_pvalue', 'adj_ratio_mean'), - ('ASAPRatio_pvalue', 'adj_ratio_standard_dev'), - ('ASAPRatio_pvalue', 'decimal_pvalue'), - ('ASAPRatio_pvalue', 'heavy2light_adj_ratio_mean'), - ('ASAPRatio_pvalue', 'heavy2light_adj_ratio_standard_dev'), - ('ASAPRatio_pvalue', 'pvalue'), - ('ASAP_Peak', 'heavy2light_ratio_mean'), - ('ASAP_Peak', 'heavy2light_ratio_standard_dev'), - ('ASAP_Peak', 'ratio_mean'), - ('ASAP_Peak', 'ratio_standard_dev'), - ('ASAP_Peak', 'weight'), - ('ASAP_Seq', 'heavy2light_ratio_mean'), - ('ASAP_Seq', 'heavy2light_ratio_standard_dev'), - ('ASAP_Seq', 'ratio_mean'), - ('ASAP_Seq', 'ratio_standard_dev'), - ('ASAP_Seq', 'weight'), - ('ASAP_prot_analysis_summary', 'min_peptide_probability'), - ('ASAP_prot_analysis_summary', 'min_peptide_weight'), - ('ASAP_prot_analysis_summary', 'min_protein_probability'), - ('ASAP_pvalue_analysis_summary', 'background_fitting_error'), - ('ASAP_pvalue_analysis_summary', 'background_ratio_mean'), - ('ASAP_pvalue_analysis_summary', 'background_ratio_stdev'), - ('StPeterQuant', 'SIn'), - ('StPeterQuant', 'ng'), - ('StPeterQuant_peptide', 'spectralIndex'), - ('StPeter_analysis_summary', 'FDR'), - ('StPeter_analysis_summary', 'probability'), - ('StPeter_analysis_summary', 'sampleLoad'), - ('StPeter_analysis_summary', 'tolerance'), - ('XPress_analysis_summary', 'min_peptide_probability'), - ('XPress_analysis_summary', 'min_peptide_weight'), - 
('XPress_analysis_summary', 'min_protein_probability'), - ('affected_channel', 'correction'), - ('decoy_analysis_summary', 'decoy_ratio'), - ('error_point', 'error'), - ('error_point', 'min_prob'), - ('fpkm_distribution', 'alt_pos_to_neg_ratio'), - ('fpkm_distribution', 'fpkm_lower_bound_excl'), - ('fpkm_distribution', 'fpkm_lower_bound_incl'), - ('fpkm_distribution', 'neg_freq'), - ('fpkm_distribution', 'pos_freq'), - ('fpkm_distribution', 'pos_to_neg_ratio'), - ('fragment_masses', 'mz'), - ('indistinguishable_peptide', 'calc_neutral_pep_mass'), - ('intensity', 'error'), - ('intensity', 'mz'), - ('intensity', 'ratio'), - ('libra_summary', 'mass_tolerance'), - ('libra_summary', 'min_pep_prob'), - ('libra_summary', 'min_pep_wt'), - ('libra_summary', 'min_prot_prob'), - ('ni_distribution', 'alt_pos_to_neg_ratio'), - ('ni_distribution', 'neg_freq'), - ('ni_distribution', 'ni_lower_bound_excl'), - ('ni_distribution', 'ni_lower_bound_incl'), - ('ni_distribution', 'pos_freq'), - ('ni_distribution', 'pos_to_neg_ratio'), - ('nsp_distribution', 'alt_pos_to_neg_ratio'), - ('nsp_distribution', 'neg_freq'), - ('nsp_distribution', 'nsp_lower_bound_excl'), - ('nsp_distribution', 'nsp_lower_bound_incl'), - ('nsp_distribution', 'pos_freq'), - ('nsp_distribution', 'pos_to_neg_ratio'), - ('peptide', 'calc_neutral_pep_mass'), - ('peptide', 'exp_sibling_ion_bin'), - ('peptide', 'exp_sibling_ion_instances'), - ('peptide', 'exp_tot_instances'), - ('peptide', 'fpkm_adjusted_probability'), - ('peptide', 'initial_probability'), - ('peptide', 'max_fpkm'), - ('peptide', 'n_sibling_peptides'), - ('peptide', 'ni_adjusted_probability'), - ('peptide', 'nsp_adjusted_probability'), - ('peptide', 'weight'), - ('point', 'fdr_pp'), - ('point', 'fdr_pp_decoy'), - ('point', 'logratio'), - ('point', 'model_distr'), - ('point', 'num_corr_pp'), - ('point', 'num_corr_pp_decoy'), - ('point', 'obs_distr'), - ('point', 'pp_decoy_uncert'), - ('point', 'pp_uncert'), - ('point', 'prob_cutoff'), - ('protein', 'confidence'), - ('protein', 'percent_coverage'), - ('protein', 'probability'), - ('protein_group', 'probability'), - ('protein_summary_data_filter', 'false_positive_error_rate'), - ('protein_summary_data_filter', 'min_probability'), - ('protein_summary_data_filter', 'predicted_num_correct'), - ('protein_summary_data_filter', 'predicted_num_incorrect'), - ('protein_summary_data_filter', 'sensitivity'), - ('protein_summary_header', 'initial_min_peptide_prob'), - ('protein_summary_header', 'min_peptide_probability'), - ('protein_summary_header', 'min_peptide_weight'), - ('protein_summary_header', 'num_predicted_correct_prots'), - ('protein_summary_header', 'total_no_spectrum_ids')}, - 'intlists': set(), - 'ints': {('ASAPRatio', 'ratio_number_peptides'), - ('ASAP_Peak', 'datanum'), - ('ASAP_Seq', 'datanum'), - ('ASAP_pvalue_analysis_summary', 'asap_prot_id'), - ('ASAP_pvalue_analysis_summary', 'asapratio_id'), - ('StPeterQuant_peptide', 'charge'), - ('affected_channel', 'channel'), - ('analysis_result', 'id'), - ('analysis_summary', 'id'), - ('contributing_channel', 'channel'), - ('error_point', 'num_corr'), - ('error_point', 'num_incorr'), - ('fpkm_distribution', 'bin_no'), - ('fragment_masses', 'channel'), - ('intensity', 'channel'), - ('libra_result', 'number'), - ('libra_summary', 'centroiding_preference'), - ('libra_summary', 'normalization'), - ('libra_summary', 'output_type'), - ('ni_distribution', 'bin_no'), - ('nsp_distribution', 'bin_no'), - ('peptide', 'charge'), - ('peptide', 'fpkm_bin'), - ('peptide', 
'n_enzymatic_termini'), - ('peptide', 'n_instances'), - ('peptide', 'n_sibling_peptides_bin'), - ('protein', 'n_indistinguishable_proteins'), - ('protein', 'total_number_distinct_peptides'), - ('protein', 'total_number_peptides'), - ('protein_summary_header', 'num_input_1_spectra'), - ('protein_summary_header', 'num_input_2_spectra'), - ('protein_summary_header', 'num_input_3_spectra'), - ('protein_summary_header', 'num_input_4_spectra'), - ('protein_summary_header', 'num_input_5_spectra')}, - 'lists': {'ASAP_Dta', - 'ASAP_Peak', - 'ASAP_Seq', - 'StPeterQuant_peptide', - 'affected_channel', - 'analysis_result', - 'analysis_summary', - 'contributing_channel', - 'error_point', - 'fpkm_distribution', - 'fpkm_information', - 'fragment_masses', - 'indistinguishable_peptide', - 'indistinguishable_protein', - 'intensity', - 'mod_aminoacid_mass', - 'modification_info', - 'ni_distribution', - 'ni_information', - 'nsp_distribution', - 'parameter', - 'peptide', - 'peptide_parent_protein', - 'point', - 'protein', - 'protein_group', - 'protein_summary_data_filter'}} - -_mzid_schema_defaults = {'bools': {('Enzyme', 'semiSpecific'), - ('Enzymes', 'independent'), - ('PeptideEvidence', 'isDecoy'), - ('ProteinDetectionHypothesis', 'passThreshold'), - ('SearchModification', 'fixedMod'), - ('SpectrumIdentificationItem', 'passThreshold')}, - 'charlists': {('Modification', 'residues'), - ('SearchModification', 'residues')}, - 'floatlists': {('FragmentArray', 'values')}, - 'floats': {('Modification', 'avgMassDelta'), - ('Modification', 'monoisotopicMassDelta'), - ('Residue', 'mass'), - ('SearchModification', 'massDelta'), - ('SpectrumIdentificationItem', 'calculatedMassToCharge'), - ('SpectrumIdentificationItem', 'calculatedPI'), - ('SpectrumIdentificationItem', 'experimentalMassToCharge'), - ('SubstitutionModification', 'avgMassDelta'), - ('SubstitutionModification', 'monoisotopicMassDelta')}, - 'intlists': {('IonType', 'index'), ('MassTable', 'msLevel')}, - 'ints': {('BibliographicReference', 'year'), - ('DBSequence', 'length'), - ('Enzyme', 'missedCleavages'), - ('IonType', 'charge'), - ('Modification', 'location'), - ('PeptideEvidence', 'end'), - ('PeptideEvidence', 'start'), - ('SearchDatabase', 'numDatabaseSequences'), - ('SearchDatabase', 'numResidues'), - ('SpectrumIdentificationItem', 'chargeState'), - ('SpectrumIdentificationItem', 'rank'), - ('SpectrumIdentificationList', 'numSequencesSearched'), - ('SubstitutionModification', 'location')}, - 'lists': {'Affiliation', - 'AmbiguousResidue', - 'AnalysisSoftware', - 'BibliographicReference', - 'ContactRole', - 'DBSequence', - 'Enzyme', - 'Filter', - 'FragmentArray', - 'InputSpectra', - 'InputSpectrumIdentifications', - 'IonType', - 'MassTable', - 'Measure', - 'Modification', - 'Peptide', - 'PeptideEvidence', - 'PeptideEvidenceRef', - 'PeptideHypothesis', - 'ProteinAmbiguityGroup', - 'ProteinDetectionHypothesis', - 'Residue', - 'Sample', - 'SearchDatabase', - 'SearchDatabaseRef', - 'SearchModification', - 'SourceFile', - 'SpecificityRules', - 'SpectraData', - 'SpectrumIdentification', - 'SpectrumIdentificationItem', - 'SpectrumIdentificationItemRef', - 'SpectrumIdentificationList', - 'SpectrumIdentificationProtocol', - 'SpectrumIdentificationResult', - 'SubSample', - 'SubstitutionModification', - 'TranslationTable', - 'cv', - 'cvParam'}} - -_trafoxml_schema_defaults = {'bools': set(), - 'charlists': set(), - 'floatlists': set(), - 'floats': {('Pair', 'from'), ('Pair', 'to'), ('TrafoXML', 'version')}, - 'intlists': set(), - 'ints': {('Pairs', 'count')}, - 
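# Illustrative sketch of how these *_schema_defaults tables are meant to be
# consumed (an assumption about intent, not pyteomics' actual reader code):
# each category names the (element, attribute) pairs an XML reader should
# coerce to that Python type, while 'lists' names elements that may repeat
# and must be accumulated rather than overwritten.
def convert_attribute(schema, element, attribute, value):
    key = (element, attribute)
    if key in schema['ints']:
        return int(value)
    if key in schema['floats']:
        return float(value)
    if key in schema['bools']:
        return value.lower() in ('1', 'true')
    if key in schema['floatlists']:
        return [float(x) for x in value.split()]
    return value  # fall back to the raw string

# e.g. convert_attribute(_mzid_schema_defaults, 'PeptideEvidence', 'isDecoy', 'true') -> True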
'lists': {'Pair', 'Param'}} - -_featurexml_schema_defaults = { - 'ints': {('PeptideHit', 'charge'), - # ('PeptideIdentification', 'spectrum_reference'), - ('SearchParameters', 'missed_cleavages'), - # ('UnassignedPeptideIdentification', 'spectrum_reference'), - ('featureList', 'count'), - ('quality', 'dim'), - ('position', 'dim'), - ('feature', 'charge'), - ('convexhull', 'nr'), - }, - 'floats': {('PeptideHit', 'score'), - ('PeptideIdentification', 'MZ'), - ('PeptideIdentification', 'RT'), - ('PeptideIdentification', 'significance_threshold'), - ('ProteinHit', 'coverage'), - ('ProteinHit', 'score'), - ('ProteinIdentification', 'significance_threshold'), - ('SearchParameters', 'peak_mass_tolerance'), - ('SearchParameters', 'precursor_peak_tolerance'), - ('UnassignedPeptideIdentification', 'MZ'), - ('UnassignedPeptideIdentification', 'RT'), - ('UnassignedPeptideIdentification', 'significance_threshold'), - ('featureMap', 'version'), - ('pt', 'x'), - ('pt', 'y'), - ('quality', 'quality'), - ('position', 'position'), - ('feature', 'overallquality'), - ('feature', 'intensity'), - }, - 'bools': {('PeptideIdentification', 'higher_score_better'), - ('ProteinIdentification', 'higher_score_better'), - ('SearchParameters', 'peak_mass_tolerance_ppm'), - ('SearchParameters', 'precursor_peak_tolerance_ppm'), - ('UnassignedPeptideIdentification', 'higher_score_better')}, - 'intlists': set(), - 'floatlists': set(), - 'charlists': set(), - 'lists': {'FixedModification', - 'IdentificationRun', - 'PeptideHit', - 'PeptideIdentification', - 'ProteinHit', - 'ProteinIdentification', - 'SearchParameters', - 'UnassignedPeptideIdentification', - 'UserParam', - 'VariableModification', - 'convexhull', - 'dataProcessing', - 'feature', - 'hposition', - 'hullpoint', - 'param', - 'position', - 'processingAction', - 'pt', - 'quality'}} - -_tandem_schema_defaults = {'ints': { - ('group', 'z'), ('aa', 'at')} | {('domain', k) for k in [ - 'missed_cleavages', 'start', 'end', 'y_ions', 'b_ions', - 'a_ions', 'x_ions', 'c_ions', 'z_ions']}, - - 'floats': {('group', k) for k in [ - 'fI', 'sumI', 'maxI', 'mh', 'expect']} | { - ('domain', k) for k in [ - 'expect', 'hyperscore', 'b_score', 'y_score', - 'a_score', 'x_score', 'c_score', 'z_score', - 'nextscore', 'delta', 'mh']} | { - ('protein', 'expect'), ('protein', 'sumI'), - ('aa', 'modified')}, - - 'bools': set(), - 'lists': {'group', 'trace', 'attribute', 'protein', 'aa', 'note'}, - 'floatlists': {('values', 'values')}, - 'intlists': set(), 'charlists': set(), 'duration': {('group', 'rt')}} - -_mzxml_schema_defaults = {'bools': {('dataProcessing', 'centroided'), - ('dataProcessing', 'chargeDeconvoluted'), - ('dataProcessing', 'deisotoped'), - ('dataProcessing', 'spotIntegration'), - ('maldi', 'collisionGas'), - ('scan', 'centroided'), - ('scan', 'chargeDeconvoluted'), - ('scan', 'deisotoped')}, - 'charlists': set(), - 'floatlists': set(), - 'floats': {('dataProcessing', 'intensityCutoff'), - ('precursorMz', 'precursorIntensity'), - ('precursorMz', 'windowWideness'), - ('precursorMz', 'precursorMz'), - ('scan', 'basePeakIntensity'), - ('scan', 'basePeakMz'), - ('scan', 'cidGasPressure'), - ('scan', 'collisionEnergy'), - ('scan', 'compensationVoltage'), - ('scan', 'endMz'), - ('scan', 'highMz'), - ('scan', 'ionisationEnergy'), - ('scan', 'lowMz'), - ('scan', 'startMz'), - ('scan', 'totIonCurrent')}, - 'duration': {("scan", "retentionTime") - }, - 'intlists': set(), - 'ints': {('msInstrument', 'msInstrumentID'), - ('peaks', 'compressedLen'), - ('precursorMz', 'precursorCharge'), - 
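# The 'duration' category above (e.g. ('scan', 'retentionTime') in the mzXML
# defaults) marks attributes encoded as XML duration strings such as
# "PT1234.5S". A minimal illustrative parser, for demonstration only; the
# real readers presumably attach units via helpers like unitfloat, which
# appears in the auxiliary imports further below in this diff:
import re

def duration_seconds(value):
    match = re.fullmatch(r"PT(\d+(?:\.\d+)?)S", value)
    return float(match.group(1)) if match else float(value)

assert duration_seconds("PT1234.5S") == 1234.5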
('robot', 'deadVolume'), - ('scan', 'msInstrumentID'), - ('scan', 'peaksCount'), - ('scanOrigin', 'num'), - ('scan', 'msLevel')}, - 'lists': {'dataProcessing', - 'msInstrument', - 'parentFile', - 'peaks', - 'plate', - 'precursorMz', - 'scanOrigin', - 'spot'}} - -_mzml_schema_defaults = {'ints': { - ('spectrum', 'index'), - ('instrumentConfigurationList', 'count'), - ('binaryDataArray', 'encodedLength'), - ('cvList', 'count'), - ('binaryDataArray', 'arrayLength'), - ('scanWindowList', 'count'), - ('componentList', 'count'), - ('sourceFileList', 'count'), - ('productList', 'count'), - ('referenceableParamGroupList', 'count'), - ('scanList', 'count'), - ('spectrum', 'defaultArrayLength'), - ('dataProcessingList', 'count'), - ('sourceFileRefList', 'count'), - ('scanSettingsList', 'count'), - ('selectedIonList', 'count'), - ('chromatogram', 'defaultArrayLength'), - ('precursorList', 'count'), - ('chromatogram', 'index'), - ('processingMethod', 'order'), - ('targetList', 'count'), - ('sampleList', 'count'), - ('softwareList', 'count'), - ('binaryDataArrayList', 'count'), - ('spectrumList', 'count'), - ('chromatogramList', 'count'), - ('selectedIon', 'charge state')}, - 'floats': {}, - 'bools': {}, - 'lists': {'scan', 'spectrum', 'sample', 'cv', 'dataProcessing', - 'cvParam', 'source', 'userParam', 'detector', 'product', - 'referenceableParamGroupRef', 'selectedIon', 'sourceFileRef', - 'binaryDataArray', 'analyzer', 'scanSettings', - 'instrumentConfiguration', 'chromatogram', 'target', - 'processingMethod', 'precursor', 'sourceFile', - 'referenceableParamGroup', 'contact', 'scanWindow', 'software'}, - 'intlists': {}, - 'floatlists': {}, - 'charlists': {}} - -_pepxml_schema_defaults = {'ints': - {('xpressratio_summary', 'xpress_light'), - ('distribution_point', 'obs_5_distr'), - ('distribution_point', 'obs_2_distr'), - ('enzymatic_search_constraint', 'max_num_internal_cleavages'), - ('asapratio_lc_heavypeak', 'right_valley'), - ('libra_summary', 'output_type'), - ('distribution_point', 'obs_7_distr'), - ('spectrum_query', 'index'), - ('data_filter', 'number'), - ('roc_data_point', 'num_incorr'), - ('search_hit', 'num_tol_term'), - ('search_hit', 'num_missed_cleavages'), - ('asapratio_lc_lightpeak', 'right_valley'), - ('libra_summary', 'normalization'), - ('specificity', 'min_spacing'), - ('database_refresh_timestamp', 'min_num_enz_term'), - ('enzymatic_search_constraint', 'min_number_termini'), - ('xpressratio_result', 'light_lastscan'), - ('distribution_point', 'obs_3_distr'), - ('spectrum_query', 'end_scan'), - ('analysis_result', 'id'), - ('search_database', 'size_in_db_entries'), - ('search_hit', 'hit_rank'), - ('alternative_protein', 'num_tol_term'), - ('search_hit', 'num_tot_proteins'), - ('asapratio_summary', 'elution'), - ('search_hit', 'tot_num_ions'), - ('error_point', 'num_incorr'), - ('mixture_model', 'precursor_ion_charge'), - ('roc_data_point', 'num_corr'), - ('search_hit', 'num_matched_ions'), - ('dataset_derivation', 'generation_no'), - ('xpressratio_result', 'heavy_firstscan'), - ('xpressratio_result', 'heavy_lastscan'), - ('error_point', 'num_corr'), - ('spectrum_query', 'assumed_charge'), - ('analysis_timestamp', 'id'), - ('xpressratio_result', 'light_firstscan'), - ('distribution_point', 'obs_4_distr'), - ('asapratio_lc_heavypeak', 'left_valley'), - ('fragment_masses', 'channel'), - ('distribution_point', 'obs_6_distr'), - ('affected_channel', 'channel'), - ('search_result', 'search_id'), - ('contributing_channel', 'channel'), - ('asapratio_lc_lightpeak', 'left_valley'), - 
('asapratio_peptide_data', 'area_flag'), - ('search_database', 'size_of_residues'), - ('asapratio_peptide_data', 'cidIndex'), - ('mixture_model', 'num_iterations'), - ('mod_aminoacid_mass', 'position'), - ('spectrum_query', 'start_scan'), - ('asapratio_summary', 'area_flag'), - ('mixture_model', 'tot_num_spectra'), - ('search_summary', 'search_id'), - ('xpressratio_timestamp', 'xpress_light'), - ('distribution_point', 'obs_1_distr'), - ('intensity', 'channel'), - ('asapratio_contribution', 'charge'), - ('libra_summary', 'centroiding_preference')}, - 'floats': - {('asapratio_contribution', 'error'), - ('asapratio_lc_heavypeak', 'area_error'), - ('modification_info', 'mod_nterm_mass'), - ('distribution_point', 'model_4_neg_distr'), - ('distribution_point', 'model_5_pos_distr'), - ('spectrum_query', 'precursor_neutral_mass'), - ('asapratio_lc_heavypeak', 'time_width'), - ('xpressratio_summary', 'masstol'), - ('affected_channel', 'correction'), - ('distribution_point', 'model_7_neg_distr'), - ('error_point', 'error'), - ('intensity', 'target_mass'), - ('roc_data_point', 'sensitivity'), - ('distribution_point', 'model_4_pos_distr'), - ('distribution_point', 'model_2_neg_distr'), - ('distribution_point', 'model_3_pos_distr'), - ('mixture_model', 'prior_probability'), - ('roc_data_point', 'error'), - ('intensity', 'normalized'), - ('modification_info', 'mod_cterm_mass'), - ('asapratio_lc_lightpeak', 'area_error'), - ('distribution_point', 'fvalue'), - ('distribution_point', 'model_1_neg_distr'), - ('peptideprophet_summary', 'min_prob'), - ('asapratio_result', 'mean'), - ('point', 'pos_dens'), - ('fragment_masses', 'mz'), - ('mod_aminoacid_mass', 'mass'), - ('distribution_point', 'model_6_neg_distr'), - ('asapratio_lc_lightpeak', 'time_width'), - ('asapratio_result', 'heavy2light_error'), - ('peptideprophet_result', 'probability'), - ('error_point', 'min_prob'), - ('peptideprophet_summary', 'est_tot_num_correct'), - ('roc_data_point', 'min_prob'), - ('asapratio_result', 'heavy2light_mean'), - ('distribution_point', 'model_5_neg_distr'), - ('mixturemodel', 'neg_bandwidth'), - ('asapratio_result', 'error'), - ('xpressratio_result', 'light_mass'), - ('point', 'neg_dens'), - ('asapratio_lc_lightpeak', 'area'), - ('distribution_point', 'model_1_pos_distr'), - ('xpressratio_result', 'mass_tol'), - ('mixturemodel', 'pos_bandwidth'), - ('xpressratio_result', 'light_area'), - ('asapratio_peptide_data', 'heavy_mass'), - ('distribution_point', 'model_2_pos_distr'), - ('search_hit', 'calc_neutral_pep_mass'), - ('intensity', 'absolute'), - ('asapratio_peptide_data', 'light_mass'), - ('distribution_point', 'model_3_neg_distr'), - ('aminoacid_modification', 'mass'), - ('asapratio_lc_heavypeak', 'time'), - ('asapratio_lc_lightpeak', 'time'), - ('asapratio_lc_lightpeak', 'background'), - ('mixture_model', 'est_tot_correct'), - ('point', 'value'), - ('asapratio_lc_heavypeak', 'background'), - ('terminal_modification', 'mass'), - ('fragment_masses', 'offset'), - ('xpressratio_result', 'heavy_mass'), - ('search_hit', 'protein_mw'), - ('libra_summary', 'mass_tolerance'), - ('spectrum_query', 'retention_time_sec'), - ('distribution_point', 'model_7_pos_distr'), - ('asapratio_lc_heavypeak', 'area'), - ('alternative_protein', 'protein_mw'), - ('asapratio_contribution', 'ratio'), - ('xpressratio_result', 'heavy_area'), - ('distribution_point', 'model_6_pos_distr')}, - 'bools': - {('sample_enzyme', 'independent'), - ('intensity', 'reject'), - ('libra_result', 'is_rejected')}, - 'intlists': set(), - 'floatlists': set(), - 
'charlists': set(), - 'lists': {'point', 'aminoacid_modification', 'msms_run_summary', - 'mixturemodel', 'search_hit', 'mixturemodel_distribution', - 'sequence_search_constraint', 'specificity', 'alternative_protein', - 'analysis_result', 'data_filter', 'fragment_masses', 'error_point', - 'parameter', 'spectrum_query', 'search_result', 'affected_channel', - 'analysis_summary', 'roc_data_point', 'distribution_point', - 'search_summary', 'mod_aminoacid_mass', 'search_score', 'intensity', - 'analysis_timestamp', 'mixture_model', 'terminal_modification', - 'contributing_channel', 'inputfile'}} - - -_traml_schema_defaults = {'bools': set(), - 'charlists': set(), - 'floatlists': set(), - 'floats': {('Modification', 'averageMassDelta'), - ('Modification', 'monoisotopicMassDelta')}, - 'intlists': set(), - 'ints': {('Modification', 'location')}, - 'lists': {'Compound', - 'Configuration', - 'Contact', - 'Instrument', - 'IntermediateProduct', - 'Interpretation', - 'Modification', - 'Peptide', - 'Protein', - 'ProteinRef', - 'Publication', - 'RetentionTime', - 'RetentionTimeList', - 'Software', - 'SourceFile', - 'Target', - 'Transition', - 'ValidationStatus', - 'cv', - 'cvParam', - 'userParam'}} - -_idxml_schema_defaults = { - 'ints': {('PeptideHit', 'charge'), ('SearchParameters', 'missed_cleavages'), - ('PeptideHit', 'NumMatchedMainIons'), ('PeptideHit', 'IsotopeError')}, - 'floats': {('IdXML', 'version'), - ('PeptideHit', 'score'), - ('PeptideIdentification', 'MZ'), - ('PeptideIdentification', 'RT'), - ('PeptideIdentification', 'significance_threshold'), - ('PeptideHit', 'MS2IonCurrent'), - ('PeptideHit', 'MeanErrorAll'), - ('PeptideHit', 'MeanErrorTop7'), - ('PeptideHit', 'MeanRelErrorAll'), - ('PeptideHit', 'MeanRelErrorTop7'), - ('PeptideHit', 'NTermIonCurrentRatio'), - ('PeptideHit', 'CTermIonCurrentRatio'), - ('PeptideHit', 'StdevErrorAll'), - ('PeptideHit', 'StdevErrorTop7'), - ('PeptideHit', 'StdevRelErrorAll'), - ('PeptideHit', 'StdevRelErrorTop7'), - ('PeptideHit', 'ExplainedIonCurrentRatio'), - ('ProteinHit', 'coverage'), - ('ProteinHit', 'score'), - ('ProteinIdentification', 'significance_threshold'), - ('SearchParameters', 'peak_mass_tolerance'), - ('SearchParameters', 'precursor_peak_tolerance')}, - 'bools': {('PeptideIdentification', 'higher_score_better'), - ('ProteinIdentification', 'higher_score_better'), - ('SearchParameters', 'peak_mass_tolerance_ppm'), - ('SearchParameters', 'precursor_peak_tolerance_ppm')}, - 'intlists': set(), - 'floatlists': set(), - 'charlists': set(), - 'lists': {'FixedModification', - 'IdentificationRun', - 'PeptideHit', - 'PeptideIdentification', - 'ProteinHit', - 'ProteinIdentification', - 'SearchParameters', - 'UserParam', - 'VariableModification'}} diff --git a/pyteomics/achrom.py b/pyteomics/achrom.py deleted file mode 100644 index 05a3224618136616491571bf51156099e5efeed4..0000000000000000000000000000000000000000 --- a/pyteomics/achrom.py +++ /dev/null @@ -1,1326 +0,0 @@ -""" -achrom - additive model of polypeptide chromatography -===================================================== - -Summary -------- - -The additive model of polypeptide chromatography, or achrom, is the most basic -model for peptide retention time prediction. The main equation behind -achrom has the following form: - -.. 
math:: - - RT = (1 + m\\,ln N) \\sum_{i=1}^{i=N}{RC_i n_i} + RT_0 - - -Here, :math:`RC_i` is the retention coefficient of the amino acid -residues of the i-th type, :math:`n_i` corresponds to the number of amino acid -residues of type :math:`i` in the peptide sequence, N is the total number of -different *types* of amino acid residues present, -and :math:`RT_0` is a constant retention time shift. - -In order to use achrom, one needs to find the retention -coeffcients, using experimentally determined retention times for a training set -of peptide retention times, i.e. to *calibrate* the model. - -Calibration ------------ - - :py:func:`get_RCs` - find a set of retention coefficients using a - given set of peptides with known retention times and a fixed value of - length correction parameter. - - :py:func:`get_RCs_vary_lcp` - find the best length correction parameter - and a set of retention coefficients for a given peptide sample. - -Retention time calculation --------------------------- - - :py:func:`calculate_RT` - calculate the retention time of a peptide - using a given set of retention coefficients. - -Data ----- - - :py:data:`RCs_guo_ph2_0` - a set of retention coefficients (RCs) - from [#Guo1]_. Conditions: Synchropak RP-P C18 column (250 x 4.1 mm - I.D.), gradient (A = 0.1% aq. TFA, pH 2.0; B = 0.1% TFA in acetonitrile) at - 1% B/min, flow rate 1 ml/min, 26 centigrades. - - :py:data:`RCs_guo_ph7_0` - a set of retention coefficients (RCs) - from [#Guo1]_. Conditions: Synchropak RP-P C18 column (250 x 4.1 mm - I.D.), gradient (A = aq. 10 mM (NH4)2HPO4 - 0.1 M NaClO4, pH 7.0; B - = 0.1 M NaClO4 in 60% aq. acetonitrile) at 1.67% B/min, flow rate 1 - ml/min, 26 centigrades. - - :py:data:`RCs_meek_ph2_1` - a set of RCs from [#Meek]_. Conditions: Bio-Rad - "ODS" column, gradient (A = 0.1 M NaClO4, 0.1% phosphoric acid in - water; B = 0.1 M NaClO4, 0.1% phosphoric acid in 60% - aq. acetonitrile) at 1.25% B/min, room temperature. - - :py:data:`RCs_meek_ph7_4` - a set of RCs from [#Meek]_. Conditions: Bio-Rad - "ODS" column, gradient (A = 0.1 M NaClO4, 5 mM phosphate buffer in - water; B = 0.1 M NaClO4, 5 mM phosphate buffer in 60% - aq. acetonitrile) at 1.25% B/min, room temperature. - - :py:data:`RCs_browne_tfa` - a set of RCs found in - [#Browne]_. Conditions: Waters mjuBondapak C18 column, gradient (A = - 0.1% aq. TFA, B = 0.1% TFA in acetonitrile) at 0.33% B/min, flow - rate 1.5 ml/min. - - :py:data:`RCs_browne_hfba` - a set of RCs found in - [#Browne]_. Conditions: Waters mjuBondapak C18 column, gradient (A = - 0.13% aq. HFBA, B = 0.13% HFBA in acetonitrile) at 0.33% B/min, flow - rate 1.5 ml/min. - - :py:data:`RCs_palmblad` - a set of RCs from - [#Palmblad]_. Conditions: a fused silica column (80-100 x 0.200 mm - I.D.) packed in-house with C18 ODS-AQ; solvent A = 0.5% aq. HAc, - B = 0.5% HAc in acetonitrile. - - :py:data:`RCs_yoshida` - a set of RCs for normal phase chromatography - from [#Yoshida]_. Conditions: - TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = 0.1% TFA - in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at 0.6% - water/min, flow rate 1.0 ml/min, 40 centigrades. - - :py:data:`RCs_yoshida_lc` - a set of length-corrected RCs for normal phase - chromatography. The set was calculated in [#Moskovets]_ for the data from - [#Yoshida]_. - Conditions: - TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = 0.1% TFA - in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at 0.6% - water/min, flow rate 1.0 ml/min, 40 centigrades. 
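# A toy restatement of the achrom equation above (not this module's
# implementation; calculate_RT further below is the real one), with a made-up
# two-residue RC set: RT = (1 + m*ln(N)) * sum_i(RC_i * n_i) + RT_0, where N
# is the peptide length as used in the code below.
import math
from collections import Counter

def toy_rt(sequence, rc, m=0.0, rt0=0.0):
    counts = Counter(sequence)                        # n_i: residue counts
    weighted = sum(rc[aa] * n for aa, n in counts.items())
    return (1.0 + m * math.log(len(sequence))) * weighted + rt0

assert abs(toy_rt("AAG", {"A": 1.0, "G": 0.5}) - 2.5) < 1e-9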
- - :py:data:`RCs_zubarev` - a set of length-corrected RCs calculated - on a dataset used in [#Goloborodko]_. - Conditions: Reprosil-Pur C18-AQ column (150 x 0.075 mm I.D.), gradient (A = - 0.5% AA in water; B = 0.5% AA in ACN-water (90:10)) at - 0.5% water/min, flow rate 200.0 nl/min, room temperature. - - :py:data:`RCs_gilar_atlantis_ph3_0` - a set of retention coefficients obtained - in [#Gilar]_. - Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A, - gradient (A = water, B = ACN, C = 200 mM ammonium formate): - 0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C - at 0.2 ml/min, temperature 40 C, pH 3.0 - - :py:data:`RCs_gilar_atlantis_ph4_5` - a set of retention coefficients obtained - in [#Gilar]_. - Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A, - gradient (A = water, B = ACN, C = 200 mM ammonium formate): - 0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C - at 0.2 ml/min, temperature 40 C, pH 4.5 - - :py:data:`RCs_gilar_atlantis_ph10_0` - a set of retention coefficients - obtained in [#Gilar]_. - Conditions: Atlantis HILIC silica column, (150 x 2.1 mm I.D.), 3 um, 100 A, - gradient (A = water, B = ACN, C = 200 mM ammonium formate): - 0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C - at 0.2 ml/min, temperature 40 C, pH 10.0 - - :py:data:`RCs_gilar_beh` - a set of retention coefficients obtained in - [#Gilar]_. - Conditions: ACQUITY UPLC BEH HILIC column (150 x 2.1 mm I.D.), 1.7 um, 130 A, - Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by - titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: - 90% ACN, 10% mobile phase A (v:v). - Gradient: 90-60% B in 50 min. - - :py:data:`RCs_gilar_beh_amide` - a set of retention coefficients obtained in - [#Gilar]_. - Conditions: ACQUITY UPLC BEH glycan column (150 x 2.1 mm I.D.), 1.7 um, 130 A, - Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by - titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: - 90% ACN, 10% mobile phase A (v:v). - Gradient: 90-60% B in 50 min. - - :py:data:`RCs_gilar_rp` - a set of retention coefficients obtained in - [#Gilar]_. - Conditions: ACQUITY UPLC BEH C18 column (100 mm x 2.1 mm I.D.), 1.7 um, 130 A. - Mobile phase A: 0.02% TFA in water, mobile phase B: 0.018% TFA in ACN. - Gradient: 0 to 50% B in 50 min, flow rate 0.2 ml/min, temperature 40 C., - pH 2.6. - - :py:data:`RCs_krokhin_100A_fa` - a set of retention coefficients obtained in - [#Krokhin]_. - Conditions: 300 um x 150mm PepMap100 (Dionex, 0.1% FA), packed with - 5-um Luna C18(2) (Phenomenex, Torrance, CA), pH=2.0. - Both eluents A (2% ACN in water) and B (98% ACN) contained - 0.1% FA as ion-pairing modifier. 0.33% ACN/min - linear gradient (0-30% B). - - :py:data:`RCs_krokhin_100A_tfa` - a set of retention coefficients obtained in - [#Krokhin]_. - Conditions: 300 um x 150mm PepMap100 (Dionex, 0.1% TFA), packed with - 5-um Luna C18(2) (Phenomenex, Torrance, CA), pH=2.0. - Both eluents A (2% ACN in water) and B (98% ACN) contained - 0.1% TFA as ion-pairing modifier. 0.33% ACN/min - linear gradient (0-30% B). - -Theory ------- - -The additive model of polypeptide chromatography, or the model of -retention coefficients was the earliest attempt to describe the dependence of -retention time of a polypeptide in liquid chromatography on its sequence -[#Meek]_, [#Guo1]_. In this model, each amino acid is assigned a number, or -a *retention coefficient* (RC) describing its retention properties. 
The -retention time (RT) during a gradient elution is then calculated as: - -.. math:: - - RT = \\sum_{i=1}^{i=N}{RC_i \\cdot n_i} + RT_0, - -which is the sum of retention coefficients of all amino acid residues in a -polypeptide. This equation can also be expressed in terms of linear -algebra: - -.. math:: - - RT = \\bar{aa} \\cdot \\bar{RC} + RT_0, - -where :math:`\\bar{aa}` is a vector of amino acid composition, -i.e. :math:`\\bar{aa}_i` is the number of amino acid residues of i-th -type in a polypeptide; :math:`\\bar{RC}` is a vector of respective -retention coefficients. - -In this formulation, it is clear that additive model gives the same results for -any two peptides with different sequences but the same amino acid -composition. In other words, **additive model is not sequence-specific**. - -The additive model has two advantages over all other models of chromatography -- it is easy to understand and use. The rule behind the additive model is as -simple as it could be: **each amino acid residue shifts retention time by a -fixed value, depending only on its type**. This rule allows geometrical -interpretation. Each peptide may be represented by a point in 21-dimensional -space, with first 20 coordinates equal to the amounts of corresponding amino -acid residues in the peptide and 21-st coordinate equal to RT. The additive -model assumes that a line may be drawn through these points. Of course, this -assumption is valid only partially, and most points would not lie on the -line. But the line would describe the main trend and could be used to estimate -retention time for peptides with known amino acid composition. - -This best fit line is described by retention coefficients and :math:`RT_0`. -The procedure of finding these coefficients is called *calibration*. There is -`an analytical solution to calibration of linear models -<http://en.wikipedia.org/wiki/Linear_regression>`_, which makes them -especially useful in real applications. - -Several attempts were made in order to improve the accuracy of prediction by -the additive model (for a review of the field we suggest to read [#Baczek]_ -and [#Babushok]_). The two implemented in this module are the logarithmic -length correction term described in [#MantLogLen]_ and additional sets of -retention coefficients for terminal amino acid residues [#Tripet]_. - -Logarithmic length correction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This enhancement was firstly described in [#MantLogLen]_. Briefly, it was -found that the following equation better describes the dependence of RT on the -peptide sequence: - -.. math:: - - RT = \\sum_{i=1}^{i=N}{RC_i} + m\\,ln N \\sum_{i=1}^{i=N}{RC_i} + RT_0 - -We would call the second term :math:`m\\,ln N \\sum_{i=1}^{i=N}{RC_i}` *the -length correction term* and m - *the length correction parameter*. The -simplified and vectorized form of this equation would be: - -.. math:: - - RT = (1 + m\\,ln N) \\, \\bar{RC} \\cdot \\bar{aa} + RT_0 - -This equation may be reduced to a linear form and solved by the standard -methods. - -Terminal retention coefficients -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Another significant improvement may be obtained through introduction of -separate sets of retention coefficients for terminal amino acid residues -[#Tripet]_. - -References ----------- - -.. [#Meek] Meek, J. L. `Prediction of peptide retention times in high-pressure - liquid chromatography on the basis of amino acid composition. - <http://www.ncbi.nlm.nih.gov/pubmed/6929513>`_ - PNAS, 1980, 77 (3), 1632-1636. - -.. 
[#Guo1] Guo, D.; Mant, C. T.; Taneja, A. K.; Parker, J. M. R.; Hodges, - R. S. `Prediction of peptide retention times in reversed-phase - high-performance liquid chromatography I. Determination of retention - coefficients of amino acid residues of model synthetic peptides. - <http://dx.doi.org/10.1016/0021-9673(86)80102-9>`_ - Journal of Chromatography A, 1986, 359, 499-518. - -.. [#Baczek] Baczek, T.; Kaliszan, R. `Predictions of peptides' retention times - in reversed-phase liquid chromatography as a new supportive tool to improve - protein identification in proteomics. - <http://dx.doi.org/10.1002/pmic.200800544>`_ - Proteomics, 2009, 9 (4), 835-47. - -.. [#Babushok] Babushok, V. I.; Zenkevich, I. G. `Retention Characteristics of - Peptides in RP-LC: Peptide Retention Prediction. - <http://dx.doi.org/10.1365/s10337-010-1721-8>`_ - Chromatographia, 2010, 72 (9-10), 781-797. - -.. [#MantLogLen] Mant, C. T.; Zhou, N. E.; Hodges, R. S. `Correlation of - protein retention times in reversed-phase chromatography with polypeptide - chain length and hydrophobicity. - <http://dx.doi.org/10.1016/S0021-9673(01)93882-8>`_ - Journal of Chromatography A, 1989, 476, 363-375. - -.. [#Tripet] Tripet, B.; Cepeniene, D.; Kovacs, J. M.; Mant, C. T.; Krokhin, - O. V.; Hodges, R. S. `Requirements for prediction of peptide retention time - in reversed-phase high-performance liquid chromatography: - hydrophilicity/hydrophobicity of side-chains at the N- and C-termini of - peptides are dramatically affected by the end-groups and location. - <http://dx.doi.org/10.1016/j.chroma.2006.12.024>`_ - Journal of chromatography A, 2007, 1141 (2), 212-25. - -.. [#Browne] Browne, C. A.; Bennett, H. P. J.; Solomon, S. `The - isolation of peptides by high-performance liquid chromatography - using predicted elution positions - <http://www.sciencedirect.com/science/article/pii/000326978290238X>`_. - Analytical Biochemistry, 1982, 124 (1), 201-208. - -.. [#Palmblad] Palmblad, M.; Ramstrom, M.; Markides, K. E.; Hakansson, - P.; Bergquist, J. `Prediction of Chromatographic Retention and - Protein Identification in Liquid Chromatography/Mass - Spectrometry - <http://pubs.acs.org/doi/abs/10.1021/ac0256890>`_. - Analytical Chemistry, 2002, 74 (22), 5826-5830. - -.. [#Yoshida] Yoshida, T. Calculation of peptide retention - coefficients in normal-phase liquid chromatography. Journal of - Chromatography A, 1998, 808 (1-2), 105-112. - -.. [#Moskovets] Moskovets, E.; Goloborodko A. A.; Gorshkov A. V.; Gorshkov M.V. - `Limitation of predictive 2-D liquid chromatography in reducing the database - search space in shotgun proteomics: In silico studies. - <http://dx.doi.org/10.1002/jssc.201100798>`_ - Journal of Separation Science, 2012, 35 (14), 1771-1778. - -.. [#Goloborodko] Goloborodko A. A.; Mayerhofer C.; Zubarev A. R.; - Tarasova I. A.; Gorshkov A. V.; Zubarev, R. A.; Gorshkov, M. V. - `Empirical approach to false discovery rate - estimation in shotgun proteomics. <http://dx.doi.org/10.1002/rcm.4417>`_ - Rapid communications in mass spectrometry, 2010, 24(4), 454-62. - -.. [#Gilar] Gilar, M., & Jaworski, A. (2011). `Retention behavior of peptides in - hydrophilic-interaction chromatography. - <http://dx.doi.org/10.1016/j.chroma.2011.04.005>`_ - Journal of chromatography A, 1218(49), 8890-6. - -.. [#Krokhin] Dwivedi, R. C.; Spicer, V.; Harder, M.; Antonovici, M.; Ens, W.; - Standing, K. G.; Wilkins, J. A.; Krokhin, O. V. (2008). 
`Practical - implementation of 2D HPLC scheme with accurate peptide retention prediction - in both dimensions for high-throughput bottom-up proteomics - <http://pubs.acs.org/doi/abs/10.1021/ac800984n>`_. - Analytical Chemistry, 80(18), 7036-42. - -Dependencies ------------- - -This module requires :py:mod:`numpy` and, optionally, :py:mod:`scikit-learn` -(for MAE regression). - --------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -from .auxiliary import linear_regression, PyteomicsError -try: - from sklearn.linear_model import QuantileRegressor -except ImportError: - QuantileRegressor = None - -from . import parser - -def get_RCs(sequences, RTs, lcp=-0.21, term_aa=False, metric='mse', **kwargs): - """Calculate the retention coefficients of amino acids using - retention times of a peptide sample and a fixed value of length - correction parameter. - - Parameters - ---------- - sequences : list of str - List of peptide sequences. - RTs: list of float - List of corresponding retention times. - lcp : float, optional - A multiplier before ln(L) term in the equation for the retention - time of a peptide. Set to -0.21 by default. - term_aa : bool, optional - If :py:const:`True`, terminal amino acids are treated as being - modified with 'ntermX'/'ctermX' modifications. :py:const:`False` - by default. - metric : str, optional - Metric for the regression problem. Set to "mse" (mean squared - error) by default. Alternative: "mae" (mean absolute error), - which uses quantile regression. - - .. note :: - `"mae"` requires :py:mod:`scikit-learn` for - `quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_. - - labels : list of str, optional - List of all possible amino acids and terminal groups - If not given, any modX labels are allowed. - - Returns - ------- - RC_dict : dict - Dictionary with the calculated retention coefficients. - - - RC_dict['aa'] -- amino acid retention coefficients. - - - RC_dict['const'] -- constant retention time shift. - - - RC_dict['lcp'] -- length correction parameter. - - Examples - -------- - >>> RCs = get_RCs(['A','AA'], [1.0, 2.0], 0.0, labels=['A']) - >>> abs(RCs['aa']['A'] - 1) < 1e-6 and abs(RCs['const']) < 1e-6 - True - >>> RCs = get_RCs(['A','AA','B'], [1.0, 2.0, 2.0], 0.0, labels=['A','B']) - >>> abs(RCs['aa']['A'] - 1) + abs(RCs['aa']['B'] - 2) + \ - abs(RCs['const']) < 1e-6 - True - """ - - labels = kwargs.get('labels') - - # Make a list of all amino acids present in the sample. - peptide_dicts = [ - parser.amino_acid_composition(peptide, False, term_aa, - allow_unknown_modifications=True, - labels=labels) - if not isinstance(peptide, dict) else peptide - for peptide in sequences] - - detected_amino_acids = {aa for peptide_dict in peptide_dicts - for aa in peptide_dict} - - # Determine retention coefficients using multidimensional linear - # regression. 
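# The block below builds the regression design matrix: one row per peptide,
# holding that peptide's residue counts scaled by the fixed length-correction
# factor (1 + lcp * ln(L)), plus a trailing 1 so that the same linear solve
# also recovers the constant shift RT_0.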
- composition_array = [] - for pdict in peptide_dicts: - loglen = np.log(parser.length(pdict)) - composition_array.append([pdict.get(aa, 0.) - * (1. + lcp * loglen) - for aa in detected_amino_acids] + [1.]) - - # Add normalizing conditions for terminal retention coefficients. The - # condition we are using here is quite arbitrary. It implies that the sum - # of N- or C-terminal RCs minus the sum of corresponding internal RCs must - # be equal to zero. - if term_aa: - for term_label in ['nterm', 'cterm']: - normalizing_peptide = [] - for aa in detected_amino_acids: - if aa.startswith(term_label): - normalizing_peptide.append(1.0) - elif (term_label+aa) in detected_amino_acids: - normalizing_peptide.append(-1.0) - else: - normalizing_peptide.append(0.0) - normalizing_peptide.append(0.0) - composition_array.append(normalizing_peptide) - RTs.append(0.0) - - if metric == 'mse': - # # Use least square linear regression. - RCs, _, _, _ = np.linalg.lstsq(np.array(composition_array), np.array(RTs), rcond=None) - - elif metric == 'mae': - if QuantileRegressor is None: - raise PyteomicsError("`metric='mae'` requires scikit-learn.") - # Use Quantile regression. - QR = QuantileRegressor(fit_intercept=False, alpha=0, solver='highs') - QR.fit(np.array(composition_array), np.array(RTs)) - RCs = QR.coef_ - else: - raise PyteomicsError('Invalid metric "{}". Must be "mse" or "mae".'.format(metric)) - - # Remove normalizing elements from the RTs vector. - if term_aa: - for term_label in ['nterm', 'cterm']: - RTs.pop() - - # Form output. - RC_dict = {} - RC_dict['aa'] = dict( - zip(list(detected_amino_acids), - RCs[:len(detected_amino_acids)])) - RC_dict['aa'][parser.std_nterm] = 0.0 - RC_dict['aa'][parser.std_cterm] = 0.0 - RC_dict['const'] = RCs[len(detected_amino_acids)] - RC_dict['lcp'] = lcp - - # Find remaining terminal RCs. - if term_aa: - for term_label in ['nterm', 'cterm']: - # Check if there are terminal RCs remaining undefined. - undefined_term_RCs = [aa for aa in RC_dict['aa'] - if aa[1:5] != 'term' - and term_label + aa not in RC_dict['aa']] - if not undefined_term_RCs: - continue - - # Find a linear relationship between internal and terminal RCs. - defined_term_RCs = [aa for aa in RC_dict['aa'] - if aa[1:5] != 'term' - and term_label + aa in RC_dict['aa']] - - a, b, r, stderr = linear_regression( - [RC_dict['aa'][aa] for aa in defined_term_RCs], - [RC_dict['aa'][term_label+aa] for aa in defined_term_RCs]) - - # Define missing terminal RCs using this linear equation. - for aa in undefined_term_RCs: - RC_dict['aa'][term_label + aa] = a * RC_dict['aa'][aa] + b - - return RC_dict - - -def get_RCs_vary_lcp(sequences, RTs, term_aa=False, lcp_range=(-1.0, 1.0), metric='mse', **kwargs): - """Find the best combination of a length correction parameter and - retention coefficients for a given peptide sample. - - Parameters - ---------- - sequences : list of str - List of peptide sequences. - RTs : list of float - List of corresponding retention times. - term_aa : bool, optional - If True, terminal amino acids are treated as being - modified with 'ntermX'/'ctermX' modifications. False by default. - metric : str, optional - Metric for the regression problem. Set to "mse" (mean squared - error) by default. Alternative: "mae" (mean absolute error). - - .. note :: - `"mae"` requires :py:mod:`scikit-learn` for - `quantile regression <https://scikit-learn.org/stable/auto_examples/linear_model/plot_quantile_regression.html>`_. 
- - lcp_range : 2-tuple of float, optional - Range of possible values of the length correction parameter. - labels : list of str, optional - List of labels for all possible amino acids and terminal groups - If not given, any modX labels are allowed. - lcp_accuracy : float, optional - The accuracy of the length correction parameter calculation. - - Returns - ------- - RC_dict : dict - Dictionary with the calculated retention coefficients. - - - RC_dict['aa'] -- amino acid retention coefficients. - - - RC_dict['const'] -- constant retention time shift. - - - RC_dict['lcp'] -- length correction parameter. - - Examples - -------- - >>> RCs = get_RCs_vary_lcp(['A', 'AA', 'AAA'], \ - [1.0, 2.0, 3.0], \ - labels=['A']) - >>> abs(RCs['aa']['A'] - 1) + abs(RCs['lcp']) + abs(RCs['const']) < 1e-6 - True - """ - labels = kwargs.get('labels') - - best_r = -1.1 - best_RC_dict = {} - lcp_accuracy = kwargs.get('lcp_accuracy', 0.1) - - min_lcp = lcp_range[0] - max_lcp = lcp_range[1] - step = (max_lcp - min_lcp) / 10.0 - peptide_dicts = [ - parser.amino_acid_composition(peptide, False, term_aa, - allow_unknown_modifications=True, - labels=labels) - if not isinstance(peptide, dict) else peptide - for peptide in sequences] - while step > lcp_accuracy: - lcp_grid = np.arange(min_lcp, max_lcp, - (max_lcp - min_lcp) / 10.0) - for lcp in lcp_grid: - RC_dict = get_RCs(peptide_dicts, RTs, lcp, term_aa, labels=labels, metric=metric) - regression_coeffs = linear_regression( - RTs, - [calculate_RT(peptide, RC_dict) for peptide in peptide_dicts]) - if regression_coeffs[2] > best_r: - best_r = regression_coeffs[2] - best_RC_dict = dict(RC_dict) - min_lcp = best_RC_dict['lcp'] - step - max_lcp = best_RC_dict['lcp'] + step - step = (max_lcp - min_lcp) / 10.0 - - return best_RC_dict - - -def calculate_RT(peptide, RC_dict, raise_no_mod=True): - """Calculate the retention time of a peptide using a given set - of retention coefficients. - - Parameters - ---------- - peptide : str or dict - A peptide sequence or amino acid composition. - RC_dict : dict - A set of retention coefficients, length correction parameter and - a fixed retention time shift. Keys are: 'aa', 'lcp' and 'const'. - raise_no_mod : bool, optional - If :py:const:`True` then an exception is raised when a modified amino - acid from `peptides` is not found in `RC_dict`. If :py:const:`False`, - then the retention coefficient for the non-modified amino acid residue - is used instead. :py:const:`True` by default. - - Returns - ------- - RT : float - Calculated retention time. - - Examples - -------- - >>> RT = calculate_RT('AA', {'aa': {'A': 1.1}, 'lcp':0.0, 'const': 0.1}) - >>> abs(RT - 2.3) < 1e-6 # Float comparison - True - >>> RT = calculate_RT('AAA', {'aa': {'ntermA': 1.0, 'A': 1.1, 'ctermA': 1.2},\ - 'lcp': 0.0, 'const':0.1}) - >>> abs(RT - 3.4) < 1e-6 # Float comparison - True - >>> RT = calculate_RT({'A': 3}, {'aa': {'ntermA': 1.0, 'A': 1.1, 'ctermA': 1.2},\ - 'lcp': 0.0, 'const':0.1}) - >>> abs(RT - 3.4) < 1e-6 # Float comparison - True - """ - - amino_acids = [aa for aa in RC_dict['aa'] - if not (aa[:5] == 'nterm' or aa[:5] == 'cterm')] - - # Check if there are retention coefficients for terminal amino acids. - term_aa = False - for aa in RC_dict['aa']: - if aa[:5] == 'nterm' or aa[:5] == 'cterm': - term_aa = True - break - - # Calculate retention time. 
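# Annotated flow of the code below: obtain a residue-composition dict
# (parsing the sequence unless a dict was passed directly), sum the matching
# retention coefficients -- optionally falling back to the unmodified residue
# when raise_no_mod=False -- then apply the multiplicative length correction
# (1 + lcp * ln(L)) and add the constant shift.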
- if isinstance(peptide, dict): - peptide_dict = peptide - else: - peptide_dict = parser.amino_acid_composition(peptide, False, term_aa, - allow_unknown_modifications=True, labels=amino_acids) - RT = 0.0 - for aa in peptide_dict: - if aa not in RC_dict['aa']: - if len(aa) == 1: - raise PyteomicsError('No RC for residue "{}".'.format(aa)) - if (not raise_no_mod) and aa[-1] in RC_dict['aa']: - RT += peptide_dict[aa] * RC_dict['aa'][aa[-1]] - else: - raise PyteomicsError( - 'Residue "{0}" not found in RC_dict. '.format(aa) + - 'Set raise_no_mod=False to ignore this error ' + - 'and use the RC for "{0}"" instead.'.format(aa[-1])) - else: - RT += peptide_dict[aa] * RC_dict['aa'][aa] - - length_correction_term = ( - 1.0 + RC_dict.get('lcp', 0) * np.log(parser.length(peptide_dict))) - RT *= length_correction_term - - RT += RC_dict.get('const', 0) - - return RT - -RCs_guo_ph2_0 = {'aa':{'K': -2.1, - 'G': -0.2, - 'L': 8.1, - 'A': 2.0, - 'C': 2.6, - 'E': 1.1, - 'D': 0.2, - 'F': 8.1, - 'I': 7.4, - 'H': -2.1, - 'M': 5.5, - 'N': -0.6, - 'Q': 0.0, - 'P': 2.0, - 'S': -0.2, - 'R': -0.6, - 'T': 0.6, - 'W': 8.8, - 'V': 5.0, - 'Y': 4.5, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients from Guo, D.; Mant, C. T.; Taneja, -A. K.; Parker, J. M. R.; Hodges, R. S. Prediction of peptide -retention times in reversed-phase high-performance liquid -chromatography I. Determination of retention coefficients of amino -acid residues of model synthetic peptides. Journal of Chromatography -A, 1986, 359, 499-518. - -Conditions: Synchropak RP-P C18 column (250 x 4.1 mm I.D.), gradient -(A = 0.1% aq. TFA, pH 2.0; B = 0.1% TFA in acetonitrile) at 1% B/min, -flow rate 1 ml/min, 26 centigrades. -""" - -RCs_guo_ph7_0 = {'aa':{'K': -0.2, - 'G': -0.2, - 'L': 9.0, - 'A': 2.2, - 'C': 2.6, - 'E': -1.3, - 'D': -2.6, - 'F': 9.0, - 'I': 8.3, - 'H': 2.2, - 'M': 6.0, - 'N': -0.8, - 'Q': 0.0, - 'P': 2.2, - 'S': -0.5, - 'R': 0.9, - 'T': 0.3, - 'W': 9.5, - 'V': 5.7, - 'Y': 4.6, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients from Guo, D.; Mant, C. T.; Taneja, -A. K.; Parker, J. M. R.; Hodges, R. S. Prediction of peptide -retention times in reversed-phase high-performance liquid -chromatography I. Determination of retention coefficients of amino -acid residues of model synthetic peptides. Journal of Chromatography -A, 1986, 359, 499-518. - -Conditions: Synchropak RP-P C18 column (250 x 4.1 mm I.D.), gradient -(A = aq. 10 mM (NH4)2HPO4 - 0.1 M NaClO4, pH 7.0; B = 0.1 M NaClO4 in -60% aq. acetonitrile) at 1.67% B/min, flow rate 1 ml/min, 26 -centigrades. -""" - -RCs_meek_ph2_1 = {'aa':{'K': -3.2, - 'G': -0.5, - 'L': 10.0, - 'A': -0.1, - 'C': -2.2, - 'E': -7.5, - 'D': -2.8, - 'F': 13.9, - 'I': 11.8, - 'H': 0.8, - 'M': 7.1, - 'N': -1.6, - 'Q': -2.5, - 'P': 8.0, - 'S': -3.7, - 'R': -4.5, - 'T': 1.5, - 'W': 18.1, - 'V': 3.3, - 'Y': 8.2, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients determined in Meek, -J. L. Prediction of peptide retention times in high-pressure liquid -chromatography on the basis of amino acid composition. PNAS, 1980, 77 -(3), 1632-1636. - -.. note :: C stands for Cystine. - -Conditions: Bio-Rad "ODS" column, gradient (A = 0.1 M NaClO4, -0.1% phosphoric acid in water; B = 0.1 M NaClO4, 0.1% phosphoric acid -in 60% aq. acetonitrile) at 1.25% B/min, room temperature. 
-""" - -RCs_meek_ph7_4 = {'aa':{'K': 0.1, - 'G': 0.0, - 'L': 8.8, - 'A': 0.5, - 'C': -6.8, - 'E':-16.9, - 'D': -8.2, - 'F': 13.2, - 'I': 13.9, - 'H': -3.5, - 'M': 4.8, - 'N': 0.8, - 'Q': -4.8, - 'P': 6.1, - 'S': 1.2, - 'R': 0.8, - 'T': 2.7, - 'W': 14.9, - 'V': 2.7, - 'Y': 6.1, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients determined in Meek, -J. L. Prediction of peptide retention times in high-pressure liquid -chromatography on the basis of amino acid composition. PNAS, 1980, 77 -(3), 1632-1636. - -.. note :: C stands for Cystine. - -Conditions: Bio-Rad "ODS" column, gradient (A = 0.1 M NaClO4, -5 mM phosphate buffer in water; B = 0.1 M NaClO4, 5 mM phosphate buffer -in 60% aq. acetonitrile) at 1.25% B/min, room temperature. -""" - -RCs_browne_tfa = {'aa':{'K': -3.7, - 'G': -1.2, - 'L': 20.0, - 'A': 7.3, - 'C': -9.2, - 'E': -7.1, - 'D': -2.9, - 'F': 19.2, - 'I': 6.6, - 'H': -2.1, - 'M': 5.6, - 'N': -5.7, - 'Q': -0.3, - 'P': 5.1, - 'S': -4.1, - 'pS':-6.5, - 'R': -3.6, - 'T': 0.8, - 'pT':-1.6, - 'W': 16.3, - 'V': 3.5, - 'Y': 5.9, - 'pY': 3.5, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients determined in Browne, C. A.; -Bennett, H. P. J.; Solomon, S. The isolation of peptides by -high-performance liquid chromatography using predicted elution -positions. Analytical Biochemistry, 1982, 124 (1), 201-208. - -Conditions: Waters mjuBondapak C18 column, gradient (A = 0.1% aq. TFA, -B = 0.1% TFA in acetonitrile) at 0.33% B/min, flow rate 1.5 ml/min. -""" - -RCs_browne_hfba = {'aa':{'K': -2.5, - 'G': -2.3, - 'L': 15.0, - 'A': 3.9, - 'C':-14.3, - 'E': -7.5, - 'D': -2.8, - 'F': 14.7, - 'I': 11.0, - 'H': 2.0, - 'M': 4.1, - 'N': -2.8, - 'Q': 1.8, - 'P': 5.6, - 'S': -3.5, - 'pS':-7.6, - 'R': 3.2, - 'T': 1.1, - 'pT':-3.0, - 'W': 17.8, - 'V': 2.1, - 'Y': 3.8, - 'pY':-0.3, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients determined in Browne, C. A.; -Bennett, H. P. J.; Solomon, S. The isolation of peptides by -high-performance liquid chromatography using predicted elution -positions. Analytical Biochemistry, 1982, 124 (1), 201-208. - -Conditions: Waters mjuBondapak C18 column, gradient (A = 0.13% aq. HFBA, -B = 0.13% HFBA in acetonitrile) at 0.33% B/min, flow rate 1.5 ml/min. -""" - -RCs_palmblad = {'aa':{'K': -0.66, - 'G': -0.29, - 'L': 2.28, - 'A': 0.41, - 'C': -1.32, - 'E': -0.26, - 'D': 0.04, - 'F': 2.68, - 'I': 2.70, - 'H': 0.57, - 'M': 0.98, - 'N': -0.54, - 'Q': 1.02, - 'P': 0.97, - 'S': -0.71, - 'R': -0.76, - 'T': 0.37, - 'W': 4.68, - 'V': 2.44, - 'Y': 2.78, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients determined in Palmblad, M.; -Ramstrom, M.; Markides, K. E.; Hakansson, P.; Bergquist, J. Prediction -of Chromatographic Retention and Protein Identification in Liquid -Chromatography/Mass Spectrometry. Analytical Chemistry, 2002, 74 (22), -5826-5830. - -Conditions: a fused silica column (80-100 x 0.200 mm I.D.) packed -in-house with C18 ODS-AQ; solvent A = 0.5% aq. HAc, B = 0.5% HAc in -acetonitrile. -""" - -RCs_yoshida = {'aa':{'K': 2.77, - 'G': -0.16, - 'L': -2.31, - 'A': 0.28, - 'C': 0.80, - 'camC': 0.80, - 'E': 1.58, - 'D': 2.45, - 'F': -2.94, - 'I': -1.34, - 'H': 3.44, - 'M': -0.14, - 'N': 3.25, - 'Q': 2.35, - 'P': 0.77, - 'S': 2.53, - 'R': 3.90, - 'T': 1.73, - 'W': -1.80, - 'V': -2.19, - 'Y': -0.11, - 'H-': 0.0, - '-OH':0.0}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients determined in Yoshida, -T. 
Calculation of peptide retention coefficients in normal-phase -liquid chromatography. Journal of Chromatography A, 1998, 808 (1-2), -105-112. - -.. note:: Cysteine is Carboxymethylated. - -Conditions: TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = -0.1% TFA in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at -0.6% water/min, flow rate 1.0 ml/min, 40 centigrades. -""" - -RCs_yoshida_lc = {'aa': {'A': 1.29, - 'C': 0.94, - 'camC': 0.94, - 'D': 3.89, - 'E': 4.40, - 'F': -4.18, - 'G': 1.29, - 'H': 7.57, - 'I': -2.65, - 'K': 7.33, - 'L': -3.93, - 'M': -1.48, - 'N': 6.65, - 'P': 1.03, - 'Q': 6.68, - 'R': 7.08, - 'S': 5.09, - 'T': 3.46, - 'V': -2.52, - 'W': -1.87, - 'Y': -0.46, - 'H-': 0.0, - '-OH': 0.0}, - 'const': 0.0, - 'lcp': -0.2} -"""A set of retention coefficients from the length-corrected model -of normal-phase peptide chromatography. The dataset comes from Yoshida, T. -Calculation of peptide retention coefficients in normal-phase -liquid chromatography. Journal of Chromatography A, 1998, 808 (1-2), -105-112. The RCs were calculated in Moskovets, E.; Goloborodko A. A.; -Gorshkov A. V.; Gorshkov M.V. Limitation of predictive 2-D liquid chromatography -in reducing the database search space in shotgun proteomics: In silico studies. -Journal of Separation Science, 2012, 35 (14), 1771-1778. - -.. note:: Cysteine is Carboxymethylated. - -Conditions: TSK gel Amide-80 column (250 x 4.6 mm I.D.), gradient (A = -0.1% TFA in ACN-water (90:10); B = 0.1% TFA in ACN-water (55:45)) at -0.6% water/min, flow rate 1.0 ml/min, 40 centigrades. -""" - -RCs_zubarev = {'aa': {'A': 6.73, - 'E': 5.66, - 'C': 3.25, - 'D': 5.64, - 'G': 2.35, - 'F': 27.43, - 'I': 20.50, - 'H': -0.66, - 'K': -4.47, - 'M': 17.39, - 'L': 23.38, - 'N': 2.57, - 'Q': 2.93, - 'P': 5.66, - 'S': 3.58, - 'R': -2.55, - 'T': 4.88, - 'Y': 13.22, - 'W': 31.27, - 'V': 13.05, - 'camC': 3.25, - 'C': 3.25, - 'oxM': -7.61, - '-OH': 0.0, - 'H-': 0.0}, - 'const': 0.53, - 'lcp': -0.21} -"""A set of retention coefficients from the length-corrected model -of reversed-phase peptide chromatography. The dataset was taken from -Goloborodko A. A.; Mayerhofer C.; Zubarev A. R.; Tarasova I. A.; Gorshkov A. V.; -Zubarev, R. A.; Gorshkov, M. V. Empirical approach to false discovery rate -estimation in shotgun proteomics. Rapid communications in mass spectrometry, -2010, 24(4), 454-62. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: Reprosil-Pur C18-AQ column (150 x 0.075 mm I.D.), gradient (A = -0.5% AA in water; B = 0.5% AA in ACN-water (90:10)) at -0.5% water/min, flow rate 200.0 nl/min, room temperature. -""" - -RCs_gilar_atlantis_ph3_0 = {'aa': {'K': 15.90, - 'R': 13.64, - 'H': 12.94, - 'E': 2.97, - 'P': 4.77, - 'Q': 5.43, - 'D': 3.20, - 'C*': 4.87, - 'C': 4.87, - 'N': 3.91, - 'A': 3.34, - 'G': 3.33, - 'S': 3.04, - 'T': 2.71, - 'V': 1.75, - 'I': 0.65, - 'M': 1.13, - 'L': 0.13, - 'F': -1.17, - 'Y': -0.22, - 'W': -2.47}, - 'lcp': 0.0, - 'const': 21.33} -"""A set of retention coefficients for normal phase chromatography obtained in -Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in -hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), -8890-6. - -.. note:: Cysteine is Carbamidomethylated. 
- -Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A, -gradient (A = water, B = ACN, C = 200 mM ammonium formate): -0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C -at 0.2 ml/min, temperature 40 C, pH 3.0""" - -RCs_gilar_atlantis_ph4_5 = {'aa': {'K': 15.49, - 'R': 13.33, - 'H': 12.19, - 'E': 6.93, - 'P': 5.89, - 'Q': 5.68, - 'D': 5.31, - 'C*': 5.23, - 'C': 5.23, - 'N': 4.07, - 'A': 3.6, - 'G': 3.46, - 'S': 2.62, - 'T': 2.33, - 'V': 1.42, - 'I': 0.84, - 'M': 0.34, - 'L': 0.29, - 'F': -1.21, - 'Y': -1.62, - 'W': -2.08}, - 'lcp': 0.0, - 'const': 23.95} -"""A set of retention coefficients for normal phase chromatography obtained in -Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in -hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), -8890-6. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A, -gradient (A = water, B = ACN, C = 200 mM ammonium formate): -0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C -at 0.2 ml/min, temperature 40 C, pH 4.5""" - -RCs_gilar_atlantis_ph10_0 = {'aa': {'K': 25.23, - 'R': 23.38, - 'H': 5.94, - 'E': 0.59, - 'P': 4.00, - 'Q': 3.53, - 'D': -0.84, - 'C*': 3.52, - 'C': 3.52, - 'N': 3.26, - 'A': 3.64, - 'G': 3.02, - 'S': 2.28, - 'T': 1.74, - 'V': 1.05, - 'I': 1.51, - 'M': -0.61, - 'L': 0.25, - 'F': -0.17, - 'Y': -0.79, - 'W': 0.23}, - 'lcp': 0.0, - 'const': 13.78} -"""A set of retention coefficients for normal phase chromatography obtained in -Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in -hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), -8890-6. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: Atlantis HILIC silica column (150 x 2.1 mm I.D.), 3 um, 100 A, -gradient (A = water, B = ACN, C = 200 mM ammonium formate): -0 min, 5% A, 90% B, 5% C; 62.5 min, 55% A, 40% B, 5% C -at 0.2 ml/min, temperature 40 C, pH 10.0""" - -RCs_gilar_beh = {'aa': {'K': 9.49, - 'R': 8.56, - 'H': 8.40, - 'E': 5.95, - 'P': 4.73, - 'Q': 4.65, - 'D': 4.97, - 'C': 3.47, - 'C*': 3.47, - 'N': 3.50, - 'A': 2.90, - 'G': 2.63, - 'S': 2.14, - 'T': 2.19, - 'V': 1.71, - 'I': 1.30, - 'M': 1.40, - 'L': 0.73, - 'F': -0.09, - 'Y': -0.40, - 'W': 0.11}, - 'lcp': 0.0, - 'const': 18.41} -"""A set of retention coefficients for normal phase chromatography obtained in -Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in -hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), -8890-6. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: ACQUITY UPLC BEH HILIC column (150 x 2.1 mm I.D.), 1.7 um, 130 A, -Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by -titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: -90% ACN, 10% mobile phase A (v:v). -Gradient: 90-60% B in 50 min.""" - -RCs_gilar_beh_amide = {'aa': {'K': 7.19, - 'R': 6.68, - 'H': 6.16, - 'E': 6.11, - 'P': 3.18, - 'Q': 5.19, - 'D': 6.02, - 'C*': 3.71, - 'C': 3.71, - 'N': 4.16, - 'A': 2.64, - 'G': 3.12, - 'S': 3.17, - 'T': 3.41, - 'V': 0.83, - 'I': -0.69, - 'M': -0.12, - 'L': -1.24, - 'F': -1.93, - 'Y': 0.46, - 'W': -2.11}, - 'lcp': 0.0, - 'const': 24.26} -"""A set of retention coefficients for normal phase chromatography obtained in -Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in -hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), -8890-6. - -.. note:: Cysteine is Carbamidomethylated. 
- -Conditions: ACQUITY UPLC BEH glycan column (150 x 2.1 mm I.D.), 1.7 um, 130 A, -Mobile phase A: 10 mM ammonium formate buffer, pH 4.5 prepared by -titrating 10 mM solution of FA with ammonium hydroxide. Mobile phase B: -90% ACN, 10% mobile phase A (v:v). -Gradient: 90-60% B in 50 min.""" - -RCs_gilar_rp = {'aa': {'K': -1.015, - 'R': -0.681, - 'H': -1.937, - 'E': 1.475, - 'P': 3.496, - 'Q': 1.228, - 'D': 1.326, - 'C': 1.832, - 'C*': 1.832, - 'N': 0.299, - 'A': 2.322, - 'G': 1.172, - 'S': 1.165, - 'T': 1.894, - 'V': 5.695, - 'I': 8.343, - 'M': 5.128, - 'L': 9.069, - 'F': 10.877, - 'Y': 5.603, - 'W': 12.183}, - 'lcp': 0.0, - 'const': -3.696} -"""A set of retention coefficients for reversed-phase chromatography obtained in -Gilar, M., & Jaworski, A. (2011). Retention behavior of peptides in -hydrophilic-interaction chromatography. Journal of chromatography A, 1218(49), -8890-6. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: ACQUITY UPLC BEH C18 column (100 mm x 2.1 mm I.D.), 1.7 um, 130 A. -Mobile phase A: 0.02% TFA in water, mobile phase B: 0.018% TFA in ACN. -Gradient: 0 to 50% B in 50 min, flow rate 0.2 ml/min, temperature 40 C, pH 2.6. -""" - -RCs_krokhin_100A_fa = {'aa':{'K': -5.08, - 'G': -0.07, - 'L': 9.89, - 'A': 1.63, - 'C': 0.7, - 'camC': 0.7, - 'E': 1.75, - 'D': 0.95, - 'F': 11.92, - 'I': 9.06, - 'H': -5.05, - 'M': 6.96, - 'N': -0.59, - 'Q': 0.2, - 'P': 1.98, - 'S': 0.27, - 'R': -3.55, - 'T': 1.37, - 'W': 13.67, - 'V': 5.72, - 'Y': 5.97}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients from R.C. Dwivedi, V. Spicer, -M. Harder, M. Antonovici, W. Ens, K.G. Standing, J.A. Wilkins, and O.V. Krokhin; -Analytical Chemistry 2008, 80 (18), 7036-7042. -Practical Implementation of 2D HPLC Scheme with Accurate Peptide -Retention Prediction in Both Dimensions for High-Throughput Bottom-Up Proteomics. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% FA), packed with -5-um Luna C18(2) (Phenomenex, Torrance, CA), pore size 100 A, pH=2.0. -Both eluents A (2% ACN in water) and B (98% ACN) contained -0.1% FA as ion-pairing modifier. 0.33% ACN/min -linear gradient (0-30% B). -""" - -RCs_krokhin_100A_tfa = {'aa':{'K': -3.53, - 'G': -0.35, - 'L': 9.44, - 'A': 1.11, - 'C': 0.04, - 'camC': 0.04, - 'E': 1.08, - 'D': -0.22, - 'F': 11.34, - 'I': 7.86, - 'H': -3.04, - 'M': 6.57, - 'N': -1.44, - 'Q': -0.53, - 'P': 1.62, - 'S': -0.33, - 'R': -2.58, - 'T': 0.48, - 'W': 13.12, - 'V': 4.86, - 'Y': 5.4}, - 'lcp': 0.0, - 'const': 0.0} -"""A set of retention coefficients from R.C. Dwivedi, V. Spicer, -M. Harder, M. Antonovici, W. Ens, K.G. Standing, J.A. Wilkins, and O.V. Krokhin; -Analytical Chemistry 2008, 80 (18), 7036-7042. -Practical Implementation of 2D HPLC Scheme with Accurate Peptide -Retention Prediction in Both Dimensions for High-Throughput Bottom-Up Proteomics. - -.. note:: Cysteine is Carbamidomethylated. - -Conditions: 300 um x 150 mm PepMap100 (Dionex, 0.1% TFA), packed with -5-um Luna C18(2) (Phenomenex, Torrance, CA), pore size 100 A, pH=2.0. -Both eluents A (2% ACN in water) and B (98% ACN) contained -0.1% TFA as ion-pairing modifier. 0.33% ACN/min -linear gradient (0-30% B). 
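All of the coefficient sets above share one layout: an 'aa' mapping of per-residue retention coefficients (plus modified-residue keys such as 'camC' and, where available, terminal-group keys 'H-' and '-OH'), a 'const' offset, and a length-correction parameter 'lcp'. A minimal sketch of how such a set is consumed, assuming an installed pyteomics and using its public achrom.calculate_RT helper, which evaluates the additive model with the logarithmic length correction; the peptide here is a made-up example:

from pyteomics import achrom

peptide = 'PEPTIDE'  # hypothetical sequence in standard one-letter code

# Predict retention of the same peptide under two of the sets above:
# Zubarev et al. (reversed phase) and Yoshida (normal phase).
rt_rp = achrom.calculate_RT(peptide, achrom.RCs_zubarev)
rt_np = achrom.calculate_RT(peptide, achrom.RCs_yoshida_lc)
print(rt_rp, rt_np)

In the Gilar and Krokhin sets 'lcp' is 0.0, so the length-correction factor drops out and the prediction reduces to the plain sum of residue coefficients plus 'const'.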
-""" - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/pyteomics/auxiliary/__init__.py b/pyteomics/auxiliary/__init__.py deleted file mode 100644 index 79e3e5cde833c70404a051db917cad52c83abc89..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -try: - basestring = basestring -except NameError: - basestring = (str, bytes) - -from .structures import ( - PyteomicsError, Charge, ChargeList, - _parse_charge, _parse_ion, BasicComposition, - unitfloat, unitint, unitstr, cvstr, - cvquery) - -from .constants import _nist_mass - -from .file_helpers import ( - _file_obj, _keepstate, _keepstate_method, IteratorContextManager, - FileReader, IndexedTextReader, IndexedReaderMixin, TimeOrderedIndexedReaderMixin, - IndexSavingMixin, OffsetIndex, HierarchicalOffsetIndex, IndexSavingTextReader, - _file_reader, _file_writer, - _make_chain, _check_use_index, FileReadingProcess, TaskMappingMixin, - serializer, ChainBase, TableJoiner) - -from .math import ( - linear_regression, linear_regression_perpendicular, - linear_regression_vertical) - -from .target_decoy import ( - _calculate_qvalues, _qvalues_df, _decoy_or_pep_label, - _construct_dtype, _make_qvalues, _make_filter, - _itercontext, _iter, qvalues, filter, log_factorial, - _expectation, _confidence_value, _log_pi_r, - _log_pi, _make_fdr, fdr, sigma_T, sigma_fdr) - -from .utils import ( - print_tree, memoize, BinaryDataArrayTransformer, ArrayConversionMixin, BinaryArrayConversionMixin, - MaskedArrayConversionMixin, _decode_base64_data_array) diff --git a/pyteomics/auxiliary/constants.py b/pyteomics/auxiliary/constants.py deleted file mode 100644 index 7dc76648f88ffadeaf6e61e8b655f99800e193df..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/constants.py +++ /dev/null @@ -1,3297 +0,0 @@ -### Bulky constants for other modules are defined below. 
- -_nist_mass = {'Ac': {0: (227, 1.0), - 206: (206.0145, 0.0), - 207: (207.01195, 0.0), - 208: (208.01155, 0.0), - 209: (209.00949, 0.0), - 210: (210.00944, 0.0), - 211: (211.00773, 0.0), - 212: (212.00781, 0.0), - 213: (213.00661, 0.0), - 214: (214.006902, 0.0), - 215: (215.006454, 0.0), - 216: (216.00872, 0.0), - 217: (217.009347, 0.0), - 218: (218.01164, 0.0), - 219: (219.01242, 0.0), - 220: (220.014763, 0.0), - 221: (221.01559, 0.0), - 222: (222.017844, 0.0), - 223: (223.019137, 0.0), - 224: (224.021723, 0.0), - 225: (225.02323, 0.0), - 226: (226.026098, 0.0), - 227: (227.0277521, 0.0), - 228: (228.0310211, 0.0), - 229: (229.03302, 0.0), - 230: (230.03629, 0.0), - 231: (231.03856, 0.0), - 232: (232.04203, 0.0), - 233: (233.04455, 0.0), - 234: (234.04842, 0.0), - 235: (235.05123, 0.0), - 236: (236.0553, 0.0)}, - 'Ag': {0: (106.905097, 1.0), - 93: (92.94978, 0.0), - 94: (93.94278, 0.0), - 95: (94.93548, 0.0), - 96: (95.93068, 0.0), - 97: (96.92397, 0.0), - 98: (97.92157, 0.0), - 99: (98.9176, 0.0), - 100: (99.9161, 0.0), - 101: (100.9128, 0.0), - 102: (101.91169, 0.0), - 103: (102.908973, 0.0), - 104: (103.908629, 0.0), - 105: (104.906529, 0.0), - 106: (105.906669, 0.0), - 107: (106.905097, 0.51839), - 108: (107.905956, 0.0), - 109: (108.904752, 0.48161), - 110: (109.906107, 0.0), - 111: (110.905291, 0.0), - 112: (111.907005, 0.0), - 113: (112.906567, 0.0), - 114: (113.908804, 0.0), - 115: (114.90876, 0.0), - 116: (115.91136, 0.0), - 117: (116.91168, 0.0), - 118: (117.91458, 0.0), - 119: (118.91567, 0.0), - 120: (119.91879, 0.0), - 121: (120.91985, 0.0), - 122: (121.92353, 0.0), - 123: (122.9249, 0.0), - 124: (123.92864, 0.0), - 125: (124.93043, 0.0), - 126: (125.9345, 0.0), - 127: (126.93677, 0.0), - 128: (127.94117, 0.0), - 129: (128.94369, 0.0), - 130: (129.95045, 0.0)}, - 'Al': {0: (26.98153863, 1.0), - 21: (21.02804, 0.0), - 22: (22.01952, 0.0), - 23: (23.007267, 0.0), - 24: (23.9999389, 0.0), - 25: (24.9904281, 0.0), - 26: (25.98689169, 0.0), - 27: (26.98153863, 1.0), - 28: (27.98191031, 0.0), - 29: (28.980445, 0.0), - 30: (29.98296, 0.0), - 31: (30.983947, 0.0), - 32: (31.98812, 0.0), - 33: (32.99084, 0.0), - 34: (33.99685, 0.0), - 35: (34.99986, 0.0), - 36: (36.00621, 0.0), - 37: (37.01068, 0.0), - 38: (38.01723, 0.0), - 39: (39.02297, 0.0), - 40: (40.03145, 0.0), - 41: (41.03833, 0.0), - 42: (42.04689, 0.0)}, - 'Am': {0: (243, 1.0), - 231: (231.04556, 0.0), - 232: (232.04659, 0.0), - 233: (233.04635, 0.0), - 234: (234.04781, 0.0), - 235: (235.04795, 0.0), - 236: (236.04958, 0.0), - 237: (237.05, 0.0), - 238: (238.05198, 0.0), - 239: (239.0530245, 0.0), - 240: (240.0553, 0.0), - 241: (241.0568291, 0.0), - 242: (242.0595492, 0.0), - 243: (243.0613811, 0.0), - 244: (244.0642848, 0.0), - 245: (245.066452, 0.0), - 246: (246.069775, 0.0), - 247: (247.07209, 0.0), - 248: (248.07575, 0.0), - 249: (249.07848, 0.0)}, - 'Ar': {0: (39.9623831225, 1.0), - 30: (30.02156, 0.0), - 31: (31.01212, 0.0), - 32: (31.997638, 0.0), - 33: (32.9899257, 0.0), - 34: (33.9802712, 0.0), - 35: (34.9752576, 0.0), - 36: (35.967545106, 0.003365), - 37: (36.96677632, 0.0), - 38: (37.9627324, 0.000632), - 39: (38.964313, 0.0), - 40: (39.9623831225, 0.996003), - 41: (40.9645006, 0.0), - 42: (41.963046, 0.0), - 43: (42.965636, 0.0), - 44: (43.964924, 0.0), - 45: (44.96804, 0.0), - 46: (45.96809, 0.0), - 47: (46.97219, 0.0), - 48: (47.97454, 0.0), - 49: (48.98052, 0.0), - 50: (49.98443, 0.0), - 51: (50.99163, 0.0), - 52: (51.99678, 0.0), - 53: (53.00494, 0.0)}, - 'As': {0: (74.9215965, 1.0), - 60: (59.99313, 0.0), - 
61: (60.98062, 0.0), - 62: (61.9732, 0.0), - 63: (62.96369, 0.0), - 64: (63.95757, 0.0), - 65: (64.94956, 0.0), - 66: (65.94471, 0.0), - 67: (66.93919, 0.0), - 68: (67.93677, 0.0), - 69: (68.93227, 0.0), - 70: (69.93092, 0.0), - 71: (70.927112, 0.0), - 72: (71.926752, 0.0), - 73: (72.923825, 0.0), - 74: (73.9239287, 0.0), - 75: (74.9215965, 1.0), - 76: (75.922394, 0.0), - 77: (76.9206473, 0.0), - 78: (77.921827, 0.0), - 79: (78.920948, 0.0), - 80: (79.922534, 0.0), - 81: (80.922132, 0.0), - 82: (81.9245, 0.0), - 83: (82.92498, 0.0), - 84: (83.92906, 0.0), - 85: (84.93202, 0.0), - 86: (85.9365, 0.0), - 87: (86.9399, 0.0), - 88: (87.94494, 0.0), - 89: (88.94939, 0.0), - 90: (89.9555, 0.0), - 91: (90.96043, 0.0), - 92: (91.9668, 0.0)}, - 'At': {0: (210, 1.0), - 193: (192.99984, 0.0), - 194: (193.99873, 0.0), - 195: (194.996268, 0.0), - 196: (195.99579, 0.0), - 197: (196.99319, 0.0), - 198: (197.99284, 0.0), - 199: (198.99053, 0.0), - 200: (199.990351, 0.0), - 201: (200.988417, 0.0), - 202: (201.98863, 0.0), - 203: (202.986942, 0.0), - 204: (203.987251, 0.0), - 205: (204.986074, 0.0), - 206: (205.986667, 0.0), - 207: (206.985784, 0.0), - 208: (207.98659, 0.0), - 209: (208.986173, 0.0), - 210: (209.987148, 0.0), - 211: (210.9874963, 0.0), - 212: (211.990745, 0.0), - 213: (212.992937, 0.0), - 214: (213.996372, 0.0), - 215: (214.998653, 0.0), - 216: (216.002423, 0.0), - 217: (217.004719, 0.0), - 218: (218.008694, 0.0), - 219: (219.011162, 0.0), - 220: (220.01541, 0.0), - 221: (221.01805, 0.0), - 222: (222.02233, 0.0), - 223: (223.02519, 0.0)}, - 'Au': {0: (196.9665687, 1.0), - 169: (168.99808, 0.0), - 170: (169.99612, 0.0), - 171: (170.991879, 0.0), - 172: (171.99004, 0.0), - 173: (172.986237, 0.0), - 174: (173.98476, 0.0), - 175: (174.98127, 0.0), - 176: (175.9801, 0.0), - 177: (176.976865, 0.0), - 178: (177.97603, 0.0), - 179: (178.973213, 0.0), - 180: (179.972521, 0.0), - 181: (180.970079, 0.0), - 182: (181.969618, 0.0), - 183: (182.967593, 0.0), - 184: (183.967452, 0.0), - 185: (184.965789, 0.0), - 186: (185.965953, 0.0), - 187: (186.964568, 0.0), - 188: (187.965324, 0.0), - 189: (188.963948, 0.0), - 190: (189.9647, 0.0), - 191: (190.9637, 0.0), - 192: (191.964813, 0.0), - 193: (192.96415, 0.0), - 194: (193.965365, 0.0), - 195: (194.9650346, 0.0), - 196: (195.96657, 0.0), - 197: (196.9665687, 1.0), - 198: (197.9682423, 0.0), - 199: (198.9687652, 0.0), - 200: (199.97073, 0.0), - 201: (200.971657, 0.0), - 202: (201.97381, 0.0), - 203: (202.975155, 0.0), - 204: (203.97772, 0.0), - 205: (204.97987, 0.0)}, - 'B': {0: (11.0093054, 1.0), - 6: (6.04681, 0.0), - 7: (7.02992, 0.0), - 8: (8.0246072, 0.0), - 9: (9.0133288, 0.0), - 10: (10.012937, 0.199), - 11: (11.0093054, 0.801), - 12: (12.0143521, 0.0), - 13: (13.0177802, 0.0), - 14: (14.025404, 0.0), - 15: (15.031103, 0.0), - 16: (16.03981, 0.0), - 17: (17.04699, 0.0), - 18: (18.05617, 0.0), - 19: (19.06373, 0.0)}, - 'Ba': {0: (137.9052472, 1.0), - 114: (113.95068, 0.0), - 115: (114.94737, 0.0), - 116: (115.94138, 0.0), - 117: (116.9385, 0.0), - 118: (117.93304, 0.0), - 119: (118.93066, 0.0), - 120: (119.92604, 0.0), - 121: (120.92405, 0.0), - 122: (121.9199, 0.0), - 123: (122.918781, 0.0), - 124: (123.915094, 0.0), - 125: (124.914473, 0.0), - 126: (125.91125, 0.0), - 127: (126.911094, 0.0), - 128: (127.908318, 0.0), - 129: (128.908679, 0.0), - 130: (129.9063208, 0.00106), - 131: (130.906941, 0.0), - 132: (131.9050613, 0.00101), - 133: (132.9060075, 0.0), - 134: (133.9045084, 0.02417), - 135: (134.9056886, 0.06592), - 136: (135.9045759, 0.07854), - 
137: (136.9058274, 0.11232), - 138: (137.9052472, 0.71698), - 139: (138.9088413, 0.0), - 140: (139.910605, 0.0), - 141: (140.914411, 0.0), - 142: (141.916453, 0.0), - 143: (142.920627, 0.0), - 144: (143.922953, 0.0), - 145: (144.92763, 0.0), - 146: (145.93022, 0.0), - 147: (146.93495, 0.0), - 148: (147.93772, 0.0), - 149: (148.94258, 0.0), - 150: (149.94568, 0.0), - 151: (150.95081, 0.0), - 152: (151.95427, 0.0), - 153: (152.95961, 0.0)}, - 'Be': {0: (9.0121822, 1.0), - 5: (5.04079, 0.0), - 6: (6.019726, 0.0), - 7: (7.01692983, 0.0), - 8: (8.0053051, 0.0), - 9: (9.0121822, 1.0), - 10: (10.0135338, 0.0), - 11: (11.021658, 0.0), - 12: (12.026921, 0.0), - 13: (13.03569, 0.0), - 14: (14.04289, 0.0), - 15: (15.05346, 0.0), - 16: (16.06192, 0.0)}, - 'Bh': {0: (272, 1.0), - 260: (260.12197, 0.0), - 261: (261.12166, 0.0), - 262: (262.12289, 0.0), - 263: (263.12304, 0.0), - 264: (264.1246, 0.0), - 265: (265.12515, 0.0), - 266: (266.12694, 0.0), - 267: (267.12765, 0.0), - 268: (268.12976, 0.0), - 269: (269.13069, 0.0), - 270: (270.13362, 0.0), - 271: (271.13518, 0.0), - 272: (272.13803, 0.0), - 273: (273.13962, 0.0), - 274: (274.14244, 0.0), - 275: (275.14425, 0.0)}, - 'Bi': {0: (208.9803987, 1.0), - 184: (184.00112, 0.0), - 185: (184.99763, 0.0), - 186: (185.9966, 0.0), - 187: (186.993158, 0.0), - 188: (187.99227, 0.0), - 189: (188.9892, 0.0), - 190: (189.9883, 0.0), - 191: (190.985786, 0.0), - 192: (191.98546, 0.0), - 193: (192.98296, 0.0), - 194: (193.98283, 0.0), - 195: (194.980651, 0.0), - 196: (195.980667, 0.0), - 197: (196.978864, 0.0), - 198: (197.97921, 0.0), - 199: (198.977672, 0.0), - 200: (199.978132, 0.0), - 201: (200.977009, 0.0), - 202: (201.977742, 0.0), - 203: (202.976876, 0.0), - 204: (203.977813, 0.0), - 205: (204.977389, 0.0), - 206: (205.978499, 0.0), - 207: (206.9784707, 0.0), - 208: (207.9797422, 0.0), - 209: (208.9803987, 1.0), - 210: (209.9841204, 0.0), - 211: (210.987269, 0.0), - 212: (211.9912857, 0.0), - 213: (212.994385, 0.0), - 214: (213.998712, 0.0), - 215: (215.00177, 0.0), - 216: (216.006306, 0.0), - 217: (217.00947, 0.0), - 218: (218.01432, 0.0)}, - 'Bk': {0: (247, 1.0), - 235: (235.05658, 0.0), - 236: (236.05733, 0.0), - 237: (237.057, 0.0), - 238: (238.05828, 0.0), - 239: (239.05828, 0.0), - 240: (240.05976, 0.0), - 241: (241.06023, 0.0), - 242: (242.06198, 0.0), - 243: (243.063008, 0.0), - 244: (244.065181, 0.0), - 245: (245.0663616, 0.0), - 246: (246.06867, 0.0), - 247: (247.070307, 0.0), - 248: (248.07309, 0.0), - 249: (249.0749867, 0.0), - 250: (250.078317, 0.0), - 251: (251.08076, 0.0), - 252: (252.08431, 0.0), - 253: (253.08688, 0.0), - 254: (254.0906, 0.0)}, - 'Br': {0: (78.9183371, 1.0), - 67: (66.96479, 0.0), - 68: (67.95852, 0.0), - 69: (68.95011, 0.0), - 70: (69.94479, 0.0), - 71: (70.93874, 0.0), - 72: (71.93664, 0.0), - 73: (72.93169, 0.0), - 74: (73.929891, 0.0), - 75: (74.925776, 0.0), - 76: (75.924541, 0.0), - 77: (76.921379, 0.0), - 78: (77.921146, 0.0), - 79: (78.9183371, 0.5069), - 80: (79.9185293, 0.0), - 81: (80.9162906, 0.4931), - 82: (81.9168041, 0.0), - 83: (82.91518, 0.0), - 84: (83.916479, 0.0), - 85: (84.915608, 0.0), - 86: (85.918798, 0.0), - 87: (86.920711, 0.0), - 88: (87.92407, 0.0), - 89: (88.92639, 0.0), - 90: (89.93063, 0.0), - 91: (90.93397, 0.0), - 92: (91.93926, 0.0), - 93: (92.94305, 0.0), - 94: (93.94868, 0.0), - 95: (94.95287, 0.0), - 96: (95.95853, 0.0), - 97: (96.9628, 0.0)}, - 'C': {0: (12.0, 1.0), - 8: (8.037675, 0.0), - 9: (9.0310367, 0.0), - 10: (10.0168532, 0.0), - 11: (11.0114336, 0.0), - 12: (12.0, 0.9893), - 13: 
(13.0033548378, 0.0107), - 14: (14.003241989, 0.0), - 15: (15.0105993, 0.0), - 16: (16.014701, 0.0), - 17: (17.022586, 0.0), - 18: (18.02676, 0.0), - 19: (19.03481, 0.0), - 20: (20.04032, 0.0), - 21: (21.04934, 0.0), - 22: (22.0572, 0.0)}, - 'Ca': {0: (39.96259098, 1.0), - 34: (34.01412, 0.0), - 35: (35.00494, 0.0), - 36: (35.99309, 0.0), - 37: (36.98587, 0.0), - 38: (37.976318, 0.0), - 39: (38.9707197, 0.0), - 40: (39.96259098, 0.96941), - 41: (40.96227806, 0.0), - 42: (41.95861801, 0.00647), - 43: (42.9587666, 0.00135), - 44: (43.9554818, 0.02086), - 45: (44.9561866, 0.0), - 46: (45.9536926, 4e-05), - 47: (46.954546, 0.0), - 48: (47.952534, 0.00187), - 49: (48.955674, 0.0), - 50: (49.957519, 0.0), - 51: (50.9615, 0.0), - 52: (51.9651, 0.0), - 53: (52.97005, 0.0), - 54: (53.97435, 0.0), - 55: (54.98055, 0.0), - 56: (55.98557, 0.0), - 57: (56.99236, 0.0)}, - 'Cd': {0: (113.9033585, 1.0), - 95: (94.94987, 0.0), - 96: (95.93977, 0.0), - 97: (96.93494, 0.0), - 98: (97.9274, 0.0), - 99: (98.92501, 0.0), - 100: (99.92029, 0.0), - 101: (100.91868, 0.0), - 102: (101.91446, 0.0), - 103: (102.913419, 0.0), - 104: (103.909849, 0.0), - 105: (104.909468, 0.0), - 106: (105.906459, 0.0125), - 107: (106.906618, 0.0), - 108: (107.904184, 0.0089), - 109: (108.904982, 0.0), - 110: (109.9030021, 0.1249), - 111: (110.9041781, 0.128), - 112: (111.9027578, 0.2413), - 113: (112.9044017, 0.1222), - 114: (113.9033585, 0.2873), - 115: (114.905431, 0.0), - 116: (115.904756, 0.0749), - 117: (116.907219, 0.0), - 118: (117.906915, 0.0), - 119: (118.90992, 0.0), - 120: (119.90985, 0.0), - 121: (120.91298, 0.0), - 122: (121.91333, 0.0), - 123: (122.917, 0.0), - 124: (123.91765, 0.0), - 125: (124.92125, 0.0), - 126: (125.92235, 0.0), - 127: (126.92644, 0.0), - 128: (127.92776, 0.0), - 129: (128.93215, 0.0), - 130: (129.9339, 0.0), - 131: (130.94067, 0.0), - 132: (131.94555, 0.0)}, - 'Ce': {0: (139.9054387, 1.0), - 119: (118.95276, 0.0), - 120: (119.94664, 0.0), - 121: (120.94342, 0.0), - 122: (121.93791, 0.0), - 123: (122.9354, 0.0), - 124: (123.93041, 0.0), - 125: (124.92844, 0.0), - 126: (125.92397, 0.0), - 127: (126.92273, 0.0), - 128: (127.91891, 0.0), - 129: (128.9181, 0.0), - 130: (129.91474, 0.0), - 131: (130.91442, 0.0), - 132: (131.91146, 0.0), - 133: (132.911515, 0.0), - 134: (133.908925, 0.0), - 135: (134.909151, 0.0), - 136: (135.907172, 0.00185), - 137: (136.907806, 0.0), - 138: (137.905991, 0.00251), - 139: (138.906653, 0.0), - 140: (139.9054387, 0.8845), - 141: (140.9082763, 0.0), - 142: (141.909244, 0.11114), - 143: (142.912386, 0.0), - 144: (143.913647, 0.0), - 145: (144.91723, 0.0), - 146: (145.91876, 0.0), - 147: (146.92267, 0.0), - 148: (147.92443, 0.0), - 149: (148.9284, 0.0), - 150: (149.93041, 0.0), - 151: (150.93398, 0.0), - 152: (151.93654, 0.0), - 153: (152.94058, 0.0), - 154: (153.94342, 0.0), - 155: (154.94804, 0.0), - 156: (155.95126, 0.0), - 157: (156.95634, 0.0)}, - 'Cf': {0: (251, 1.0), - 237: (237.06207, 0.0), - 238: (238.06141, 0.0), - 239: (239.06242, 0.0), - 240: (240.0623, 0.0), - 241: (241.06373, 0.0), - 242: (242.0637, 0.0), - 243: (243.06543, 0.0), - 244: (244.066001, 0.0), - 245: (245.068049, 0.0), - 246: (246.0688053, 0.0), - 247: (247.071001, 0.0), - 248: (248.072185, 0.0), - 249: (249.0748535, 0.0), - 250: (250.0764061, 0.0), - 251: (251.079587, 0.0), - 252: (252.081626, 0.0), - 253: (253.085133, 0.0), - 254: (254.087323, 0.0), - 255: (255.09105, 0.0), - 256: (256.09344, 0.0)}, - 'Cl': {0: (34.96885268, 1.0), - 28: (28.02851, 0.0), - 29: (29.01411, 0.0), - 30: (30.00477, 0.0), - 
31: (30.99241, 0.0), - 32: (31.98569, 0.0), - 33: (32.9774519, 0.0), - 34: (33.97376282, 0.0), - 35: (34.96885268, 0.7576), - 36: (35.96830698, 0.0), - 37: (36.96590259, 0.2424), - 38: (37.96801043, 0.0), - 39: (38.9680082, 0.0), - 40: (39.97042, 0.0), - 41: (40.97068, 0.0), - 42: (41.97325, 0.0), - 43: (42.97405, 0.0), - 44: (43.97828, 0.0), - 45: (44.98029, 0.0), - 46: (45.98421, 0.0), - 47: (46.98871, 0.0), - 48: (47.99495, 0.0), - 49: (49.00032, 0.0), - 50: (50.00784, 0.0), - 51: (51.01449, 0.0)}, - 'Cm': {0: (247, 1.0), - 233: (233.05077, 0.0), - 234: (234.05016, 0.0), - 235: (235.05143, 0.0), - 236: (236.05141, 0.0), - 237: (237.0529, 0.0), - 238: (238.05303, 0.0), - 239: (239.05496, 0.0), - 240: (240.0555295, 0.0), - 241: (241.057653, 0.0), - 242: (242.0588358, 0.0), - 243: (243.0613891, 0.0), - 244: (244.0627526, 0.0), - 245: (245.0654912, 0.0), - 246: (246.0672237, 0.0), - 247: (247.070354, 0.0), - 248: (248.072349, 0.0), - 249: (249.075953, 0.0), - 250: (250.078357, 0.0), - 251: (251.082285, 0.0), - 252: (252.08487, 0.0)}, - 'Cn': {0: (285, 1.0), - 277: (277.16394, 0.0), - 278: (278.16431, 0.0), - 279: (279.16655, 0.0), - 280: (280.16704, 0.0), - 281: (281.16929, 0.0), - 282: (282.16977, 0.0), - 283: (283.17179, 0.0), - 284: (284.17238, 0.0), - 285: (285.17411, 0.0)}, - 'Co': {0: (58.933195, 1.0), - 47: (47.01149, 0.0), - 48: (48.00176, 0.0), - 49: (48.98972, 0.0), - 50: (49.98154, 0.0), - 51: (50.97072, 0.0), - 52: (51.96359, 0.0), - 53: (52.954219, 0.0), - 54: (53.9484596, 0.0), - 55: (54.941999, 0.0), - 56: (55.9398393, 0.0), - 57: (56.9362914, 0.0), - 58: (57.9357528, 0.0), - 59: (58.933195, 1.0), - 60: (59.9338171, 0.0), - 61: (60.9324758, 0.0), - 62: (61.934051, 0.0), - 63: (62.933612, 0.0), - 64: (63.93581, 0.0), - 65: (64.936478, 0.0), - 66: (65.93976, 0.0), - 67: (66.94089, 0.0), - 68: (67.94487, 0.0), - 69: (68.94632, 0.0), - 70: (69.951, 0.0), - 71: (70.9529, 0.0), - 72: (71.95781, 0.0), - 73: (72.96024, 0.0), - 74: (73.96538, 0.0), - 75: (74.96833, 0.0)}, - 'Cr': {0: (51.9405075, 1.0), - 42: (42.00643, 0.0), - 43: (42.99771, 0.0), - 44: (43.98555, 0.0), - 45: (44.97964, 0.0), - 46: (45.968359, 0.0), - 47: (46.9629, 0.0), - 48: (47.954032, 0.0), - 49: (48.9513357, 0.0), - 50: (49.9460442, 0.04345), - 51: (50.9447674, 0.0), - 52: (51.9405075, 0.83789), - 53: (52.9406494, 0.09501), - 54: (53.9388804, 0.02365), - 55: (54.9408397, 0.0), - 56: (55.9406531, 0.0), - 57: (56.943613, 0.0), - 58: (57.94435, 0.0), - 59: (58.94859, 0.0), - 60: (59.95008, 0.0), - 61: (60.95472, 0.0), - 62: (61.95661, 0.0), - 63: (62.96186, 0.0), - 64: (63.96441, 0.0), - 65: (64.97016, 0.0), - 66: (65.97338, 0.0), - 67: (66.97955, 0.0)}, - 'Cs': {0: (132.905451933, 1.0), - 112: (111.9503, 0.0), - 113: (112.94449, 0.0), - 114: (113.94145, 0.0), - 115: (114.93591, 0.0), - 116: (115.93337, 0.0), - 117: (116.92867, 0.0), - 118: (117.926559, 0.0), - 119: (118.922377, 0.0), - 120: (119.920677, 0.0), - 121: (120.917229, 0.0), - 122: (121.91611, 0.0), - 123: (122.912996, 0.0), - 124: (123.912258, 0.0), - 125: (124.909728, 0.0), - 126: (125.909452, 0.0), - 127: (126.907418, 0.0), - 128: (127.907749, 0.0), - 129: (128.906064, 0.0), - 130: (129.906709, 0.0), - 131: (130.905464, 0.0), - 132: (131.9064343, 0.0), - 133: (132.905451933, 1.0), - 134: (133.906718475, 0.0), - 135: (134.905977, 0.0), - 136: (135.9073116, 0.0), - 137: (136.9070895, 0.0), - 138: (137.911017, 0.0), - 139: (138.913364, 0.0), - 140: (139.917282, 0.0), - 141: (140.920046, 0.0), - 142: (141.924299, 0.0), - 143: (142.927352, 0.0), - 144: 
(143.932077, 0.0), - 145: (144.935526, 0.0), - 146: (145.94029, 0.0), - 147: (146.94416, 0.0), - 148: (147.94922, 0.0), - 149: (148.95293, 0.0), - 150: (149.95817, 0.0), - 151: (150.96219, 0.0)}, - 'Cu': {0: (62.9295975, 1.0), - 52: (51.99718, 0.0), - 53: (52.98555, 0.0), - 54: (53.97671, 0.0), - 55: (54.96605, 0.0), - 56: (55.95856, 0.0), - 57: (56.949211, 0.0), - 58: (57.9445385, 0.0), - 59: (58.939498, 0.0), - 60: (59.937365, 0.0), - 61: (60.9334578, 0.0), - 62: (61.932584, 0.0), - 63: (62.9295975, 0.6915), - 64: (63.9297642, 0.0), - 65: (64.9277895, 0.3085), - 66: (65.9288688, 0.0), - 67: (66.9277303, 0.0), - 68: (67.9296109, 0.0), - 69: (68.9294293, 0.0), - 70: (69.9323923, 0.0), - 71: (70.9326768, 0.0), - 72: (71.9358203, 0.0), - 73: (72.936675, 0.0), - 74: (73.939875, 0.0), - 75: (74.9419, 0.0), - 76: (75.945275, 0.0), - 77: (76.94785, 0.0), - 78: (77.95196, 0.0), - 79: (78.95456, 0.0), - 80: (79.96087, 0.0)}, - 'Db': {0: (268, 1.0), - 255: (255.1074, 0.0), - 256: (256.10813, 0.0), - 257: (257.10772, 0.0), - 258: (258.10923, 0.0), - 259: (259.10961, 0.0), - 260: (260.1113, 0.0), - 261: (261.11206, 0.0), - 262: (262.11408, 0.0), - 263: (263.11499, 0.0), - 264: (264.1174, 0.0), - 265: (265.1186, 0.0), - 266: (266.12103, 0.0), - 267: (267.12238, 0.0), - 268: (268.12545, 0.0), - 269: (269.12746, 0.0), - 270: (270.13071, 0.0)}, - 'Ds': {0: (281, 1.0), - 267: (267.14434, 0.0), - 268: (268.1438, 0.0), - 269: (269.14512, 0.0), - 270: (270.14472, 0.0), - 271: (271.14606, 0.0), - 272: (272.14632, 0.0), - 273: (273.14886, 0.0), - 274: (274.14949, 0.0), - 275: (275.15218, 0.0), - 276: (276.15303, 0.0), - 277: (277.15565, 0.0), - 278: (278.15647, 0.0), - 279: (279.15886, 0.0), - 280: (280.1598, 0.0), - 281: (281.16206, 0.0)}, - 'Dy': {0: (163.9291748, 1.0), - 138: (137.96249, 0.0), - 139: (138.95954, 0.0), - 140: (139.95401, 0.0), - 141: (140.95135, 0.0), - 142: (141.94637, 0.0), - 143: (142.94383, 0.0), - 144: (143.93925, 0.0), - 145: (144.93743, 0.0), - 146: (145.932845, 0.0), - 147: (146.931092, 0.0), - 148: (147.92715, 0.0), - 149: (148.927305, 0.0), - 150: (149.925585, 0.0), - 151: (150.926185, 0.0), - 152: (151.924718, 0.0), - 153: (152.925765, 0.0), - 154: (153.924424, 0.0), - 155: (154.925754, 0.0), - 156: (155.924283, 0.00056), - 157: (156.925466, 0.0), - 158: (157.924409, 0.00095), - 159: (158.9257392, 0.0), - 160: (159.9251975, 0.02329), - 161: (160.9269334, 0.18889), - 162: (161.9267984, 0.25475), - 163: (162.9287312, 0.24896), - 164: (163.9291748, 0.2826), - 165: (164.9317033, 0.0), - 166: (165.9328067, 0.0), - 167: (166.93566, 0.0), - 168: (167.93713, 0.0), - 169: (168.94031, 0.0), - 170: (169.94239, 0.0), - 171: (170.9462, 0.0), - 172: (171.94876, 0.0), - 173: (172.953, 0.0)}, - 'Er': {0: (165.9302931, 1.0), - 143: (142.96634, 0.0), - 144: (143.96038, 0.0), - 145: (144.95739, 0.0), - 146: (145.952, 0.0), - 147: (146.94949, 0.0), - 148: (147.94455, 0.0), - 149: (148.94231, 0.0), - 150: (149.937914, 0.0), - 151: (150.937449, 0.0), - 152: (151.93505, 0.0), - 153: (152.935063, 0.0), - 154: (153.932783, 0.0), - 155: (154.933209, 0.0), - 156: (155.931065, 0.0), - 157: (156.93192, 0.0), - 158: (157.929893, 0.0), - 159: (158.930684, 0.0), - 160: (159.929083, 0.0), - 161: (160.929995, 0.0), - 162: (161.928778, 0.00139), - 163: (162.930033, 0.0), - 164: (163.9292, 0.01601), - 165: (164.930726, 0.0), - 166: (165.9302931, 0.33503), - 167: (166.9320482, 0.22869), - 168: (167.9323702, 0.26978), - 169: (168.9345904, 0.0), - 170: (169.9354643, 0.1491), - 171: (170.9380298, 0.0), - 172: 
(171.939356, 0.0), - 173: (172.9424, 0.0), - 174: (173.94423, 0.0), - 175: (174.94777, 0.0), - 176: (175.95008, 0.0), - 177: (176.95405, 0.0)}, - 'Es': {0: (252, 1.0), - 240: (240.06892, 0.0), - 241: (241.06854, 0.0), - 242: (242.06975, 0.0), - 243: (243.06955, 0.0), - 244: (244.07088, 0.0), - 245: (245.07132, 0.0), - 246: (246.0729, 0.0), - 247: (247.07366, 0.0), - 248: (248.07547, 0.0), - 249: (249.07641, 0.0), - 250: (250.07861, 0.0), - 251: (251.079992, 0.0), - 252: (252.08298, 0.0), - 253: (253.0848247, 0.0), - 254: (254.088022, 0.0), - 255: (255.090273, 0.0), - 256: (256.0936, 0.0), - 257: (257.09598, 0.0), - 258: (258.09952, 0.0)}, - 'Eu': {0: (152.9212303, 1.0), - 130: (129.96357, 0.0), - 131: (130.95775, 0.0), - 132: (131.95437, 0.0), - 133: (132.94924, 0.0), - 134: (133.94651, 0.0), - 135: (134.94182, 0.0), - 136: (135.9396, 0.0), - 137: (136.93557, 0.0), - 138: (137.93371, 0.0), - 139: (138.929792, 0.0), - 140: (139.92809, 0.0), - 141: (140.924931, 0.0), - 142: (141.92343, 0.0), - 143: (142.920298, 0.0), - 144: (143.918817, 0.0), - 145: (144.916265, 0.0), - 146: (145.917206, 0.0), - 147: (146.916746, 0.0), - 148: (147.918086, 0.0), - 149: (148.917931, 0.0), - 150: (149.919702, 0.0), - 151: (150.9198502, 0.4781), - 152: (151.9217445, 0.0), - 153: (152.9212303, 0.5219), - 154: (153.9229792, 0.0), - 155: (154.9228933, 0.0), - 156: (155.924752, 0.0), - 157: (156.925424, 0.0), - 158: (157.92785, 0.0), - 159: (158.929089, 0.0), - 160: (159.93197, 0.0), - 161: (160.93368, 0.0), - 162: (161.93704, 0.0), - 163: (162.93921, 0.0), - 164: (163.94299, 0.0), - 165: (164.94572, 0.0), - 166: (165.94997, 0.0), - 167: (166.95321, 0.0)}, - 'F': {0: (18.99840322, 1.0), - 14: (14.03506, 0.0), - 15: (15.01801, 0.0), - 16: (16.011466, 0.0), - 17: (17.00209524, 0.0), - 18: (18.000938, 0.0), - 19: (18.99840322, 1.0), - 20: (19.99998132, 0.0), - 21: (20.999949, 0.0), - 22: (22.002999, 0.0), - 23: (23.00357, 0.0), - 24: (24.00812, 0.0), - 25: (25.0121, 0.0), - 26: (26.01962, 0.0), - 27: (27.02676, 0.0), - 28: (28.03567, 0.0), - 29: (29.04326, 0.0), - 30: (30.0525, 0.0), - 31: (31.06043, 0.0)}, - 'Fe': {0: (55.9349375, 1.0), - 45: (45.01458, 0.0), - 46: (46.00081, 0.0), - 47: (46.99289, 0.0), - 48: (47.9805, 0.0), - 49: (48.97361, 0.0), - 50: (49.96299, 0.0), - 51: (50.95682, 0.0), - 52: (51.948114, 0.0), - 53: (52.9453079, 0.0), - 54: (53.9396105, 0.05845), - 55: (54.9382934, 0.0), - 56: (55.9349375, 0.91754), - 57: (56.935394, 0.02119), - 58: (57.9332756, 0.00282), - 59: (58.9348755, 0.0), - 60: (59.934072, 0.0), - 61: (60.936745, 0.0), - 62: (61.936767, 0.0), - 63: (62.94037, 0.0), - 64: (63.9412, 0.0), - 65: (64.94538, 0.0), - 66: (65.94678, 0.0), - 67: (66.95095, 0.0), - 68: (67.9537, 0.0), - 69: (68.95878, 0.0), - 70: (69.96146, 0.0), - 71: (70.96672, 0.0), - 72: (71.96962, 0.0)}, - 'Fm': {0: (257, 1.0), - 242: (242.07343, 0.0), - 243: (243.07435, 0.0), - 244: (244.07408, 0.0), - 245: (245.07539, 0.0), - 246: (246.0753, 0.0), - 247: (247.07685, 0.0), - 248: (248.077195, 0.0), - 249: (249.07903, 0.0), - 250: (250.079521, 0.0), - 251: (251.081575, 0.0), - 252: (252.082467, 0.0), - 253: (253.085185, 0.0), - 254: (254.0868542, 0.0), - 255: (255.089962, 0.0), - 256: (256.091773, 0.0), - 257: (257.095105, 0.0), - 258: (258.09708, 0.0), - 259: (259.1006, 0.0), - 260: (260.10268, 0.0)}, - 'Fr': {0: (223, 1.0), - 199: (199.00726, 0.0), - 200: (200.00657, 0.0), - 201: (201.00386, 0.0), - 202: (202.00337, 0.0), - 203: (203.000925, 0.0), - 204: (204.000653, 0.0), - 205: (204.998594, 0.0), - 206: (205.99867, 
0.0), - 207: (206.99695, 0.0), - 208: (207.99714, 0.0), - 209: (208.995954, 0.0), - 210: (209.996408, 0.0), - 211: (210.995537, 0.0), - 212: (211.996202, 0.0), - 213: (212.996189, 0.0), - 214: (213.998971, 0.0), - 215: (215.000341, 0.0), - 216: (216.003198, 0.0), - 217: (217.004632, 0.0), - 218: (218.007578, 0.0), - 219: (219.009252, 0.0), - 220: (220.012327, 0.0), - 221: (221.014255, 0.0), - 222: (222.017552, 0.0), - 223: (223.0197359, 0.0), - 224: (224.02325, 0.0), - 225: (225.02557, 0.0), - 226: (226.02939, 0.0), - 227: (227.03184, 0.0), - 228: (228.03573, 0.0), - 229: (229.03845, 0.0), - 230: (230.04251, 0.0), - 231: (231.04544, 0.0), - 232: (232.04977, 0.0)}, - 'Ga': {0: (68.9255736, 1.0), - 56: (55.99491, 0.0), - 57: (56.98293, 0.0), - 58: (57.97425, 0.0), - 59: (58.96337, 0.0), - 60: (59.95706, 0.0), - 61: (60.94945, 0.0), - 62: (61.944175, 0.0), - 63: (62.9392942, 0.0), - 64: (63.9368387, 0.0), - 65: (64.9327348, 0.0), - 66: (65.931589, 0.0), - 67: (66.9282017, 0.0), - 68: (67.9279801, 0.0), - 69: (68.9255736, 0.60108), - 70: (69.926022, 0.0), - 71: (70.9247013, 0.39892), - 72: (71.9263663, 0.0), - 73: (72.9251747, 0.0), - 74: (73.926946, 0.0), - 75: (74.9265002, 0.0), - 76: (75.9288276, 0.0), - 77: (76.9291543, 0.0), - 78: (77.9316082, 0.0), - 79: (78.93289, 0.0), - 80: (79.93652, 0.0), - 81: (80.93775, 0.0), - 82: (81.94299, 0.0), - 83: (82.94698, 0.0), - 84: (83.95265, 0.0), - 85: (84.957, 0.0), - 86: (85.96312, 0.0)}, - 'Gd': {0: (157.9241039, 1.0), - 134: (133.95537, 0.0), - 135: (134.95257, 0.0), - 136: (135.94734, 0.0), - 137: (136.94502, 0.0), - 138: (137.94012, 0.0), - 139: (138.93824, 0.0), - 140: (139.93367, 0.0), - 141: (140.932126, 0.0), - 142: (141.92812, 0.0), - 143: (142.92675, 0.0), - 144: (143.92296, 0.0), - 145: (144.921709, 0.0), - 146: (145.918311, 0.0), - 147: (146.919094, 0.0), - 148: (147.918115, 0.0), - 149: (148.919341, 0.0), - 150: (149.918659, 0.0), - 151: (150.920348, 0.0), - 152: (151.919791, 0.002), - 153: (152.9217495, 0.0), - 154: (153.9208656, 0.0218), - 155: (154.922622, 0.148), - 156: (155.9221227, 0.2047), - 157: (156.9239601, 0.1565), - 158: (157.9241039, 0.2484), - 159: (158.9263887, 0.0), - 160: (159.9270541, 0.2186), - 161: (160.9296692, 0.0), - 162: (161.930985, 0.0), - 163: (162.93399, 0.0), - 164: (163.93586, 0.0), - 165: (164.93938, 0.0), - 166: (165.9416, 0.0), - 167: (166.94557, 0.0), - 168: (167.94836, 0.0), - 169: (168.95287, 0.0)}, - 'Ge': {0: (73.9211778, 1.0), - 58: (57.99101, 0.0), - 59: (58.98175, 0.0), - 60: (59.97019, 0.0), - 61: (60.96379, 0.0), - 62: (61.95465, 0.0), - 63: (62.94964, 0.0), - 64: (63.94165, 0.0), - 65: (64.93944, 0.0), - 66: (65.93384, 0.0), - 67: (66.932734, 0.0), - 68: (67.928094, 0.0), - 69: (68.9279645, 0.0), - 70: (69.9242474, 0.2038), - 71: (70.924951, 0.0), - 72: (71.9220758, 0.2731), - 73: (72.9234589, 0.0776), - 74: (73.9211778, 0.3672), - 75: (74.9228589, 0.0), - 76: (75.9214026, 0.0783), - 77: (76.9235486, 0.0), - 78: (77.922853, 0.0), - 79: (78.9254, 0.0), - 80: (79.92537, 0.0), - 81: (80.92882, 0.0), - 82: (81.92955, 0.0), - 83: (82.93462, 0.0), - 84: (83.93747, 0.0), - 85: (84.94303, 0.0), - 86: (85.94649, 0.0), - 87: (86.95251, 0.0), - 88: (87.95691, 0.0), - 89: (88.96383, 0.0)}, - 'H': {0: (1.00782503207, 1.0), - 1: (1.00782503207, 0.999885), - 2: (2.0141017778, 0.000115), - 3: (3.0160492777, 0.0), - 4: (4.02781, 0.0), - 5: (5.03531, 0.0), - 6: (6.04494, 0.0), - 7: (7.05275, 0.0)}, - 'H+': {0: (1.00727646677, 1.0), 1: (1.00727646677, 1.0)}, - 'He': {0: (4.00260325415, 1.0), - 3: (3.0160293191, 
1.34e-06), - 4: (4.00260325415, 0.99999866), - 5: (5.01222, 0.0), - 6: (6.0188891, 0.0), - 7: (7.028021, 0.0), - 8: (8.033922, 0.0), - 9: (9.04395, 0.0), - 10: (10.0524, 0.0)}, - 'Hf': {0: (179.94655, 1.0), - 153: (152.97069, 0.0), - 154: (153.96486, 0.0), - 155: (154.96339, 0.0), - 156: (155.95936, 0.0), - 157: (156.9584, 0.0), - 158: (157.954799, 0.0), - 159: (158.953995, 0.0), - 160: (159.950684, 0.0), - 161: (160.950275, 0.0), - 162: (161.94721, 0.0), - 163: (162.94709, 0.0), - 164: (163.944367, 0.0), - 165: (164.94457, 0.0), - 166: (165.94218, 0.0), - 167: (166.9426, 0.0), - 168: (167.94057, 0.0), - 169: (168.94126, 0.0), - 170: (169.93961, 0.0), - 171: (170.94049, 0.0), - 172: (171.939448, 0.0), - 173: (172.94051, 0.0), - 174: (173.940046, 0.0016), - 175: (174.941509, 0.0), - 176: (175.9414086, 0.0526), - 177: (176.9432207, 0.186), - 178: (177.9436988, 0.2728), - 179: (178.9458161, 0.1362), - 180: (179.94655, 0.3508), - 181: (180.9491012, 0.0), - 182: (181.950554, 0.0), - 183: (182.95353, 0.0), - 184: (183.95545, 0.0), - 185: (184.95882, 0.0), - 186: (185.96089, 0.0), - 187: (186.96459, 0.0), - 188: (187.96685, 0.0)}, - 'Hg': {0: (201.970643, 1.0), - 171: (171.00376, 0.0), - 172: (171.99883, 0.0), - 173: (172.99724, 0.0), - 174: (173.992864, 0.0), - 175: (174.99142, 0.0), - 176: (175.987355, 0.0), - 177: (176.98628, 0.0), - 178: (177.982483, 0.0), - 179: (178.981834, 0.0), - 180: (179.978266, 0.0), - 181: (180.977819, 0.0), - 182: (181.97469, 0.0), - 183: (182.97445, 0.0), - 184: (183.971713, 0.0), - 185: (184.971899, 0.0), - 186: (185.969362, 0.0), - 187: (186.969814, 0.0), - 188: (187.967577, 0.0), - 189: (188.96819, 0.0), - 190: (189.966322, 0.0), - 191: (190.967157, 0.0), - 192: (191.965634, 0.0), - 193: (192.966665, 0.0), - 194: (193.965439, 0.0), - 195: (194.96672, 0.0), - 196: (195.965833, 0.0015), - 197: (196.967213, 0.0), - 198: (197.966769, 0.0997), - 199: (198.9682799, 0.1687), - 200: (199.968326, 0.231), - 201: (200.9703023, 0.1318), - 202: (201.970643, 0.2986), - 203: (202.9728725, 0.0), - 204: (203.9734939, 0.0687), - 205: (204.976073, 0.0), - 206: (205.977514, 0.0), - 207: (206.98259, 0.0), - 208: (207.98594, 0.0), - 209: (208.99104, 0.0), - 210: (209.99451, 0.0)}, - 'Ho': {0: (164.9303221, 1.0), - 140: (139.96854, 0.0), - 141: (140.9631, 0.0), - 142: (141.95977, 0.0), - 143: (142.95461, 0.0), - 144: (143.95148, 0.0), - 145: (144.9472, 0.0), - 146: (145.94464, 0.0), - 147: (146.94006, 0.0), - 148: (147.93772, 0.0), - 149: (148.933775, 0.0), - 150: (149.933496, 0.0), - 151: (150.931688, 0.0), - 152: (151.931714, 0.0), - 153: (152.930199, 0.0), - 154: (153.930602, 0.0), - 155: (154.929103, 0.0), - 156: (155.92984, 0.0), - 157: (156.928256, 0.0), - 158: (157.928941, 0.0), - 159: (158.927712, 0.0), - 160: (159.928729, 0.0), - 161: (160.927855, 0.0), - 162: (161.929096, 0.0), - 163: (162.9287339, 0.0), - 164: (163.9302335, 0.0), - 165: (164.9303221, 1.0), - 166: (165.9322842, 0.0), - 167: (166.933133, 0.0), - 168: (167.93552, 0.0), - 169: (168.936872, 0.0), - 170: (169.93962, 0.0), - 171: (170.94147, 0.0), - 172: (171.94482, 0.0), - 173: (172.94729, 0.0), - 174: (173.95115, 0.0), - 175: (174.95405, 0.0)}, - 'Hs': {0: (270, 1.0), - 263: (263.12856, 0.0), - 264: (264.12839, 0.0), - 265: (265.13009, 0.0), - 266: (266.1301, 0.0), - 267: (267.13179, 0.0), - 268: (268.13216, 0.0), - 269: (269.13406, 0.0), - 270: (270.13465, 0.0), - 271: (271.13766, 0.0), - 272: (272.13905, 0.0), - 273: (273.14199, 0.0), - 274: (274.14313, 0.0), - 275: (275.14595, 0.0), - 276: (276.14721, 0.0), - 
277: (277.14984, 0.0)}, - 'I': {0: (126.904473, 1.0), - 108: (107.94348, 0.0), - 109: (108.93815, 0.0), - 110: (109.93524, 0.0), - 111: (110.93028, 0.0), - 112: (111.92797, 0.0), - 113: (112.92364, 0.0), - 114: (113.92185, 0.0), - 115: (114.91805, 0.0), - 116: (115.91681, 0.0), - 117: (116.91365, 0.0), - 118: (117.913074, 0.0), - 119: (118.91007, 0.0), - 120: (119.910048, 0.0), - 121: (120.907367, 0.0), - 122: (121.907589, 0.0), - 123: (122.905589, 0.0), - 124: (123.9062099, 0.0), - 125: (124.9046302, 0.0), - 126: (125.905624, 0.0), - 127: (126.904473, 1.0), - 128: (127.905809, 0.0), - 129: (128.904988, 0.0), - 130: (129.906674, 0.0), - 131: (130.9061246, 0.0), - 132: (131.907997, 0.0), - 133: (132.907797, 0.0), - 134: (133.909744, 0.0), - 135: (134.910048, 0.0), - 136: (135.91465, 0.0), - 137: (136.917871, 0.0), - 138: (137.92235, 0.0), - 139: (138.9261, 0.0), - 140: (139.931, 0.0), - 141: (140.93503, 0.0), - 142: (141.94018, 0.0), - 143: (142.94456, 0.0), - 144: (143.94999, 0.0)}, - 'In': {0: (114.903878, 1.0), - 97: (96.94954, 0.0), - 98: (97.94214, 0.0), - 99: (98.93422, 0.0), - 100: (99.93111, 0.0), - 101: (100.92634, 0.0), - 102: (101.92409, 0.0), - 103: (102.919914, 0.0), - 104: (103.9183, 0.0), - 105: (104.914674, 0.0), - 106: (105.913465, 0.0), - 107: (106.910295, 0.0), - 108: (107.909698, 0.0), - 109: (108.907151, 0.0), - 110: (109.907165, 0.0), - 111: (110.905103, 0.0), - 112: (111.905532, 0.0), - 113: (112.904058, 0.0429), - 114: (113.904914, 0.0), - 115: (114.903878, 0.9571), - 116: (115.90526, 0.0), - 117: (116.904514, 0.0), - 118: (117.906354, 0.0), - 119: (118.905845, 0.0), - 120: (119.90796, 0.0), - 121: (120.907846, 0.0), - 122: (121.91028, 0.0), - 123: (122.910438, 0.0), - 124: (123.91318, 0.0), - 125: (124.9136, 0.0), - 126: (125.91646, 0.0), - 127: (126.91735, 0.0), - 128: (127.92017, 0.0), - 129: (128.9217, 0.0), - 130: (129.92497, 0.0), - 131: (130.92685, 0.0), - 132: (131.93299, 0.0), - 133: (132.93781, 0.0), - 134: (133.94415, 0.0), - 135: (134.94933, 0.0)}, - 'Ir': {0: (192.9629264, 1.0), - 164: (163.9922, 0.0), - 165: (164.98752, 0.0), - 166: (165.98582, 0.0), - 167: (166.981665, 0.0), - 168: (167.97988, 0.0), - 169: (168.976295, 0.0), - 170: (169.97497, 0.0), - 171: (170.97163, 0.0), - 172: (171.97046, 0.0), - 173: (172.967502, 0.0), - 174: (173.966861, 0.0), - 175: (174.964113, 0.0), - 176: (175.963649, 0.0), - 177: (176.961302, 0.0), - 178: (177.961082, 0.0), - 179: (178.959122, 0.0), - 180: (179.959229, 0.0), - 181: (180.957625, 0.0), - 182: (181.958076, 0.0), - 183: (182.956846, 0.0), - 184: (183.95748, 0.0), - 185: (184.9567, 0.0), - 186: (185.957946, 0.0), - 187: (186.957363, 0.0), - 188: (187.958853, 0.0), - 189: (188.958719, 0.0), - 190: (189.960546, 0.0), - 191: (190.960594, 0.373), - 192: (191.962605, 0.0), - 193: (192.9629264, 0.627), - 194: (193.9650784, 0.0), - 195: (194.9659796, 0.0), - 196: (195.9684, 0.0), - 197: (196.969653, 0.0), - 198: (197.97228, 0.0), - 199: (198.9738, 0.0)}, - 'K': {0: (38.96370668, 1.0), - 32: (32.02192, 0.0), - 33: (33.00726, 0.0), - 34: (33.99841, 0.0), - 35: (34.98801, 0.0), - 36: (35.981292, 0.0), - 37: (36.97337589, 0.0), - 38: (37.9690812, 0.0), - 39: (38.96370668, 0.932581), - 40: (39.96399848, 0.000117), - 41: (40.96182576, 0.067302), - 42: (41.96240281, 0.0), - 43: (42.960716, 0.0), - 44: (43.96156, 0.0), - 45: (44.960699, 0.0), - 46: (45.961977, 0.0), - 47: (46.961678, 0.0), - 48: (47.965514, 0.0), - 49: (48.96745, 0.0), - 50: (49.97278, 0.0), - 51: (50.97638, 0.0), - 52: (51.98261, 0.0), - 53: (52.98712, 0.0), - 
54: (53.9942, 0.0), - 55: (54.99971, 0.0)}, - 'Kr': {0: (83.911507, 1.0), - 69: (68.96518, 0.0), - 70: (69.95526, 0.0), - 71: (70.94963, 0.0), - 72: (71.942092, 0.0), - 73: (72.939289, 0.0), - 74: (73.9330844, 0.0), - 75: (74.930946, 0.0), - 76: (75.92591, 0.0), - 77: (76.92467, 0.0), - 78: (77.9203648, 0.00355), - 79: (78.920082, 0.0), - 80: (79.916379, 0.02286), - 81: (80.916592, 0.0), - 82: (81.9134836, 0.11593), - 83: (82.914136, 0.115), - 84: (83.911507, 0.56987), - 85: (84.9125273, 0.0), - 86: (85.91061073, 0.17279), - 87: (86.91335486, 0.0), - 88: (87.914447, 0.0), - 89: (88.91763, 0.0), - 90: (89.919517, 0.0), - 91: (90.92345, 0.0), - 92: (91.926156, 0.0), - 93: (92.93127, 0.0), - 94: (93.93436, 0.0), - 95: (94.93984, 0.0), - 96: (95.94307, 0.0), - 97: (96.94856, 0.0), - 98: (97.95191, 0.0), - 99: (98.9576, 0.0), - 100: (99.96114, 0.0)}, - 'La': {0: (138.9063533, 1.0), - 117: (116.95007, 0.0), - 118: (117.94673, 0.0), - 119: (118.94099, 0.0), - 120: (119.93807, 0.0), - 121: (120.93301, 0.0), - 122: (121.93071, 0.0), - 123: (122.92624, 0.0), - 124: (123.92457, 0.0), - 125: (124.920816, 0.0), - 126: (125.91951, 0.0), - 127: (126.916375, 0.0), - 128: (127.91559, 0.0), - 129: (128.912693, 0.0), - 130: (129.912369, 0.0), - 131: (130.91007, 0.0), - 132: (131.9101, 0.0), - 133: (132.90822, 0.0), - 134: (133.908514, 0.0), - 135: (134.906977, 0.0), - 136: (135.90764, 0.0), - 137: (136.906494, 0.0), - 138: (137.907112, 0.0009), - 139: (138.9063533, 0.9991), - 140: (139.9094776, 0.0), - 141: (140.910962, 0.0), - 142: (141.914079, 0.0), - 143: (142.916063, 0.0), - 144: (143.9196, 0.0), - 145: (144.92165, 0.0), - 146: (145.92579, 0.0), - 147: (146.92824, 0.0), - 148: (147.93223, 0.0), - 149: (148.93473, 0.0), - 150: (149.93877, 0.0), - 151: (150.94172, 0.0), - 152: (151.94625, 0.0), - 153: (152.94962, 0.0), - 154: (153.9545, 0.0), - 155: (154.95835, 0.0)}, - 'Li': {0: (7.01600455, 1.0), - 3: (3.03078, 0.0), - 4: (4.02719, 0.0), - 5: (5.01254, 0.0), - 6: (6.015122795, 0.0759), - 7: (7.01600455, 0.9241), - 8: (8.02248736, 0.0), - 9: (9.0267895, 0.0), - 10: (10.035481, 0.0), - 11: (11.043798, 0.0), - 12: (12.05378, 0.0)}, - 'Lr': {0: (262, 1.0), - 251: (251.09436, 0.0), - 252: (252.09537, 0.0), - 253: (253.09521, 0.0), - 254: (254.09645, 0.0), - 255: (255.09668, 0.0), - 256: (256.09863, 0.0), - 257: (257.09956, 0.0), - 258: (258.10181, 0.0), - 259: (259.1029, 0.0), - 260: (260.1055, 0.0), - 261: (261.10688, 0.0), - 262: (262.10963, 0.0), - 263: (263.11129, 0.0), - 264: (264.11404, 0.0), - 265: (265.11584, 0.0), - 266: (266.11931, 0.0)}, - 'Lu': {0: (174.9407718, 1.0), - 150: (149.97323, 0.0), - 151: (150.96758, 0.0), - 152: (151.96412, 0.0), - 153: (152.95877, 0.0), - 154: (153.95752, 0.0), - 155: (154.954316, 0.0), - 156: (155.95303, 0.0), - 157: (156.950098, 0.0), - 158: (157.949313, 0.0), - 159: (158.94663, 0.0), - 160: (159.94603, 0.0), - 161: (160.94357, 0.0), - 162: (161.94328, 0.0), - 163: (162.94118, 0.0), - 164: (163.94134, 0.0), - 165: (164.939407, 0.0), - 166: (165.93986, 0.0), - 167: (166.93827, 0.0), - 168: (167.93874, 0.0), - 169: (168.937651, 0.0), - 170: (169.938475, 0.0), - 171: (170.9379131, 0.0), - 172: (171.939086, 0.0), - 173: (172.9389306, 0.0), - 174: (173.9403375, 0.0), - 175: (174.9407718, 0.9741), - 176: (175.9426863, 0.0259), - 177: (176.9437581, 0.0), - 178: (177.945955, 0.0), - 179: (178.947327, 0.0), - 180: (179.94988, 0.0), - 181: (180.95197, 0.0), - 182: (181.95504, 0.0), - 183: (182.95757, 0.0), - 184: (183.96091, 0.0)}, - 'Md': {0: (258, 1.0), - 245: (245.08083, 
0.0), - 246: (246.08189, 0.0), - 247: (247.08164, 0.0), - 248: (248.08282, 0.0), - 249: (249.08301, 0.0), - 250: (250.08442, 0.0), - 251: (251.08484, 0.0), - 252: (252.08656, 0.0), - 253: (253.08728, 0.0), - 254: (254.08966, 0.0), - 255: (255.091083, 0.0), - 256: (256.09406, 0.0), - 257: (257.095541, 0.0), - 258: (258.098431, 0.0), - 259: (259.10051, 0.0), - 260: (260.10365, 0.0), - 261: (261.10572, 0.0), - 262: (262.10887, 0.0)}, - 'Mg': {0: (23.9850417, 1.0), - 19: (19.03547, 0.0), - 20: (20.018863, 0.0), - 21: (21.011713, 0.0), - 22: (21.9995738, 0.0), - 23: (22.9941237, 0.0), - 24: (23.9850417, 0.7899), - 25: (24.98583692, 0.1), - 26: (25.982592929, 0.1101), - 27: (26.98434059, 0.0), - 28: (27.9838768, 0.0), - 29: (28.9886, 0.0), - 30: (29.990434, 0.0), - 31: (30.996546, 0.0), - 32: (31.998975, 0.0), - 33: (33.005254, 0.0), - 34: (34.00946, 0.0), - 35: (35.01734, 0.0), - 36: (36.023, 0.0), - 37: (37.0314, 0.0), - 38: (38.03757, 0.0), - 39: (39.04677, 0.0), - 40: (40.05393, 0.0)}, - 'Mn': {0: (54.9380451, 1.0), - 44: (44.00687, 0.0), - 45: (44.99451, 0.0), - 46: (45.98672, 0.0), - 47: (46.9761, 0.0), - 48: (47.96852, 0.0), - 49: (48.959618, 0.0), - 50: (49.9542382, 0.0), - 51: (50.9482108, 0.0), - 52: (51.9455655, 0.0), - 53: (52.9412901, 0.0), - 54: (53.9403589, 0.0), - 55: (54.9380451, 1.0), - 56: (55.9389049, 0.0), - 57: (56.9382854, 0.0), - 58: (57.93998, 0.0), - 59: (58.94044, 0.0), - 60: (59.94291, 0.0), - 61: (60.94465, 0.0), - 62: (61.94843, 0.0), - 63: (62.95024, 0.0), - 64: (63.95425, 0.0), - 65: (64.95634, 0.0), - 66: (65.96108, 0.0), - 67: (66.96414, 0.0), - 68: (67.9693, 0.0), - 69: (68.97284, 0.0)}, - 'Mo': {0: (97.9054082, 1.0), - 83: (82.94874, 0.0), - 84: (83.94009, 0.0), - 85: (84.93655, 0.0), - 86: (85.9307, 0.0), - 87: (86.92733, 0.0), - 88: (87.921953, 0.0), - 89: (88.91948, 0.0), - 90: (89.913937, 0.0), - 91: (90.91175, 0.0), - 92: (91.906811, 0.1477), - 93: (92.906813, 0.0), - 94: (93.9050883, 0.0923), - 95: (94.9058421, 0.159), - 96: (95.9046795, 0.1668), - 97: (96.9060215, 0.0956), - 98: (97.9054082, 0.2419), - 99: (98.9077119, 0.0), - 100: (99.907477, 0.0967), - 101: (100.910347, 0.0), - 102: (101.910297, 0.0), - 103: (102.91321, 0.0), - 104: (103.91376, 0.0), - 105: (104.91697, 0.0), - 106: (105.918137, 0.0), - 107: (106.92169, 0.0), - 108: (107.92345, 0.0), - 109: (108.92781, 0.0), - 110: (109.92973, 0.0), - 111: (110.93441, 0.0), - 112: (111.93684, 0.0), - 113: (112.94188, 0.0), - 114: (113.94492, 0.0), - 115: (114.95029, 0.0)}, - 'Mt': {0: (276, 1.0), - 265: (265.13615, 0.0), - 266: (266.1373, 0.0), - 267: (267.13731, 0.0), - 268: (268.13873, 0.0), - 269: (269.13906, 0.0), - 270: (270.14066, 0.0), - 271: (271.14114, 0.0), - 272: (272.14374, 0.0), - 273: (273.14491, 0.0), - 274: (274.14749, 0.0), - 275: (275.14865, 0.0), - 276: (276.15116, 0.0), - 277: (277.15242, 0.0), - 278: (278.15481, 0.0), - 279: (279.15619, 0.0)}, - 'N': {0: (14.0030740048, 1.0), - 10: (10.04165, 0.0), - 11: (11.02609, 0.0), - 12: (12.0186132, 0.0), - 13: (13.00573861, 0.0), - 14: (14.0030740048, 0.99636), - 15: (15.0001088982, 0.00364), - 16: (16.0061017, 0.0), - 17: (17.00845, 0.0), - 18: (18.014079, 0.0), - 19: (19.017029, 0.0), - 20: (20.02337, 0.0), - 21: (21.02711, 0.0), - 22: (22.03439, 0.0), - 23: (23.04122, 0.0), - 24: (24.05104, 0.0), - 25: (25.06066, 0.0)}, - 'Na': {0: (22.9897692809, 1.0), - 18: (18.02597, 0.0), - 19: (19.013877, 0.0), - 20: (20.007351, 0.0), - 21: (20.9976552, 0.0), - 22: (21.9944364, 0.0), - 23: (22.9897692809, 1.0), - 24: (23.99096278, 0.0), - 25: 
(24.989954, 0.0), - 26: (25.992633, 0.0), - 27: (26.994077, 0.0), - 28: (27.998938, 0.0), - 29: (29.002861, 0.0), - 30: (30.008976, 0.0), - 31: (31.01359, 0.0), - 32: (32.02047, 0.0), - 33: (33.02672, 0.0), - 34: (34.03517, 0.0), - 35: (35.04249, 0.0), - 36: (36.05148, 0.0), - 37: (37.05934, 0.0)}, - 'Nb': {0: (92.9063781, 1.0), - 81: (80.94903, 0.0), - 82: (81.94313, 0.0), - 83: (82.93671, 0.0), - 84: (83.93357, 0.0), - 85: (84.92791, 0.0), - 86: (85.92504, 0.0), - 87: (86.92036, 0.0), - 88: (87.91833, 0.0), - 89: (88.913418, 0.0), - 90: (89.911265, 0.0), - 91: (90.906996, 0.0), - 92: (91.907194, 0.0), - 93: (92.9063781, 1.0), - 94: (93.9072839, 0.0), - 95: (94.9068358, 0.0), - 96: (95.908101, 0.0), - 97: (96.9080986, 0.0), - 98: (97.910328, 0.0), - 99: (98.911618, 0.0), - 100: (99.914182, 0.0), - 101: (100.915252, 0.0), - 102: (101.91804, 0.0), - 103: (102.91914, 0.0), - 104: (103.92246, 0.0), - 105: (104.92394, 0.0), - 106: (105.92797, 0.0), - 107: (106.93031, 0.0), - 108: (107.93484, 0.0), - 109: (108.93763, 0.0), - 110: (109.94244, 0.0), - 111: (110.94565, 0.0), - 112: (111.95083, 0.0), - 113: (112.9547, 0.0)}, - 'Nd': {0: (141.9077233, 1.0), - 124: (123.95223, 0.0), - 125: (124.94888, 0.0), - 126: (125.94322, 0.0), - 127: (126.9405, 0.0), - 128: (127.93539, 0.0), - 129: (128.93319, 0.0), - 130: (129.92851, 0.0), - 131: (130.92725, 0.0), - 132: (131.923321, 0.0), - 133: (132.92235, 0.0), - 134: (133.91879, 0.0), - 135: (134.918181, 0.0), - 136: (135.914976, 0.0), - 137: (136.914567, 0.0), - 138: (137.91195, 0.0), - 139: (138.911978, 0.0), - 140: (139.90955, 0.0), - 141: (140.90961, 0.0), - 142: (141.9077233, 0.272), - 143: (142.9098143, 0.122), - 144: (143.9100873, 0.238), - 145: (144.9125736, 0.083), - 146: (145.9131169, 0.172), - 147: (146.9161004, 0.0), - 148: (147.916893, 0.057), - 149: (148.920149, 0.0), - 150: (149.920891, 0.056), - 151: (150.923829, 0.0), - 152: (151.924682, 0.0), - 153: (152.927698, 0.0), - 154: (153.92948, 0.0), - 155: (154.93293, 0.0), - 156: (155.93502, 0.0), - 157: (156.93903, 0.0), - 158: (157.9416, 0.0), - 159: (158.94609, 0.0), - 160: (159.94909, 0.0), - 161: (160.95388, 0.0)}, - 'Ne': {0: (19.9924401754, 1.0), - 16: (16.025761, 0.0), - 17: (17.017672, 0.0), - 18: (18.0057082, 0.0), - 19: (19.0018802, 0.0), - 20: (19.9924401754, 0.9048), - 21: (20.99384668, 0.0027), - 22: (21.991385114, 0.0925), - 23: (22.9944669, 0.0), - 24: (23.9936108, 0.0), - 25: (24.997737, 0.0), - 26: (26.000461, 0.0), - 27: (27.00759, 0.0), - 28: (28.01207, 0.0), - 29: (29.01939, 0.0), - 30: (30.0248, 0.0), - 31: (31.03311, 0.0), - 32: (32.04002, 0.0), - 33: (33.04938, 0.0), - 34: (34.05703, 0.0)}, - 'Ni': {0: (57.9353429, 1.0), - 48: (48.01975, 0.0), - 49: (49.00966, 0.0), - 50: (49.99593, 0.0), - 51: (50.98772, 0.0), - 52: (51.97568, 0.0), - 53: (52.96847, 0.0), - 54: (53.95791, 0.0), - 55: (54.95133, 0.0), - 56: (55.942132, 0.0), - 57: (56.9397935, 0.0), - 58: (57.9353429, 0.680769), - 59: (58.9343467, 0.0), - 60: (59.9307864, 0.262231), - 61: (60.931056, 0.011399), - 62: (61.9283451, 0.036345), - 63: (62.9296694, 0.0), - 64: (63.927966, 0.009256), - 65: (64.9300843, 0.0), - 66: (65.9291393, 0.0), - 67: (66.931569, 0.0), - 68: (67.931869, 0.0), - 69: (68.93561, 0.0), - 70: (69.9365, 0.0), - 71: (70.94074, 0.0), - 72: (71.94209, 0.0), - 73: (72.94647, 0.0), - 74: (73.94807, 0.0), - 75: (74.95287, 0.0), - 76: (75.95533, 0.0), - 77: (76.96055, 0.0), - 78: (77.96318, 0.0)}, - 'No': {0: (259, 1.0), - 248: (248.0866, 0.0), - 249: (249.08783, 0.0), - 250: (250.08751, 0.0), - 251: 
(251.08901, 0.0), - 252: (252.088977, 0.0), - 253: (253.09068, 0.0), - 254: (254.090955, 0.0), - 255: (255.093241, 0.0), - 256: (256.094283, 0.0), - 257: (257.096877, 0.0), - 258: (258.09821, 0.0), - 259: (259.10103, 0.0), - 260: (260.10264, 0.0), - 261: (261.10575, 0.0), - 262: (262.1073, 0.0), - 263: (263.11055, 0.0), - 264: (264.11235, 0.0)}, - 'Np': {0: (237, 1.0), - 225: (225.03391, 0.0), - 226: (226.03515, 0.0), - 227: (227.03496, 0.0), - 228: (228.03618, 0.0), - 229: (229.03626, 0.0), - 230: (230.03783, 0.0), - 231: (231.03825, 0.0), - 232: (232.04011, 0.0), - 233: (233.04074, 0.0), - 234: (234.042895, 0.0), - 235: (235.0440633, 0.0), - 236: (236.04657, 0.0), - 237: (237.0481734, 0.0), - 238: (238.0509464, 0.0), - 239: (239.052939, 0.0), - 240: (240.056162, 0.0), - 241: (241.05825, 0.0), - 242: (242.06164, 0.0), - 243: (243.06428, 0.0), - 244: (244.06785, 0.0)}, - 'O': {0: (15.99491461956, 1.0), - 12: (12.034405, 0.0), - 13: (13.024812, 0.0), - 14: (14.00859625, 0.0), - 15: (15.0030656, 0.0), - 16: (15.99491461956, 0.99757), - 17: (16.9991317, 0.00038), - 18: (17.999161, 0.00205), - 19: (19.00358, 0.0), - 20: (20.0040767, 0.0), - 21: (21.008656, 0.0), - 22: (22.00997, 0.0), - 23: (23.01569, 0.0), - 24: (24.02047, 0.0), - 25: (25.02946, 0.0), - 26: (26.03834, 0.0), - 27: (27.04826, 0.0), - 28: (28.05781, 0.0)}, - 'Os': {0: (191.9614807, 1.0), - 162: (161.98443, 0.0), - 163: (162.98269, 0.0), - 164: (163.97804, 0.0), - 165: (164.97676, 0.0), - 166: (165.972691, 0.0), - 167: (166.97155, 0.0), - 168: (167.967804, 0.0), - 169: (168.967019, 0.0), - 170: (169.963577, 0.0), - 171: (170.963185, 0.0), - 172: (171.960023, 0.0), - 173: (172.959808, 0.0), - 174: (173.957062, 0.0), - 175: (174.956946, 0.0), - 176: (175.95481, 0.0), - 177: (176.954965, 0.0), - 178: (177.953251, 0.0), - 179: (178.953816, 0.0), - 180: (179.952379, 0.0), - 181: (180.95324, 0.0), - 182: (181.95211, 0.0), - 183: (182.95313, 0.0), - 184: (183.9524891, 0.0002), - 185: (184.9540423, 0.0), - 186: (185.9538382, 0.0159), - 187: (186.9557505, 0.0196), - 188: (187.9558382, 0.1324), - 189: (188.9581475, 0.1615), - 190: (189.958447, 0.2626), - 191: (190.9609297, 0.0), - 192: (191.9614807, 0.4078), - 193: (192.9641516, 0.0), - 194: (193.9651821, 0.0), - 195: (194.96813, 0.0), - 196: (195.96964, 0.0)}, - 'P': {0: (30.97376163, 1.0), - 24: (24.03435, 0.0), - 25: (25.02026, 0.0), - 26: (26.01178, 0.0), - 27: (26.99923, 0.0), - 28: (27.992315, 0.0), - 29: (28.9818006, 0.0), - 30: (29.9783138, 0.0), - 31: (30.97376163, 1.0), - 32: (31.97390727, 0.0), - 33: (32.9717255, 0.0), - 34: (33.973636, 0.0), - 35: (34.9733141, 0.0), - 36: (35.97826, 0.0), - 37: (36.97961, 0.0), - 38: (37.98416, 0.0), - 39: (38.98618, 0.0), - 40: (39.9913, 0.0), - 41: (40.99434, 0.0), - 42: (42.00101, 0.0), - 43: (43.00619, 0.0), - 44: (44.01299, 0.0), - 45: (45.01922, 0.0), - 46: (46.02738, 0.0)}, - 'Pa': {0: (231.035884, 1.0), - 212: (212.0232, 0.0), - 213: (213.02111, 0.0), - 214: (214.02092, 0.0), - 215: (215.01919, 0.0), - 216: (216.01911, 0.0), - 217: (217.01832, 0.0), - 218: (218.020042, 0.0), - 219: (219.01988, 0.0), - 220: (220.02188, 0.0), - 221: (221.02188, 0.0), - 222: (222.02374, 0.0), - 223: (223.02396, 0.0), - 224: (224.025626, 0.0), - 225: (225.02613, 0.0), - 226: (226.027948, 0.0), - 227: (227.028805, 0.0), - 228: (228.031051, 0.0), - 229: (229.0320968, 0.0), - 230: (230.034541, 0.0), - 231: (231.035884, 1.0), - 232: (232.038592, 0.0), - 233: (233.0402473, 0.0), - 234: (234.043308, 0.0), - 235: (235.04544, 0.0), - 236: (236.04868, 0.0), - 237: 
(237.05115, 0.0), - 238: (238.0545, 0.0), - 239: (239.05726, 0.0), - 240: (240.06098, 0.0)}, - 'Pb': {0: (207.9766521, 1.0), - 178: (178.00383, 0.0), - 179: (179.00215, 0.0), - 180: (179.997918, 0.0), - 181: (180.99662, 0.0), - 182: (181.992672, 0.0), - 183: (182.99187, 0.0), - 184: (183.988142, 0.0), - 185: (184.98761, 0.0), - 186: (185.984239, 0.0), - 187: (186.983918, 0.0), - 188: (187.980874, 0.0), - 189: (188.98081, 0.0), - 190: (189.978082, 0.0), - 191: (190.97827, 0.0), - 192: (191.975785, 0.0), - 193: (192.97617, 0.0), - 194: (193.974012, 0.0), - 195: (194.974542, 0.0), - 196: (195.972774, 0.0), - 197: (196.973431, 0.0), - 198: (197.972034, 0.0), - 199: (198.972917, 0.0), - 200: (199.971827, 0.0), - 201: (200.972885, 0.0), - 202: (201.972159, 0.0), - 203: (202.973391, 0.0), - 204: (203.9730436, 0.014), - 205: (204.9744818, 0.0), - 206: (205.9744653, 0.241), - 207: (206.9758969, 0.221), - 208: (207.9766521, 0.524), - 209: (208.9810901, 0.0), - 210: (209.9841885, 0.0), - 211: (210.988737, 0.0), - 212: (211.9918975, 0.0), - 213: (212.996581, 0.0), - 214: (213.9998054, 0.0), - 215: (215.00481, 0.0)}, - 'Pd': {0: (105.903486, 1.0), - 91: (90.94911, 0.0), - 92: (91.94042, 0.0), - 93: (92.93591, 0.0), - 94: (93.92877, 0.0), - 95: (94.92469, 0.0), - 96: (95.91816, 0.0), - 97: (96.91648, 0.0), - 98: (97.912721, 0.0), - 99: (98.911768, 0.0), - 100: (99.908506, 0.0), - 101: (100.908289, 0.0), - 102: (101.905609, 0.0102), - 103: (102.906087, 0.0), - 104: (103.904036, 0.1114), - 105: (104.905085, 0.2233), - 106: (105.903486, 0.2733), - 107: (106.905133, 0.0), - 108: (107.903892, 0.2646), - 109: (108.90595, 0.0), - 110: (109.905153, 0.1172), - 111: (110.907671, 0.0), - 112: (111.907314, 0.0), - 113: (112.91015, 0.0), - 114: (113.910363, 0.0), - 115: (114.91368, 0.0), - 116: (115.91416, 0.0), - 117: (116.91784, 0.0), - 118: (117.91898, 0.0), - 119: (118.92311, 0.0), - 120: (119.92469, 0.0), - 121: (120.92887, 0.0), - 122: (121.93055, 0.0), - 123: (122.93493, 0.0), - 124: (123.93688, 0.0)}, - 'Pm': {0: (145, 1.0), - 126: (125.95752, 0.0), - 127: (126.95163, 0.0), - 128: (127.94842, 0.0), - 129: (128.94316, 0.0), - 130: (129.94045, 0.0), - 131: (130.93587, 0.0), - 132: (131.93375, 0.0), - 133: (132.92978, 0.0), - 134: (133.92835, 0.0), - 135: (134.92488, 0.0), - 136: (135.92357, 0.0), - 137: (136.920479, 0.0), - 138: (137.919548, 0.0), - 139: (138.916804, 0.0), - 140: (139.91604, 0.0), - 141: (140.913555, 0.0), - 142: (141.912874, 0.0), - 143: (142.910933, 0.0), - 144: (143.912591, 0.0), - 145: (144.912749, 0.0), - 146: (145.914696, 0.0), - 147: (146.9151385, 0.0), - 148: (147.917475, 0.0), - 149: (148.918334, 0.0), - 150: (149.920984, 0.0), - 151: (150.921207, 0.0), - 152: (151.923497, 0.0), - 153: (152.924117, 0.0), - 154: (153.92646, 0.0), - 155: (154.9281, 0.0), - 156: (155.93106, 0.0), - 157: (156.93304, 0.0), - 158: (157.93656, 0.0), - 159: (158.93897, 0.0), - 160: (159.94299, 0.0), - 161: (160.94586, 0.0), - 162: (161.95029, 0.0), - 163: (162.95368, 0.0)}, - 'Po': {0: (209, 1.0), - 188: (187.999422, 0.0), - 189: (188.998481, 0.0), - 190: (189.995101, 0.0), - 191: (190.994574, 0.0), - 192: (191.991335, 0.0), - 193: (192.99103, 0.0), - 194: (193.988186, 0.0), - 195: (194.98811, 0.0), - 196: (195.985535, 0.0), - 197: (196.98566, 0.0), - 198: (197.983389, 0.0), - 199: (198.983666, 0.0), - 200: (199.981799, 0.0), - 201: (200.98226, 0.0), - 202: (201.980758, 0.0), - 203: (202.98142, 0.0), - 204: (203.980318, 0.0), - 205: (204.981203, 0.0), - 206: (205.980481, 0.0), - 207: (206.981593, 0.0), - 208: 
(207.9812457, 0.0), - 209: (208.9824304, 0.0), - 210: (209.9828737, 0.0), - 211: (210.9866532, 0.0), - 212: (211.988868, 0.0), - 213: (212.992857, 0.0), - 214: (213.9952014, 0.0), - 215: (214.99942, 0.0), - 216: (216.001915, 0.0), - 217: (217.006335, 0.0), - 218: (218.008973, 0.0), - 219: (219.01374, 0.0), - 220: (220.0166, 0.0)}, - 'Pr': {0: (140.9076528, 1.0), - 121: (120.95536, 0.0), - 122: (121.95181, 0.0), - 123: (122.94596, 0.0), - 124: (123.94296, 0.0), - 125: (124.93783, 0.0), - 126: (125.93531, 0.0), - 127: (126.93083, 0.0), - 128: (127.92879, 0.0), - 129: (128.9251, 0.0), - 130: (129.92359, 0.0), - 131: (130.92026, 0.0), - 132: (131.91926, 0.0), - 133: (132.916331, 0.0), - 134: (133.91571, 0.0), - 135: (134.913112, 0.0), - 136: (135.912692, 0.0), - 137: (136.910705, 0.0), - 138: (137.910755, 0.0), - 139: (138.908938, 0.0), - 140: (139.909076, 0.0), - 141: (140.9076528, 1.0), - 142: (141.9100448, 0.0), - 143: (142.9108169, 0.0), - 144: (143.913305, 0.0), - 145: (144.914512, 0.0), - 146: (145.91764, 0.0), - 147: (146.918996, 0.0), - 148: (147.922135, 0.0), - 149: (148.92372, 0.0), - 150: (149.926673, 0.0), - 151: (150.928319, 0.0), - 152: (151.9315, 0.0), - 153: (152.93384, 0.0), - 154: (153.93752, 0.0), - 155: (154.94012, 0.0), - 156: (155.94427, 0.0), - 157: (156.94743, 0.0), - 158: (157.95198, 0.0), - 159: (158.9555, 0.0)}, - 'Pt': {0: (194.9647911, 1.0), - 166: (165.99486, 0.0), - 167: (166.99298, 0.0), - 168: (167.98815, 0.0), - 169: (168.98672, 0.0), - 170: (169.982495, 0.0), - 171: (170.98124, 0.0), - 172: (171.977347, 0.0), - 173: (172.97644, 0.0), - 174: (173.972819, 0.0), - 175: (174.972421, 0.0), - 176: (175.968945, 0.0), - 177: (176.968469, 0.0), - 178: (177.965649, 0.0), - 179: (178.965363, 0.0), - 180: (179.963031, 0.0), - 181: (180.963097, 0.0), - 182: (181.961171, 0.0), - 183: (182.961597, 0.0), - 184: (183.959922, 0.0), - 185: (184.96062, 0.0), - 186: (185.959351, 0.0), - 187: (186.96059, 0.0), - 188: (187.959395, 0.0), - 189: (188.960834, 0.0), - 190: (189.959932, 0.00014), - 191: (190.961677, 0.0), - 192: (191.961038, 0.00782), - 193: (192.9629874, 0.0), - 194: (193.9626803, 0.32967), - 195: (194.9647911, 0.33832), - 196: (195.9649515, 0.25242), - 197: (196.9673402, 0.0), - 198: (197.967893, 0.07163), - 199: (198.970593, 0.0), - 200: (199.971441, 0.0), - 201: (200.97451, 0.0), - 202: (201.97574, 0.0)}, - 'Pu': {0: (244, 1.0), - 228: (228.03874, 0.0), - 229: (229.04015, 0.0), - 230: (230.03965, 0.0), - 231: (231.041101, 0.0), - 232: (232.041187, 0.0), - 233: (233.043, 0.0), - 234: (234.043317, 0.0), - 235: (235.045286, 0.0), - 236: (236.046058, 0.0), - 237: (237.0484097, 0.0), - 238: (238.0495599, 0.0), - 239: (239.0521634, 0.0), - 240: (240.0538135, 0.0), - 241: (241.0568515, 0.0), - 242: (242.0587426, 0.0), - 243: (243.062003, 0.0), - 244: (244.064204, 0.0), - 245: (245.067747, 0.0), - 246: (246.070205, 0.0), - 247: (247.07407, 0.0)}, - 'Ra': {0: (226, 1.0), - 202: (202.00989, 0.0), - 203: (203.00927, 0.0), - 204: (204.0065, 0.0), - 205: (205.00627, 0.0), - 206: (206.003827, 0.0), - 207: (207.0038, 0.0), - 208: (208.00184, 0.0), - 209: (209.00199, 0.0), - 210: (210.000495, 0.0), - 211: (211.000898, 0.0), - 212: (211.999794, 0.0), - 213: (213.000384, 0.0), - 214: (214.000108, 0.0), - 215: (215.00272, 0.0), - 216: (216.003533, 0.0), - 217: (217.00632, 0.0), - 218: (218.00714, 0.0), - 219: (219.010085, 0.0), - 220: (220.011028, 0.0), - 221: (221.013917, 0.0), - 222: (222.015375, 0.0), - 223: (223.0185022, 0.0), - 224: (224.0202118, 0.0), - 225: (225.023612, 0.0), - 
226: (226.0254098, 0.0), - 227: (227.0291778, 0.0), - 228: (228.0310703, 0.0), - 229: (229.034958, 0.0), - 230: (230.037056, 0.0), - 231: (231.04122, 0.0), - 232: (232.04364, 0.0), - 233: (233.04806, 0.0), - 234: (234.0507, 0.0)}, - 'Rb': {0: (84.911789738, 1.0), - 71: (70.96532, 0.0), - 72: (71.95908, 0.0), - 73: (72.95056, 0.0), - 74: (73.944265, 0.0), - 75: (74.93857, 0.0), - 76: (75.9350722, 0.0), - 77: (76.930408, 0.0), - 78: (77.928141, 0.0), - 79: (78.923989, 0.0), - 80: (79.922519, 0.0), - 81: (80.918996, 0.0), - 82: (81.9182086, 0.0), - 83: (82.91511, 0.0), - 84: (83.914385, 0.0), - 85: (84.911789738, 0.7217), - 86: (85.91116742, 0.0), - 87: (86.909180527, 0.2783), - 88: (87.91131559, 0.0), - 89: (88.912278, 0.0), - 90: (89.914802, 0.0), - 91: (90.916537, 0.0), - 92: (91.919729, 0.0), - 93: (92.922042, 0.0), - 94: (93.926405, 0.0), - 95: (94.929303, 0.0), - 96: (95.93427, 0.0), - 97: (96.93735, 0.0), - 98: (97.94179, 0.0), - 99: (98.94538, 0.0), - 100: (99.94987, 0.0), - 101: (100.9532, 0.0), - 102: (101.95887, 0.0)}, - 'Re': {0: (186.9557531, 1.0), - 160: (159.98212, 0.0), - 161: (160.97759, 0.0), - 162: (161.976, 0.0), - 163: (162.972081, 0.0), - 164: (163.97032, 0.0), - 165: (164.967089, 0.0), - 166: (165.96581, 0.0), - 167: (166.9626, 0.0), - 168: (167.96157, 0.0), - 169: (168.95879, 0.0), - 170: (169.95822, 0.0), - 171: (170.95572, 0.0), - 172: (171.95542, 0.0), - 173: (172.95324, 0.0), - 174: (173.95312, 0.0), - 175: (174.95138, 0.0), - 176: (175.95162, 0.0), - 177: (176.95033, 0.0), - 178: (177.95099, 0.0), - 179: (178.949988, 0.0), - 180: (179.950789, 0.0), - 181: (180.950068, 0.0), - 182: (181.95121, 0.0), - 183: (182.95082, 0.0), - 184: (183.952521, 0.0), - 185: (184.952955, 0.374), - 186: (185.9549861, 0.0), - 187: (186.9557531, 0.626), - 188: (187.9581144, 0.0), - 189: (188.959229, 0.0), - 190: (189.96182, 0.0), - 191: (190.963125, 0.0), - 192: (191.96596, 0.0), - 193: (192.96747, 0.0), - 194: (193.97042, 0.0)}, - 'Rf': {0: (265, 1.0), - 253: (253.10069, 0.0), - 254: (254.10018, 0.0), - 255: (255.10134, 0.0), - 256: (256.101166, 0.0), - 257: (257.10299, 0.0), - 258: (258.10349, 0.0), - 259: (259.10564, 0.0), - 260: (260.10644, 0.0), - 261: (261.10877, 0.0), - 262: (262.10993, 0.0), - 263: (263.11255, 0.0), - 264: (264.11399, 0.0), - 265: (265.1167, 0.0), - 266: (266.11796, 0.0), - 267: (267.12153, 0.0), - 268: (268.12364, 0.0)}, - 'Rg': {0: (280, 1.0), - 272: (272.15362, 0.0), - 273: (273.15368, 0.0), - 274: (274.15571, 0.0), - 275: (275.15614, 0.0), - 276: (276.15849, 0.0), - 277: (277.15952, 0.0), - 278: (278.1616, 0.0), - 279: (279.16247, 0.0), - 280: (280.16447, 0.0), - 281: (281.16537, 0.0), - 282: (282.16749, 0.0), - 283: (283.16842, 0.0)}, - 'Rh': {0: (102.905504, 1.0), - 89: (88.94884, 0.0), - 90: (89.94287, 0.0), - 91: (90.93655, 0.0), - 92: (91.93198, 0.0), - 93: (92.92574, 0.0), - 94: (93.9217, 0.0), - 95: (94.9159, 0.0), - 96: (95.914461, 0.0), - 97: (96.91134, 0.0), - 98: (97.910708, 0.0), - 99: (98.908132, 0.0), - 100: (99.908122, 0.0), - 101: (100.906164, 0.0), - 102: (101.906843, 0.0), - 103: (102.905504, 1.0), - 104: (103.906656, 0.0), - 105: (104.905694, 0.0), - 106: (105.907287, 0.0), - 107: (106.906748, 0.0), - 108: (107.90873, 0.0), - 109: (108.908737, 0.0), - 110: (109.91114, 0.0), - 111: (110.91159, 0.0), - 112: (111.91439, 0.0), - 113: (112.91553, 0.0), - 114: (113.91881, 0.0), - 115: (114.92033, 0.0), - 116: (115.92406, 0.0), - 117: (116.92598, 0.0), - 118: (117.93007, 0.0), - 119: (118.93211, 0.0), - 120: (119.93641, 0.0), - 121: (120.93872, 
0.0), - 122: (121.94321, 0.0)}, - 'Rn': {0: (222, 1.0), - 195: (195.00544, 0.0), - 196: (196.002115, 0.0), - 197: (197.00158, 0.0), - 198: (197.998679, 0.0), - 199: (198.99837, 0.0), - 200: (199.995699, 0.0), - 201: (200.99563, 0.0), - 202: (201.993263, 0.0), - 203: (202.993387, 0.0), - 204: (203.991429, 0.0), - 205: (204.99172, 0.0), - 206: (205.990214, 0.0), - 207: (206.990734, 0.0), - 208: (207.989642, 0.0), - 209: (208.990415, 0.0), - 210: (209.989696, 0.0), - 211: (210.990601, 0.0), - 212: (211.990704, 0.0), - 213: (212.993883, 0.0), - 214: (213.995363, 0.0), - 215: (214.998745, 0.0), - 216: (216.000274, 0.0), - 217: (217.003928, 0.0), - 218: (218.0056013, 0.0), - 219: (219.0094802, 0.0), - 220: (220.011394, 0.0), - 221: (221.015537, 0.0), - 222: (222.0175777, 0.0), - 223: (223.02179, 0.0), - 224: (224.02409, 0.0), - 225: (225.02844, 0.0), - 226: (226.03089, 0.0), - 227: (227.03541, 0.0), - 228: (228.03799, 0.0)}, - 'Ru': {0: (101.9043493, 1.0), - 87: (86.94918, 0.0), - 88: (87.94026, 0.0), - 89: (88.93611, 0.0), - 90: (89.92989, 0.0), - 91: (90.92629, 0.0), - 92: (91.92012, 0.0), - 93: (92.91705, 0.0), - 94: (93.91136, 0.0), - 95: (94.910413, 0.0), - 96: (95.907598, 0.0554), - 97: (96.907555, 0.0), - 98: (97.905287, 0.0187), - 99: (98.9059393, 0.1276), - 100: (99.9042195, 0.126), - 101: (100.9055821, 0.1706), - 102: (101.9043493, 0.3155), - 103: (102.9063238, 0.0), - 104: (103.905433, 0.1862), - 105: (104.907753, 0.0), - 106: (105.907329, 0.0), - 107: (106.90991, 0.0), - 108: (107.91017, 0.0), - 109: (108.9132, 0.0), - 110: (109.91414, 0.0), - 111: (110.9177, 0.0), - 112: (111.91897, 0.0), - 113: (112.92249, 0.0), - 114: (113.92428, 0.0), - 115: (114.92869, 0.0), - 116: (115.93081, 0.0), - 117: (116.93558, 0.0), - 118: (117.93782, 0.0), - 119: (118.94284, 0.0), - 120: (119.94531, 0.0)}, - 'S': {0: (31.972071, 1.0), - 26: (26.02788, 0.0), - 27: (27.01883, 0.0), - 28: (28.00437, 0.0), - 29: (28.99661, 0.0), - 30: (29.984903, 0.0), - 31: (30.9795547, 0.0), - 32: (31.972071, 0.9499), - 33: (32.97145876, 0.0075), - 34: (33.9678669, 0.0425), - 35: (34.96903216, 0.0), - 36: (35.96708076, 0.0001), - 37: (36.97112557, 0.0), - 38: (37.971163, 0.0), - 39: (38.97513, 0.0), - 40: (39.97545, 0.0), - 41: (40.97958, 0.0), - 42: (41.98102, 0.0), - 43: (42.98715, 0.0), - 44: (43.99021, 0.0), - 45: (44.99651, 0.0), - 46: (46.00075, 0.0), - 47: (47.00859, 0.0), - 48: (48.01417, 0.0), - 49: (49.02362, 0.0)}, - 'Sb': {0: (120.9038157, 1.0), - 103: (102.93969, 0.0), - 104: (103.93647, 0.0), - 105: (104.93149, 0.0), - 106: (105.92879, 0.0), - 107: (106.92415, 0.0), - 108: (107.92216, 0.0), - 109: (108.918132, 0.0), - 110: (109.91675, 0.0), - 111: (110.91316, 0.0), - 112: (111.912398, 0.0), - 113: (112.909372, 0.0), - 114: (113.90927, 0.0), - 115: (114.906598, 0.0), - 116: (115.906794, 0.0), - 117: (116.904836, 0.0), - 118: (117.905529, 0.0), - 119: (118.903942, 0.0), - 120: (119.905072, 0.0), - 121: (120.9038157, 0.5721), - 122: (121.9051737, 0.0), - 123: (122.904214, 0.4279), - 124: (123.9059357, 0.0), - 125: (124.9052538, 0.0), - 126: (125.90725, 0.0), - 127: (126.906924, 0.0), - 128: (127.909169, 0.0), - 129: (128.909148, 0.0), - 130: (129.911656, 0.0), - 131: (130.911982, 0.0), - 132: (131.914467, 0.0), - 133: (132.915252, 0.0), - 134: (133.92038, 0.0), - 135: (134.92517, 0.0), - 136: (135.93035, 0.0), - 137: (136.93531, 0.0), - 138: (137.94079, 0.0), - 139: (138.94598, 0.0)}, - 'Sc': {0: (44.9559119, 1.0), - 36: (36.01492, 0.0), - 37: (37.00305, 0.0), - 38: (37.9947, 0.0), - 39: (38.98479, 0.0), - 40: 
(39.977967, 0.0), - 41: (40.96925113, 0.0), - 42: (41.96551643, 0.0), - 43: (42.9611507, 0.0), - 44: (43.9594028, 0.0), - 45: (44.9559119, 1.0), - 46: (45.9551719, 0.0), - 47: (46.9524075, 0.0), - 48: (47.952231, 0.0), - 49: (48.950024, 0.0), - 50: (49.952188, 0.0), - 51: (50.953603, 0.0), - 52: (51.95668, 0.0), - 53: (52.95961, 0.0), - 54: (53.96326, 0.0), - 55: (54.96824, 0.0), - 56: (55.97287, 0.0), - 57: (56.97779, 0.0), - 58: (57.98371, 0.0), - 59: (58.98922, 0.0), - 60: (59.99571, 0.0)}, - 'Se': {0: (79.9165213, 1.0), - 65: (64.96466, 0.0), - 66: (65.95521, 0.0), - 67: (66.95009, 0.0), - 68: (67.9418, 0.0), - 69: (68.93956, 0.0), - 70: (69.93339, 0.0), - 71: (70.93224, 0.0), - 72: (71.927112, 0.0), - 73: (72.926765, 0.0), - 74: (73.9224764, 0.0089), - 75: (74.9225234, 0.0), - 76: (75.9192136, 0.0937), - 77: (76.919914, 0.0763), - 78: (77.9173091, 0.2377), - 79: (78.9184991, 0.0), - 80: (79.9165213, 0.4961), - 81: (80.9179925, 0.0), - 82: (81.9166994, 0.0873), - 83: (82.919118, 0.0), - 84: (83.918462, 0.0), - 85: (84.92225, 0.0), - 86: (85.924272, 0.0), - 87: (86.92852, 0.0), - 88: (87.93142, 0.0), - 89: (88.93645, 0.0), - 90: (89.93996, 0.0), - 91: (90.94596, 0.0), - 92: (91.94992, 0.0), - 93: (92.95629, 0.0), - 94: (93.96049, 0.0)}, - 'Sg': {0: (271, 1.0), - 258: (258.11317, 0.0), - 259: (259.1145, 0.0), - 260: (260.11442, 0.0), - 261: (261.11612, 0.0), - 262: (262.1164, 0.0), - 263: (263.11832, 0.0), - 264: (264.11893, 0.0), - 265: (265.12111, 0.0), - 266: (266.12207, 0.0), - 267: (267.12443, 0.0), - 268: (268.12561, 0.0), - 269: (269.12876, 0.0), - 270: (270.13033, 0.0), - 271: (271.13347, 0.0), - 272: (272.13516, 0.0), - 273: (273.13822, 0.0)}, - 'Si': {0: (27.9769265325, 1.0), - 22: (22.03453, 0.0), - 23: (23.02552, 0.0), - 24: (24.011546, 0.0), - 25: (25.004106, 0.0), - 26: (25.99233, 0.0), - 27: (26.98670491, 0.0), - 28: (27.9769265325, 0.92223), - 29: (28.9764947, 0.04685), - 30: (29.97377017, 0.03092), - 31: (30.97536323, 0.0), - 32: (31.97414808, 0.0), - 33: (32.978, 0.0), - 34: (33.978576, 0.0), - 35: (34.98458, 0.0), - 36: (35.9866, 0.0), - 37: (36.99294, 0.0), - 38: (37.99563, 0.0), - 39: (39.00207, 0.0), - 40: (40.00587, 0.0), - 41: (41.01456, 0.0), - 42: (42.01979, 0.0), - 43: (43.02866, 0.0), - 44: (44.03526, 0.0)}, - 'Sm': {0: (151.9197324, 1.0), - 128: (127.95808, 0.0), - 129: (128.95464, 0.0), - 130: (129.94892, 0.0), - 131: (130.94611, 0.0), - 132: (131.94069, 0.0), - 133: (132.93867, 0.0), - 134: (133.93397, 0.0), - 135: (134.93252, 0.0), - 136: (135.928276, 0.0), - 137: (136.92697, 0.0), - 138: (137.923244, 0.0), - 139: (138.922297, 0.0), - 140: (139.918995, 0.0), - 141: (140.918476, 0.0), - 142: (141.915198, 0.0), - 143: (142.914628, 0.0), - 144: (143.911999, 0.0307), - 145: (144.91341, 0.0), - 146: (145.913041, 0.0), - 147: (146.9148979, 0.1499), - 148: (147.9148227, 0.1124), - 149: (148.9171847, 0.1382), - 150: (149.9172755, 0.0738), - 151: (150.9199324, 0.0), - 152: (151.9197324, 0.2675), - 153: (152.9220974, 0.0), - 154: (153.9222093, 0.2275), - 155: (154.9246402, 0.0), - 156: (155.925528, 0.0), - 157: (156.92836, 0.0), - 158: (157.92999, 0.0), - 159: (158.93321, 0.0), - 160: (159.93514, 0.0), - 161: (160.93883, 0.0), - 162: (161.94122, 0.0), - 163: (162.94536, 0.0), - 164: (163.94828, 0.0), - 165: (164.95298, 0.0)}, - 'Sn': {0: (119.9021947, 1.0), - 99: (98.94933, 0.0), - 100: (99.93904, 0.0), - 101: (100.93606, 0.0), - 102: (101.9303, 0.0), - 103: (102.9281, 0.0), - 104: (103.92314, 0.0), - 105: (104.92135, 0.0), - 106: (105.91688, 0.0), - 107: 
(106.91564, 0.0), - 108: (107.911925, 0.0), - 109: (108.911283, 0.0), - 110: (109.907843, 0.0), - 111: (110.907734, 0.0), - 112: (111.904818, 0.0097), - 113: (112.905171, 0.0), - 114: (113.902779, 0.0066), - 115: (114.903342, 0.0034), - 116: (115.901741, 0.1454), - 117: (116.902952, 0.0768), - 118: (117.901603, 0.2422), - 119: (118.903308, 0.0859), - 120: (119.9021947, 0.3258), - 121: (120.9042355, 0.0), - 122: (121.903439, 0.0463), - 123: (122.9057208, 0.0), - 124: (123.9052739, 0.0579), - 125: (124.9077841, 0.0), - 126: (125.907653, 0.0), - 127: (126.91036, 0.0), - 128: (127.910537, 0.0), - 129: (128.91348, 0.0), - 130: (129.913967, 0.0), - 131: (130.917, 0.0), - 132: (131.917816, 0.0), - 133: (132.92383, 0.0), - 134: (133.92829, 0.0), - 135: (134.93473, 0.0), - 136: (135.93934, 0.0), - 137: (136.94599, 0.0)}, - 'Sr': {0: (87.9056121, 1.0), - 73: (72.96597, 0.0), - 74: (73.95631, 0.0), - 75: (74.94995, 0.0), - 76: (75.94177, 0.0), - 77: (76.937945, 0.0), - 78: (77.93218, 0.0), - 79: (78.929708, 0.0), - 80: (79.924521, 0.0), - 81: (80.923212, 0.0), - 82: (81.918402, 0.0), - 83: (82.917557, 0.0), - 84: (83.913425, 0.0056), - 85: (84.912933, 0.0), - 86: (85.9092602, 0.0986), - 87: (86.9088771, 0.07), - 88: (87.9056121, 0.8258), - 89: (88.9074507, 0.0), - 90: (89.907738, 0.0), - 91: (90.910203, 0.0), - 92: (91.911038, 0.0), - 93: (92.914026, 0.0), - 94: (93.915361, 0.0), - 95: (94.919359, 0.0), - 96: (95.921697, 0.0), - 97: (96.926153, 0.0), - 98: (97.928453, 0.0), - 99: (98.93324, 0.0), - 100: (99.93535, 0.0), - 101: (100.94052, 0.0), - 102: (101.94302, 0.0), - 103: (102.94895, 0.0), - 104: (103.95233, 0.0), - 105: (104.95858, 0.0)}, - 'Ta': {0: (180.9479958, 1.0), - 155: (154.97459, 0.0), - 156: (155.9723, 0.0), - 157: (156.96819, 0.0), - 158: (157.9667, 0.0), - 159: (158.963018, 0.0), - 160: (159.96149, 0.0), - 161: (160.95842, 0.0), - 162: (161.95729, 0.0), - 163: (162.95433, 0.0), - 164: (163.95353, 0.0), - 165: (164.950773, 0.0), - 166: (165.95051, 0.0), - 167: (166.94809, 0.0), - 168: (167.94805, 0.0), - 169: (168.94601, 0.0), - 170: (169.94618, 0.0), - 171: (170.94448, 0.0), - 172: (171.9449, 0.0), - 173: (172.94375, 0.0), - 174: (173.94445, 0.0), - 175: (174.94374, 0.0), - 176: (175.94486, 0.0), - 177: (176.944472, 0.0), - 178: (177.945778, 0.0), - 179: (178.9459295, 0.0), - 180: (179.9474648, 0.00012), - 181: (180.9479958, 0.99988), - 182: (181.9501518, 0.0), - 183: (182.9513726, 0.0), - 184: (183.954008, 0.0), - 185: (184.955559, 0.0), - 186: (185.95855, 0.0), - 187: (186.96053, 0.0), - 188: (187.9637, 0.0), - 189: (188.96583, 0.0), - 190: (189.96923, 0.0)}, - 'Tb': {0: (158.9253468, 1.0), - 136: (135.96138, 0.0), - 137: (136.95598, 0.0), - 138: (137.95316, 0.0), - 139: (138.94829, 0.0), - 140: (139.94581, 0.0), - 141: (140.94145, 0.0), - 142: (141.93874, 0.0), - 143: (142.93512, 0.0), - 144: (143.93305, 0.0), - 145: (144.92927, 0.0), - 146: (145.92725, 0.0), - 147: (146.924045, 0.0), - 148: (147.924272, 0.0), - 149: (148.923246, 0.0), - 150: (149.92366, 0.0), - 151: (150.923103, 0.0), - 152: (151.92407, 0.0), - 153: (152.923435, 0.0), - 154: (153.92468, 0.0), - 155: (154.923505, 0.0), - 156: (155.924747, 0.0), - 157: (156.9240246, 0.0), - 158: (157.9254131, 0.0), - 159: (158.9253468, 1.0), - 160: (159.9271676, 0.0), - 161: (160.9275699, 0.0), - 162: (161.92949, 0.0), - 163: (162.930648, 0.0), - 164: (163.93335, 0.0), - 165: (164.93488, 0.0), - 166: (165.93799, 0.0), - 167: (166.94005, 0.0), - 168: (167.94364, 0.0), - 169: (168.94622, 0.0), - 170: (169.95025, 0.0), - 171: 
(170.9533, 0.0)}, - 'Tc': {0: (98, 1.0), - 85: (84.94883, 0.0), - 86: (85.94288, 0.0), - 87: (86.93653, 0.0), - 88: (87.93268, 0.0), - 89: (88.92717, 0.0), - 90: (89.92356, 0.0), - 91: (90.91843, 0.0), - 92: (91.91526, 0.0), - 93: (92.910249, 0.0), - 94: (93.909657, 0.0), - 95: (94.907657, 0.0), - 96: (95.907871, 0.0), - 97: (96.906365, 0.0), - 98: (97.907216, 0.0), - 99: (98.9062547, 0.0), - 100: (99.9076578, 0.0), - 101: (100.907315, 0.0), - 102: (101.909215, 0.0), - 103: (102.909181, 0.0), - 104: (103.91145, 0.0), - 105: (104.91166, 0.0), - 106: (105.914358, 0.0), - 107: (106.91508, 0.0), - 108: (107.91846, 0.0), - 109: (108.91998, 0.0), - 110: (109.92382, 0.0), - 111: (110.92569, 0.0), - 112: (111.92915, 0.0), - 113: (112.93159, 0.0), - 114: (113.93588, 0.0), - 115: (114.93869, 0.0), - 116: (115.94337, 0.0), - 117: (116.94648, 0.0), - 118: (117.95148, 0.0)}, - 'Te': {0: (129.9062244, 1.0), - 105: (104.94364, 0.0), - 106: (105.9375, 0.0), - 107: (106.93501, 0.0), - 108: (107.92944, 0.0), - 109: (108.92742, 0.0), - 110: (109.92241, 0.0), - 111: (110.92111, 0.0), - 112: (111.91701, 0.0), - 113: (112.91589, 0.0), - 114: (113.91209, 0.0), - 115: (114.9119, 0.0), - 116: (115.90846, 0.0), - 117: (116.908645, 0.0), - 118: (117.905828, 0.0), - 119: (118.906404, 0.0), - 120: (119.90402, 0.0009), - 121: (120.904936, 0.0), - 122: (121.9030439, 0.0255), - 123: (122.90427, 0.0089), - 124: (123.9028179, 0.0474), - 125: (124.9044307, 0.0707), - 126: (125.9033117, 0.1884), - 127: (126.9052263, 0.0), - 128: (127.9044631, 0.3174), - 129: (128.9065982, 0.0), - 130: (129.9062244, 0.3408), - 131: (130.9085239, 0.0), - 132: (131.908553, 0.0), - 133: (132.910955, 0.0), - 134: (133.911369, 0.0), - 135: (134.91645, 0.0), - 136: (135.9201, 0.0), - 137: (136.92532, 0.0), - 138: (137.92922, 0.0), - 139: (138.93473, 0.0), - 140: (139.93885, 0.0), - 141: (140.94465, 0.0), - 142: (141.94908, 0.0)}, - 'Th': {0: (232.0380553, 1.0), - 209: (209.01772, 0.0), - 210: (210.015075, 0.0), - 211: (211.01493, 0.0), - 212: (212.01298, 0.0), - 213: (213.01301, 0.0), - 214: (214.0115, 0.0), - 215: (215.01173, 0.0), - 216: (216.011062, 0.0), - 217: (217.013114, 0.0), - 218: (218.013284, 0.0), - 219: (219.01554, 0.0), - 220: (220.015748, 0.0), - 221: (221.018184, 0.0), - 222: (222.018468, 0.0), - 223: (223.020811, 0.0), - 224: (224.021467, 0.0), - 225: (225.023951, 0.0), - 226: (226.024903, 0.0), - 227: (227.0277041, 0.0), - 228: (228.0287411, 0.0), - 229: (229.031762, 0.0), - 230: (230.0331338, 0.0), - 231: (231.0363043, 0.0), - 232: (232.0380553, 1.0), - 233: (233.0415818, 0.0), - 234: (234.043601, 0.0), - 235: (235.04751, 0.0), - 236: (236.04987, 0.0), - 237: (237.05389, 0.0), - 238: (238.0565, 0.0)}, - 'Ti': {0: (47.9479463, 1.0), - 38: (38.00977, 0.0), - 39: (39.00161, 0.0), - 40: (39.9905, 0.0), - 41: (40.98315, 0.0), - 42: (41.973031, 0.0), - 43: (42.968522, 0.0), - 44: (43.9596901, 0.0), - 45: (44.9581256, 0.0), - 46: (45.9526316, 0.0825), - 47: (46.9517631, 0.0744), - 48: (47.9479463, 0.7372), - 49: (48.94787, 0.0541), - 50: (49.9447912, 0.0518), - 51: (50.946615, 0.0), - 52: (51.946897, 0.0), - 53: (52.94973, 0.0), - 54: (53.95105, 0.0), - 55: (54.95527, 0.0), - 56: (55.9582, 0.0), - 57: (56.96399, 0.0), - 58: (57.96697, 0.0), - 59: (58.97293, 0.0), - 60: (59.97676, 0.0), - 61: (60.9832, 0.0), - 62: (61.98749, 0.0), - 63: (62.99442, 0.0)}, - 'Tl': {0: (204.9744275, 1.0), - 176: (176.00059, 0.0), - 177: (176.996427, 0.0), - 178: (177.9949, 0.0), - 179: (178.99109, 0.0), - 180: (179.98991, 0.0), - 181: (180.986257, 0.0), - 
182: (181.98567, 0.0), - 183: (182.982193, 0.0), - 184: (183.98187, 0.0), - 185: (184.97879, 0.0), - 186: (185.97833, 0.0), - 187: (186.975906, 0.0), - 188: (187.97601, 0.0), - 189: (188.973588, 0.0), - 190: (189.97388, 0.0), - 191: (190.971786, 0.0), - 192: (191.97223, 0.0), - 193: (192.97067, 0.0), - 194: (193.9712, 0.0), - 195: (194.969774, 0.0), - 196: (195.970481, 0.0), - 197: (196.969575, 0.0), - 198: (197.97048, 0.0), - 199: (198.96988, 0.0), - 200: (199.970963, 0.0), - 201: (200.970819, 0.0), - 202: (201.972106, 0.0), - 203: (202.9723442, 0.2952), - 204: (203.9738635, 0.0), - 205: (204.9744275, 0.7048), - 206: (205.9761103, 0.0), - 207: (206.977419, 0.0), - 208: (207.9820187, 0.0), - 209: (208.985359, 0.0), - 210: (209.990074, 0.0), - 211: (210.99348, 0.0), - 212: (211.99823, 0.0)}, - 'Tm': {0: (168.9342133, 1.0), - 145: (144.97007, 0.0), - 146: (145.96643, 0.0), - 147: (146.96096, 0.0), - 148: (147.95784, 0.0), - 149: (148.95272, 0.0), - 150: (149.94996, 0.0), - 151: (150.945483, 0.0), - 152: (151.94442, 0.0), - 153: (152.942012, 0.0), - 154: (153.941568, 0.0), - 155: (154.939199, 0.0), - 156: (155.93898, 0.0), - 157: (156.93697, 0.0), - 158: (157.93698, 0.0), - 159: (158.93498, 0.0), - 160: (159.93526, 0.0), - 161: (160.93355, 0.0), - 162: (161.933995, 0.0), - 163: (162.932651, 0.0), - 164: (163.93356, 0.0), - 165: (164.932435, 0.0), - 166: (165.933554, 0.0), - 167: (166.9328516, 0.0), - 168: (167.934173, 0.0), - 169: (168.9342133, 1.0), - 170: (169.9358014, 0.0), - 171: (170.9364294, 0.0), - 172: (171.9384, 0.0), - 173: (172.939604, 0.0), - 174: (173.94217, 0.0), - 175: (174.94384, 0.0), - 176: (175.94699, 0.0), - 177: (176.94904, 0.0), - 178: (177.95264, 0.0), - 179: (178.95534, 0.0)}, - 'U': {0: (238.0507882, 1.0), - 217: (217.02437, 0.0), - 218: (218.02354, 0.0), - 219: (219.02492, 0.0), - 220: (220.02472, 0.0), - 221: (221.0264, 0.0), - 222: (222.02609, 0.0), - 223: (223.02774, 0.0), - 224: (224.027605, 0.0), - 225: (225.029391, 0.0), - 226: (226.029339, 0.0), - 227: (227.031156, 0.0), - 228: (228.031374, 0.0), - 229: (229.033506, 0.0), - 230: (230.03394, 0.0), - 231: (231.036294, 0.0), - 232: (232.0371562, 0.0), - 233: (233.0396352, 0.0), - 234: (234.0409521, 5.4e-05), - 235: (235.0439299, 0.007204), - 236: (236.045568, 0.0), - 237: (237.0487302, 0.0), - 238: (238.0507882, 0.992742), - 239: (239.0542933, 0.0), - 240: (240.056592, 0.0), - 241: (241.06033, 0.0), - 242: (242.06293, 0.0)}, - 'Uuh': {0: (293, 1.0), - 289: (289.19886, 0.0), - 290: (290.19859, 0.0), - 291: (291.20001, 0.0), - 292: (292.19979, 0.0)}, - 'Uuo': {0: (294, 1.0), 293: (293.21467, 0.0)}, - 'Uup': {0: (288, 1.0), - 287: (287.19119, 0.0), - 288: (288.19249, 0.0), - 289: (289.19272, 0.0), - 290: (290.19414, 0.0), - 291: (291.19438, 0.0)}, - 'Uuq': {0: (289, 1.0), - 285: (285.1837, 0.0), - 286: (286.18386, 0.0), - 287: (287.1856, 0.0), - 288: (288.18569, 0.0), - 289: (289.18728, 0.0)}, - 'Uus': {0: (292, 1.0), 291: (291.20656, 0.0), 292: (292.20755, 0.0)}, - 'Uut': {0: (284, 1.0), - 283: (283.17645, 0.0), - 284: (284.17808, 0.0), - 285: (285.17873, 0.0), - 286: (286.18048, 0.0), - 287: (287.18105, 0.0)}, - 'V': {0: (50.9439595, 1.0), - 40: (40.01109, 0.0), - 41: (40.99978, 0.0), - 42: (41.99123, 0.0), - 43: (42.98065, 0.0), - 44: (43.97411, 0.0), - 45: (44.965776, 0.0), - 46: (45.9602005, 0.0), - 47: (46.9549089, 0.0), - 48: (47.9522537, 0.0), - 49: (48.9485161, 0.0), - 50: (49.9471585, 0.0025), - 51: (50.9439595, 0.9975), - 52: (51.9447755, 0.0), - 53: (52.944338, 0.0), - 54: (53.94644, 0.0), - 55: 
(54.94723, 0.0), - 56: (55.95053, 0.0), - 57: (56.95256, 0.0), - 58: (57.95683, 0.0), - 59: (58.96021, 0.0), - 60: (59.96503, 0.0), - 61: (60.96848, 0.0), - 62: (61.97378, 0.0), - 63: (62.97755, 0.0), - 64: (63.98347, 0.0), - 65: (64.98792, 0.0)}, - 'W': {0: (183.9509312, 1.0), - 158: (157.97456, 0.0), - 159: (158.97292, 0.0), - 160: (159.96848, 0.0), - 161: (160.96736, 0.0), - 162: (161.963497, 0.0), - 163: (162.96252, 0.0), - 164: (163.958954, 0.0), - 165: (164.95828, 0.0), - 166: (165.955027, 0.0), - 167: (166.954816, 0.0), - 168: (167.951808, 0.0), - 169: (168.951779, 0.0), - 170: (169.949228, 0.0), - 171: (170.94945, 0.0), - 172: (171.94729, 0.0), - 173: (172.94769, 0.0), - 174: (173.94608, 0.0), - 175: (174.94672, 0.0), - 176: (175.94563, 0.0), - 177: (176.94664, 0.0), - 178: (177.945876, 0.0), - 179: (178.94707, 0.0), - 180: (179.946704, 0.0012), - 181: (180.948197, 0.0), - 182: (181.9482042, 0.265), - 183: (182.950223, 0.1431), - 184: (183.9509312, 0.3064), - 185: (184.9534193, 0.0), - 186: (185.9543641, 0.2843), - 187: (186.9571605, 0.0), - 188: (187.958489, 0.0), - 189: (188.96191, 0.0), - 190: (189.96318, 0.0), - 191: (190.9666, 0.0), - 192: (191.96817, 0.0)}, - 'Xe': {0: (131.9041535, 1.0), - 110: (109.94428, 0.0), - 111: (110.9416, 0.0), - 112: (111.93562, 0.0), - 113: (112.93334, 0.0), - 114: (113.92798, 0.0), - 115: (114.926294, 0.0), - 116: (115.921581, 0.0), - 117: (116.920359, 0.0), - 118: (117.916179, 0.0), - 119: (118.915411, 0.0), - 120: (119.911784, 0.0), - 121: (120.911462, 0.0), - 122: (121.908368, 0.0), - 123: (122.908482, 0.0), - 124: (123.905893, 0.000952), - 125: (124.9063955, 0.0), - 126: (125.904274, 0.00089), - 127: (126.905184, 0.0), - 128: (127.9035313, 0.019102), - 129: (128.9047794, 0.264006), - 130: (129.903508, 0.04071), - 131: (130.9050824, 0.212324), - 132: (131.9041535, 0.269086), - 133: (132.9059107, 0.0), - 134: (133.9053945, 0.104357), - 135: (134.907227, 0.0), - 136: (135.907219, 0.088573), - 137: (136.911562, 0.0), - 138: (137.91395, 0.0), - 139: (138.918793, 0.0), - 140: (139.92164, 0.0), - 141: (140.92665, 0.0), - 142: (141.92971, 0.0), - 143: (142.93511, 0.0), - 144: (143.93851, 0.0), - 145: (144.94407, 0.0), - 146: (145.94775, 0.0), - 147: (146.95356, 0.0)}, - 'Y': {0: (88.9058483, 1.0), - 76: (75.95845, 0.0), - 77: (76.94965, 0.0), - 78: (77.94361, 0.0), - 79: (78.93735, 0.0), - 80: (79.93428, 0.0), - 81: (80.92913, 0.0), - 82: (81.92679, 0.0), - 83: (82.92235, 0.0), - 84: (83.92039, 0.0), - 85: (84.916433, 0.0), - 86: (85.914886, 0.0), - 87: (86.9108757, 0.0), - 88: (87.9095011, 0.0), - 89: (88.9058483, 1.0), - 90: (89.9071519, 0.0), - 91: (90.907305, 0.0), - 92: (91.908949, 0.0), - 93: (92.909583, 0.0), - 94: (93.911595, 0.0), - 95: (94.912821, 0.0), - 96: (95.915891, 0.0), - 97: (96.918134, 0.0), - 98: (97.922203, 0.0), - 99: (98.924636, 0.0), - 100: (99.92776, 0.0), - 101: (100.93031, 0.0), - 102: (101.93356, 0.0), - 103: (102.93673, 0.0), - 104: (103.94105, 0.0), - 105: (104.94487, 0.0), - 106: (105.94979, 0.0), - 107: (106.95414, 0.0), - 108: (107.95948, 0.0)}, - 'Yb': {0: (173.9388621, 1.0), - 148: (147.96742, 0.0), - 149: (148.96404, 0.0), - 150: (149.95842, 0.0), - 151: (150.9554, 0.0), - 152: (151.95029, 0.0), - 153: (152.94948, 0.0), - 154: (153.946394, 0.0), - 155: (154.945782, 0.0), - 156: (155.942818, 0.0), - 157: (156.942628, 0.0), - 158: (157.939866, 0.0), - 159: (158.94005, 0.0), - 160: (159.937552, 0.0), - 161: (160.937902, 0.0), - 162: (161.935768, 0.0), - 163: (162.936334, 0.0), - 164: (163.934489, 0.0), - 165: 
(164.93528, 0.0), - 166: (165.933882, 0.0), - 167: (166.93495, 0.0), - 168: (167.933897, 0.0013), - 169: (168.93519, 0.0), - 170: (169.9347618, 0.0304), - 171: (170.9363258, 0.1428), - 172: (171.9363815, 0.2183), - 173: (172.9382108, 0.1613), - 174: (173.9388621, 0.3183), - 175: (174.9412765, 0.0), - 176: (175.9425717, 0.1276), - 177: (176.9452608, 0.0), - 178: (177.946647, 0.0), - 179: (178.95017, 0.0), - 180: (179.95233, 0.0), - 181: (180.95615, 0.0)}, - 'Zn': {0: (63.9291422, 1.0), - 54: (53.99295, 0.0), - 55: (54.98398, 0.0), - 56: (55.97238, 0.0), - 57: (56.96479, 0.0), - 58: (57.95459, 0.0), - 59: (58.94926, 0.0), - 60: (59.941827, 0.0), - 61: (60.939511, 0.0), - 62: (61.93433, 0.0), - 63: (62.9332116, 0.0), - 64: (63.9291422, 0.48268), - 65: (64.929241, 0.0), - 66: (65.9260334, 0.27975), - 67: (66.9271273, 0.04102), - 68: (67.9248442, 0.19024), - 69: (68.9265503, 0.0), - 70: (69.9253193, 0.00631), - 71: (70.927722, 0.0), - 72: (71.926858, 0.0), - 73: (72.92978, 0.0), - 74: (73.92946, 0.0), - 75: (74.93294, 0.0), - 76: (75.93329, 0.0), - 77: (76.93696, 0.0), - 78: (77.93844, 0.0), - 79: (78.94265, 0.0), - 80: (79.94434, 0.0), - 81: (80.95048, 0.0), - 82: (81.95442, 0.0), - 83: (82.96103, 0.0)}, - 'Zr': {0: (89.9047044, 1.0), - 78: (77.95523, 0.0), - 79: (78.94916, 0.0), - 80: (79.9404, 0.0), - 81: (80.93721, 0.0), - 82: (81.93109, 0.0), - 83: (82.92865, 0.0), - 84: (83.92325, 0.0), - 85: (84.92147, 0.0), - 86: (85.91647, 0.0), - 87: (86.914816, 0.0), - 88: (87.910227, 0.0), - 89: (88.90889, 0.0), - 90: (89.9047044, 0.5145), - 91: (90.9056458, 0.1122), - 92: (91.9050408, 0.1715), - 93: (92.906476, 0.0), - 94: (93.9063152, 0.1738), - 95: (94.9080426, 0.0), - 96: (95.9082734, 0.028), - 97: (96.9109531, 0.0), - 98: (97.912735, 0.0), - 99: (98.916512, 0.0), - 100: (99.91776, 0.0), - 101: (100.92114, 0.0), - 102: (101.92298, 0.0), - 103: (102.9266, 0.0), - 104: (103.92878, 0.0), - 105: (104.93305, 0.0), - 106: (105.93591, 0.0), - 107: (106.94075, 0.0), - 108: (107.94396, 0.0), - 109: (108.94924, 0.0), - 110: (109.95287, 0.0)}, - 'e*': {0: (0.00054857990943, 1.0)}} diff --git a/pyteomics/auxiliary/file_helpers.py b/pyteomics/auxiliary/file_helpers.py deleted file mode 100644 index d29e106532c78eb9ae04ed193b01c4f1a4fae064..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/file_helpers.py +++ /dev/null @@ -1,1250 +0,0 @@ -import sys -import codecs -import re -from functools import wraps -from contextlib import contextmanager -from collections import OrderedDict, defaultdict -import json -import multiprocessing as mp -import threading -import warnings -import os -from abc import ABCMeta - -try: - basestring -except NameError: - basestring = (str, bytes) - -try: - import pandas as pd -except ImportError: - pd = None - -try: - import numpy as np -except ImportError: - np = None - -try: - import dill -except ImportError: - dill = None - try: - import cPickle as pickle - except ImportError: - import pickle - serializer = pickle -else: - serializer = dill - -try: - from queue import Empty -except ImportError: - from Queue import Empty - -try: - from collections.abc import Sequence -except ImportError: - from collections import Sequence - -from .structures import PyteomicsError -from .utils import add_metaclass - - -def _keepstate(func): - """Decorator to help keep the position in open files passed as - positional arguments to functions""" - @wraps(func) - def wrapped(*args, **kwargs): - positions = [getattr(arg, 'seek', None) and getattr(arg, 'tell', type(None))() for arg in args] 
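# The nested mapping above follows one convention throughout: for each element
# symbol, key 0 holds the reference isotope (the most abundant one, or the most
# stable one for elements with no stable isotope), and every other key is a
# mass number mapping to an (accurate mass, natural abundance) pair. A minimal
# sketch of how such a table is consumed; `nist_mass` is an assumed name for
# the full dictionary, excerpted here to two elements:
nist_mass = {
    'H': {0: (1.00782503207, 1.0), 1: (1.00782503207, 0.999885), 2: (2.0141017778, 0.000115)},
    'O': {0: (15.99491461956, 1.0), 16: (15.99491461956, 0.99757)},
}

def monoisotopic_mass(composition, mass_table):
    # Sum the reference-isotope masses (key 0) over a {symbol: count} composition.
    return sum(mass_table[elem][0][0] * n for elem, n in composition.items())

print(monoisotopic_mass({'H': 2, 'O': 1}, nist_mass))  # water: ~18.0105646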
- for arg, pos in zip(args, positions): - if pos is not None: - arg.seek(0) - res = func(*args, **kwargs) - for arg, pos in zip(args, positions): - if pos is not None: - try: - arg.seek(pos) - except ValueError: - pass - return res - return wrapped - - -def _keepstate_method(func): - """Decorator for :py:class:`FileReader` methods to help keep the position - in the underlying file. - """ - @wraps(func) - def wrapped(self, *args, **kwargs): - position = self.tell() - self.seek(0) - try: - return func(self, *args, **kwargs) - finally: - self.seek(position) - return wrapped - - -class _file_obj(object): - """Check if `f` is a file name and open the file in `mode`. - A context manager.""" - - def __init__(self, f, mode, encoding=None): - self._file_spec = None - self.mode = mode - if f is None: - self.file = {'r': sys.stdin, 'a': sys.stdout, 'w': sys.stdout - }[mode[0]] - self._file_spec = None - elif isinstance(f, basestring): - self.file = codecs.open(f, mode, encoding) - self._file_spec = f - else: - self._file_spec = f - self.file = f - self.encoding = getattr(self.file, 'encoding', encoding) - self.close_file = (self.file is not f) - - def __enter__(self): - return self - - def __reduce_ex__(self, protocol): - return self.__class__, (self._file_spec, self.mode, self.encoding) - - def __exit__(self, *args, **kwargs): - if (not self.close_file) or self._file_spec is None: - return # do nothing - # clean up - exit = getattr(self.file, '__exit__', None) - if exit is not None: - return exit(*args, **kwargs) - else: - exit = getattr(self.file, 'close', None) - if exit is not None: - exit() - - def __getattr__(self, attr): - return getattr(self.file, attr) - - def __iter__(self): - return iter(self.file) - - -class NoOpBaseReader(object): - def __init__(self, *args, **kwargs): - pass - - -class IteratorContextManager(NoOpBaseReader): - def __init__(self, *args, **kwargs): - self._func = kwargs.pop('parser_func') - self._args = args - self._kwargs = kwargs - if type(self) == IteratorContextManager: - self.reset() - super(IteratorContextManager, self).__init__(*args, **kwargs) - - def __getstate__(self): - state = {} - state['_iterator_args'] = self._args - state['_iterator_kwargs'] = self._kwargs - return state - - def __setstate__(self, state): - self._args = state['_iterator_args'] - self._kwargs = state['_iterator_kwargs'] - - def reset(self): - """Resets the iterator to its initial state.""" - try: - self._reader = self._func(*self._args, **self._kwargs) - except Exception: - self.__exit__(*sys.exc_info()) - raise - - def __enter__(self): - return self - - def __exit__(self, *args, **kwargs): - pass - - def __iter__(self): - return self - - def __next__(self): - # try: - return next(self._reader) - # except StopIteration: - # self.__exit__(None, None, None) - # raise - - next = __next__ - - -@add_metaclass(ABCMeta) -class FileReader(IteratorContextManager): - """Abstract class implementing context manager protocol - for file readers. 
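# Both `_keepstate` decorators above snapshot the position of each file-like
# argument, rewind to the start for the wrapped call, and restore the position
# afterwards. A small self-contained sketch of the same idea, simplified to a
# single file argument (not the library's exact helper):
import io
from functools import wraps

def keepstate(func):
    @wraps(func)
    def wrapped(fileobj, *args, **kwargs):
        pos = fileobj.tell()          # remember where the caller was
        fileobj.seek(0)               # run the wrapped call from the start
        try:
            return func(fileobj, *args, **kwargs)
        finally:
            fileobj.seek(pos)         # put the caller's position back
    return wrapped

@keepstate
def first_line(f):
    return f.readline()

f = io.StringIO('alpha\nbeta\n')
f.readline()              # advance past the first line
print(first_line(f))      # 'alpha\n', read from the start
print(f.readline())       # 'beta\n' -- the original position was restored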
- """ - - def __init__(self, source, **kwargs): - func = kwargs['parser_func'] - super(FileReader, self).__init__(*kwargs['args'], parser_func=func, **kwargs['kwargs']) - self._pass_file = kwargs['pass_file'] - self._source_init = source - self._mode = kwargs['mode'] - self._encoding = kwargs.get('encoding') - self.reset() - - def reset(self): - if hasattr(self, '_source'): - self._source.__exit__(None, None, None) - self._source = _file_obj(self._source_init, self._mode, self._encoding) - try: - if self._pass_file: - self._reader = self._func( - self._source, *self._args, **self._kwargs) - else: - self._reader = self._func(*self._args, **self._kwargs) - except Exception: # clean up on any error - self.__exit__(*sys.exc_info()) - raise - - def __exit__(self, *args, **kwargs): - self._source.__exit__(*args, **kwargs) - - # delegate everything else to file object - def __getattr__(self, attr): - if attr == '_source': - raise AttributeError - return getattr(self._source, attr) - - -def remove_bom(bstr): - return bstr.replace(codecs.BOM_LE, b'').lstrip(b"\x00") - - -class IndexedReaderMixin(NoOpBaseReader): - """Common interface for :py:class:`IndexedTextReader` and :py:class:`IndexedXML`.""" - @property - def index(self): - return self._offset_index - - @property - def default_index(self): - return self._offset_index - - def __len__(self): - return len(self._offset_index) - - def __contains__(self, key): - return key in self._offset_index - - def _item_from_offsets(self, offsets): - raise NotImplementedError - - def get_by_id(self, elem_id): - index = self.default_index - if index is None: - raise PyteomicsError('Access by ID requires building an offset index.') - offsets = index[elem_id] - return self._item_from_offsets(offsets) - - def get_by_ids(self, ids): - return [self.get_by_id(key) for key in ids] - - def get_by_index(self, i): - try: - key = self.default_index.from_index(i, False) - except AttributeError: - raise PyteomicsError('Positional access requires building an offset index.') - return self.get_by_id(key) - - def get_by_indexes(self, indexes): - return [self.get_by_index(i) for i in indexes] - - def get_by_index_slice(self, s): - try: - keys = self.default_index.from_slice(s, False) - except AttributeError: - raise PyteomicsError('Positional access requires building an offset index.') - return self.get_by_ids(keys) - - def get_by_key_slice(self, s): - keys = self.default_index.between(s.start, s.stop) - if s.step: - keys = keys[::s.step] - return self.get_by_ids(keys) - - def __getitem__(self, key): - if isinstance(key, basestring): - return self.get_by_id(key) - if isinstance(key, int): - return self.get_by_index(key) - if isinstance(key, Sequence): - if not key: - return [] - if isinstance(key[0], int): - return self.get_by_indexes(key) - if isinstance(key[0], basestring): - return self.get_by_ids(key) - if isinstance(key, slice): - for item in (key.start, key.stop, key.step): - if item is not None: - break - if isinstance(item, int): - return self.get_by_index_slice(key) - if isinstance(item, basestring): - return self.get_by_key_slice(key) - if item is None: - return list(self) - raise PyteomicsError('Unsupported query key: {}'.format(key)) - - -class RTLocator(): - def __init__(self, reader): - self._reader = reader - - def _get_scan_by_time(self, time): - """Retrieve the scan object for the specified scan time. 
- - Parameters - ---------- - time : float - The time to get the nearest scan from - Returns - ------- - tuple: (scan_id, scan, scan_time) - """ - if not self._reader.default_index: - raise PyteomicsError("This method requires the index. Please pass `use_index=True` during initialization") - - scan_ids = tuple(self._reader.default_index) - lo = 0 - hi = len(scan_ids) - - best_match = None - best_error = float('inf') - best_time = None - best_id = None - - if time == float('inf'): - scan = self._reader.get_by_id(scan_ids[-1]) - return scan_ids[-1], scan, self._reader._get_time(scan) - - while hi != lo: - mid = (hi + lo) // 2 - sid = scan_ids[mid] - scan = self._reader.get_by_id(sid) - scan_time = self._reader._get_time(scan) - err = abs(scan_time - time) - if err < best_error: - best_error = err - best_match = scan - best_time = scan_time - best_id = sid - if scan_time == time: - return sid, scan, scan_time - elif (hi - lo) == 1: - return best_id, best_match, best_time - elif scan_time > time: - hi = mid - else: - lo = mid - - def __getitem__(self, key): - if isinstance(key, (int, float)): - return self._get_scan_by_time(key)[1] - if isinstance(key, Sequence): - return [self._get_scan_by_time(t)[1] for t in key] - if isinstance(key, slice): - if key.start is None: - start_index = self._reader.default_index.from_index(0) - else: - start_index = self._get_scan_by_time(key.start)[0] - if key.stop is None: - stop_index = self._reader.default_index.from_index(-1) - else: - stop_index = self._get_scan_by_time(key.stop)[0] - return self._reader[start_index:stop_index:key.step] - - -class TimeOrderedIndexedReaderMixin(IndexedReaderMixin): - @property - def time(self): - return self._time - - def __init__(self, *args, **kwargs): - super(TimeOrderedIndexedReaderMixin, self).__init__(*args, **kwargs) - self._time = RTLocator(self) - - @staticmethod - def _get_time(scan): - raise NotImplementedError - - -class IndexedTextReader(IndexedReaderMixin, FileReader): - """Abstract class for text file readers that keep an index of records for random access. 
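# `RTLocator` runs a binary search over the time-ordered index, keeping the
# closest scan seen so far, so a query that falls between two scan times still
# returns the nearest one. Readers that mix in `TimeOrderedIndexedReaderMixin`
# expose this as their `time` attribute; a sketch with the mzML reader (the
# file name is hypothetical):
from pyteomics import mzml

with mzml.MzML('run1.mzML', use_index=True) as reader:
    scan = reader.time[25.0]          # scan nearest to retention time 25.0
    window = reader.time[20.0:30.0]   # all scans in a retention-time window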
- This requires reading the file in binary mode.""" - - delimiter = None - label = None - block_size = 1000000 - label_group = 1 - _kw_keys = ['delimiter', 'label', 'block_size', 'label_group'] - - def __init__(self, source, **kwargs): - # the underlying _file_obj gets None as encoding - # to avoid transparent decoding of StreamReader on read() calls - encoding = kwargs.pop('encoding', 'utf-8') - super(IndexedTextReader, self).__init__(source, mode='rb', encoding=None, **kwargs) - self.encoding = encoding - for attr in self._kw_keys: - if attr in kwargs: - setattr(self, attr, kwargs.pop(attr)) - self._offset_index = None - if not kwargs.pop('_skip_index', False): - self._offset_index = self.build_byte_index() - - def __getstate__(self): - state = super(IndexedTextReader, self).__getstate__() - state['offset_index'] = self._offset_index - for key in self._kw_keys: - state[key] = getattr(self, key) - return state - - def __setstate__(self, state): - super(IndexedTextReader, self).__setstate__(state) - self._offset_index = state['offset_index'] - for key in self._kw_keys: - if key in state: - setattr(self, key, state[key]) - - def _chunk_iterator(self): - fh = self._source.file - delim = remove_bom(self.delimiter.encode(self.encoding)) - buff = fh.read(self.block_size) - parts = buff.split(delim) - started_with_delim = buff.startswith(delim) - tail = parts[-1] - front = parts[:-1] - i = 0 - for part in front: - i += 1 - if part == b"": - continue - if i == 1: - if started_with_delim: - yield delim + part - else: - yield part - else: - yield delim + part - running = True - while running: - buff = fh.read(self.block_size) - if len(buff) == 0: - running = False - buff = tail - else: - buff = tail + buff - parts = buff.split(delim) - tail = parts[-1] - front = parts[:-1] - for part in front: - yield delim + part - yield delim + tail - - def _generate_offsets(self): - i = 0 - pattern = re.compile(remove_bom(self.label.encode(self.encoding))) - for chunk in self._chunk_iterator(): - match = pattern.search(chunk) - if match: - label = match.group(self.label_group) - yield i, label.decode(self.encoding), match - i += len(chunk) - yield i, None, None - - def build_byte_index(self): - index = OffsetIndex() - g = self._generate_offsets() - last_offset = 0 - last_label = None - for offset, label, keyline in g: - if last_label is not None: - index[last_label] = (last_offset, offset) - last_label = label - last_offset = offset - assert last_label is None - return index - - def _read_lines_from_offsets(self, start, end): - self._source.seek(start) - lines = self._source.read(end - start).decode(self.encoding).split('\n') - return lines - - -class IndexSavingMixin(NoOpBaseReader): - """Common interface for :py:class:`IndexSavingXML` and :py:class:`IndexSavingTextReader`.""" - _index_class = NotImplemented - - @property - def _byte_offset_filename(self): - try: - path = self._source.name - except AttributeError: - return None - name, ext = os.path.splitext(path) - byte_offset_filename = '{}-{}-byte-offsets.json'.format(name, ext[1:]) - return byte_offset_filename - - def _check_has_byte_offset_file(self): - """Check if the file at :attr:`_byte_offset_filename` exists - - Returns - ------- - bool - Whether the file exists - """ - path = self._byte_offset_filename - if path is None: - return False - return os.path.exists(path) - - @classmethod - def prebuild_byte_offset_file(cls, path): - """Construct a new XML reader, build its byte offset index and - write it to file - - Parameters - ---------- - path : str - 
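# `_chunk_iterator` re-splits the stream on the record delimiter one
# `block_size` read at a time, and `_generate_offsets` regex-matches the record
# label in each chunk, so the byte index is built without loading the whole
# file. The same idea in miniature (chunking omitted) over a FASTA-like buffer:
import re

data = b'>sp|P1|A\nMKT\n>sp|P2|B\nGGA\n'
label = re.compile(br'>(\S+)')

# Consecutive record starts become (start, end) byte ranges, mirroring what
# build_byte_index stores in an OffsetIndex.
starts = [(m.start(), m.group(1).decode()) for m in label.finditer(data)]
starts.append((len(data), None))
index = {lab: (s, starts[i + 1][0]) for i, (s, lab) in enumerate(starts[:-1])}
print(index)  # {'sp|P1|A': (0, 13), 'sp|P2|B': (13, 26)}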
The path to the file to parse - """ - with cls(path) as inst: - inst.write_byte_offsets() - - def write_byte_offsets(self): - """Write the byte offsets in :attr:`_offset_index` to the file - at :attr:`_byte_offset_filename` - """ - with open(self._byte_offset_filename, 'w') as f: - self._offset_index.save(f) - - @_keepstate_method - def _build_index(self): - """Build the byte offset index by either reading these offsets - from the file at :attr:`_byte_offset_filename`, or falling back - to the method used by :class:`IndexedXML` if this operation fails - due to an IOError - """ - if not self._use_index: return - try: - self._read_byte_offsets() - except (IOError, AttributeError, TypeError): - super(IndexSavingMixin, self)._build_index() - - def _read_byte_offsets(self): - """Read the byte offset index JSON file at :attr:`_byte_offset_filename` - and populate :attr:`_offset_index` - """ - with open(self._byte_offset_filename, 'r') as f: - index = self._index_class.load(f) - self._offset_index = index - - -def _file_reader(_mode='r'): - # a lot of the code below is borrowed from - # http://stackoverflow.com/a/14095585/1258041 - def decorator(_func): - """A decorator implementing the context manager protocol for functions - that read files. - - Note: 'close' must be in kwargs! Otherwise it won't be respected. - """ - @wraps(_func) - def helper(*args, **kwargs): - if args: - return FileReader(args[0], mode=_mode, parser_func=_func, pass_file=True, args=args[1:], kwargs=kwargs, - encoding=kwargs.pop('encoding', None)) - source = kwargs.pop('source', None) - return FileReader(source, mode=_mode, parser_func=_func, pass_file=True, args=(), kwargs=kwargs, encoding=kwargs.pop('encoding', None)) - return helper - return decorator - - -def _file_writer(_mode='w'): - def decorator(_func): - """A decorator that opens output files for writer functions. 
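# `IndexSavingMixin` builds on this: `_build_index` first tries to load a
# previously saved '<name>-<ext>-byte-offsets.json' file next to the data file
# and only falls back to a full scan if that fails. The intended workflow,
# sketched with the mzML reader as one concrete `IndexSavingMixin` subclass
# (the file name is hypothetical):
from pyteomics import mzml

# One-off: scan the file and write 'run1-mzML-byte-offsets.json' beside it.
mzml.MzML.prebuild_byte_offset_file('run1.mzML')

# Later sessions pick up the saved offsets instead of re-scanning the file.
with mzml.MzML('run1.mzML', use_index=True) as reader:
    spectrum = reader.get_by_index(0)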
- """ - @wraps(_func) - def helper(*args, **kwargs): - m = kwargs.pop('file_mode', _mode) - enc = kwargs.pop('encoding', None) - if len(args) > 1: - out_arg = args[1] - else: - out_arg = kwargs.pop('output', None) - - with _file_obj(out_arg, m, encoding=enc) as out: - if len(args) > 1: - call_args = (args[0], out) + args[2:] - call_kwargs = kwargs - else: - call_args = args - call_kwargs = dict(output=out, **kwargs) - return _func(*call_args, **call_kwargs) - return helper - return decorator - - -class WritableIndex(object): - schema_version = (1, 0, 0) - _schema_version_tag_key = "@pyteomics_schema_version" - - def _serializable_container(self): - container = {'index': list(self.items())} - return container - - def save(self, fp): - container = self._serializable_container() - container[self._schema_version_tag_key] = self.schema_version - json.dump(container, fp) - - @classmethod - def load(cls, fp): - container = json.load(fp, object_hook=OrderedDict) - version_tag = container.get(cls._schema_version_tag_key) - if version_tag is None: - # The legacy case, no special processing yet - inst = cls() - inst.schema_version = None - return inst - version_tag = tuple(version_tag) - index = container.get("index") - if version_tag < cls.schema_version: - # schema upgrade case, no special processing yet - inst = cls(index) - inst.schema_version = version_tag - return inst - # no need to upgrade - return cls(index) - - -class OffsetIndex(OrderedDict, WritableIndex): - '''An augmented OrderedDict that formally wraps getting items by index - ''' - - def __init__(self, *args, **kwargs): - super(OffsetIndex, self).__init__(*args, **kwargs) - self._index_sequence = None - - def _invalidate(self): - self._index_sequence = None - - @property - def index_sequence(self): - """Keeps a cached copy of the :meth:`items` sequence - stored as a :class:`tuple` to avoid repeatedly copying - the sequence over many method calls. - - Returns - ------- - :class:`tuple` - """ - if self._index_sequence is None: - self._index_sequence = tuple(self.items()) - return self._index_sequence - - def __setitem__(self, key, value): - self._invalidate() - return super(OffsetIndex, self).__setitem__(key, value) - - def pop(self, *args, **kwargs): - self._invalidate() - return super(OffsetIndex, self).pop(*args, **kwargs) - - def find(self, key, *args, **kwargs): - return self[key] - - def from_index(self, index, include_value=False): - '''Get an entry by its integer index in the ordered sequence - of this mapping. - - Parameters - ---------- - index: int - The index to retrieve. - include_value: bool - Whether to return both the key and the value or just the key. - Defaults to :const:`False`. - - Returns - ------- - object: - If ``include_value`` is :const:`True`, a tuple of (key, value) at ``index`` - else just the key at ``index``. - ''' - items = self.index_sequence - if include_value: - return items[index] - else: - return items[index][0] - - def from_slice(self, spec, include_value=False): - '''Get a slice along index in the ordered sequence - of this mapping. - - Parameters - ---------- - spec: slice - The slice over the range of indices to retrieve - include_value: bool - Whether to return both the key and the value or just the key. 
- Defaults to :const:`False` - - Returns - ------- - list: - If ``include_value`` is :const:`True`, a tuple of (key, value) at ``index`` - else just the key at ``index`` for each ``index`` in ``spec`` - ''' - items = self.index_sequence - return [(k, v) if include_value else k for k, v in items[spec]] - - def between(self, start, stop, include_value=False): - keys = list(self) - if start is not None: - try: - start_index = keys.index(start) - except ValueError: - raise KeyError(start) - else: - start_index = 0 - if stop is not None: - try: - stop_index = keys.index(stop) - except ValueError: - raise KeyError(stop) - else: - stop_index = len(keys) - 1 - if start is None or stop is None: - pass # won't switch indices - else: - start_index, stop_index = min(start_index, stop_index), max(start_index, stop_index) - - if include_value: - return [(k, self[k]) for k in keys[start_index:stop_index + 1]] - return keys[start_index:stop_index + 1] - - def __repr__(self): - template = "{self.__class__.__name__}({items})" - return template.format(self=self, items=list(self.items())) - - def _integrity_check(self): - indices = list(self.values()) - sorted_indices = sorted(self.values()) - return indices == sorted_indices - - def sort(self): - sorted_pairs = sorted(self.items(), key=lambda x: x[1]) - self.clear() - self._invalidate() - for key, value in sorted_pairs: - self[key] = value - return self - - -class IndexSavingTextReader(IndexSavingMixin, IndexedTextReader): - _index_class = OffsetIndex - - -class HierarchicalOffsetIndex(WritableIndex): - _inner_type = OffsetIndex - - def __init__(self, base=None): - self.mapping = defaultdict(self._inner_type) - for key, value in (base or {}).items(): - self.mapping[key] = self._inner_type(value) - - def _integrity_check(self): - for key, value in self.items(): - if not value._integrity_check(): - return False - return True - - def sort(self): - for key, value in self.items(): - value.sort() - return self - - def __getitem__(self, key): - return self.mapping[key] - - def __setitem__(self, key, value): - self.mapping[key] = value - - def __iter__(self): - return iter(self.mapping) - - def __len__(self): - return sum(len(group) for key, group in self.items()) - - def __contains__(self, key): - return key in self.mapping - - def find(self, key, element_type=None): - if element_type is None: - for element_type in self.keys(): - try: - return self.find(key, element_type) - except KeyError: - continue - raise KeyError(key) - else: - return self[element_type][key] - - def find_no_type(self, key): - """Try to find `key` in each of the lower-level indexes, returning both - value and the element type that match the key.""" - for element_type in self.keys(): - try: - return self.find(key, element_type), element_type - except KeyError: - continue - raise KeyError(key) - - def update(self, *args, **kwargs): - self.mapping.update(*args, **kwargs) - - def pop(self, key, default=None): - return self.mapping.pop(key, default) - - def keys(self): - return self.mapping.keys() - - def values(self): - return self.mapping.values() - - def items(self): - return self.mapping.items() - - def _serializable_container(self): - encoded_index = {} - container = { - 'keys': list(self.keys()) - } - for key, offset in self.items(): - encoded_index[key] = list(offset.items()) - container['index'] = encoded_index - return container - - -def _make_chain(reader, readername, full_output=False): - - def concat_results(*args, **kwargs): - results = [reader(arg, **kwargs) for arg in args] - if pd is 
not None and all(isinstance(a, pd.DataFrame) for a in results): - return pd.concat(results) - return np.concatenate(results) - - def _iter(files, kwargs): - for f in files: - with reader(f, **kwargs) as r: - for item in r: - yield item - - def chain(*files, **kwargs): - return _iter(files, kwargs) - - def from_iterable(files, **kwargs): - return _iter(files, kwargs) - - @contextmanager - def _chain(*files, **kwargs): - yield chain(*files, **kwargs) - - @contextmanager - def _from_iterable(files, **kwargs): - yield from_iterable(files, **kwargs) - - def dispatch(*args, **kwargs): - return dispatch_from_iterable(args, **kwargs) - - def dispatch_from_iterable(args, **kwargs): - if kwargs.get('full_output', full_output): - return concat_results(*args, **kwargs) - return _chain(*args, **kwargs) - - dispatch.__doc__ = """Chain :py:func:`{0}` for several files. - Positional arguments should be file names or file objects. - Keyword arguments are passed to the :py:func:`{0}` function. - """.format(readername) - dispatch_from_iterable.__doc__ = """Chain :py:func:`{0}` for several files. - Keyword arguments are passed to the :py:func:`{0}` function. - - Parameters - ---------- - files : iterable - Iterable of file names or file objects. - """.format(readername) - dispatch.from_iterable = dispatch_from_iterable - - return dispatch - - -def _check_use_index(source, use_index, default): - try: - if use_index is not None: - use_index = bool(use_index) - - # if a file name is given, do not override anything; short-circuit - if isinstance(source, basestring): - return use_index if use_index is not None else default - - # collect information on source - if hasattr(source, 'seekable'): - seekable = source.seekable() - else: - seekable = None - - if hasattr(source, 'mode'): - binary = 'b' in source.mode - else: - binary = None - - # now check for conflicts - if seekable is False: - if binary: - raise PyteomicsError('Cannot work with non-seekable file in binary mode: {}.'.format(source)) - if use_index: - warnings.warn('Cannot use indexing as {} is not seekable. Setting `use_index` to False.'.format(source)) - use_index = False - elif binary is not None: - if use_index is not None and binary != use_index: - warnings.warn('use_index is {}, but the file mode is {}. ' - 'Setting `use_index` to {}'.format(use_index, source.mode, binary)) - use_index = binary - elif use_index is None: - warnings.warn('Could not check mode on {}. Specify `use_index` explicitly to avoid errors.'.format(source)) - - if use_index is not None: - return use_index - - return default - - except PyteomicsError: - raise - except Exception as e: - if use_index is None: - warnings.warn('Could not check mode on {}. Reason: {!r}. ' - 'Specify `use_index` explicitly to avoid errors.'.format(source, e)) - return default - return use_index - - -class FileReadingProcess(mp.Process): - """Process that does a share of distributed work on entries read from file. - Reconstructs a reader object, parses entries at the given indexes, - optionally does additional processing, and sends results back. - - The reader class must support the :py:meth:`__getitem__` dict-like lookup.
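# `_make_chain` manufactures the module-level `chain` helpers that reader
# modules expose: iterate several files lazily as one stream, or concatenate
# full results when `full_output` is set. Typical use through a reader module
# (file names are hypothetical):
from pyteomics import mzml

# Read two runs back to back as a single lazy stream of spectra.
with mzml.chain('run1.mzML', 'run2.mzML') as spectra:
    total = sum(1 for _ in spectra)

# The same, built from an iterable of paths.
with mzml.chain.from_iterable(['run1.mzML', 'run2.mzML']) as spectra:
    first = next(spectra)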
- """ - - def __init__(self, reader_spec, target_spec, qin, qout, args_spec, kwargs_spec): - super(FileReadingProcess, self).__init__(name='pyteomics-map-worker') - self.reader_spec = reader_spec - self.target_spec = target_spec - self.args_spec = args_spec - self.kwargs_spec = kwargs_spec - self._qin = qin - self._qout = qout - # self._in_flag = in_flag - self._done_flag = mp.Event() - self.daemon = True - - def run(self): - reader = serializer.loads(self.reader_spec) - target = serializer.loads(self.target_spec) - args = serializer.loads(self.args_spec) - kwargs = serializer.loads(self.kwargs_spec) - for key in iter(self._qin.get, None): - item = reader[key] - if target is not None: - result = target(item, *args, **kwargs) - else: - result = item - self._qout.put(result) - self._done_flag.set() - - def is_done(self): - return self._done_flag.is_set() - - -try: - _NPROC = mp.cpu_count() -except NotImplementedError: - _NPROC = 4 -_QUEUE_TIMEOUT = 4 -_QUEUE_SIZE = int(1e7) - - -class TaskMappingMixin(NoOpBaseReader): - def __init__(self, *args, **kwargs): - ''' - Instantiate a :py:class:`TaskMappingMixin` object, set default parameters for IPC. - - Parameters - ---------- - - queue_timeout : float, keyword only, optional - The number of seconds to block, waiting for a result before checking to see if - all workers are done. - queue_size : int, keyword only, optional - The length of IPC queue used. - processes : int, keyword only, optional - Number of worker processes to spawn when :py:meth:`map` is called. This can also be - specified in the :py:meth:`map` call. - ''' - self._queue_size = kwargs.pop('queue_size', _QUEUE_SIZE) - self._queue_timeout = kwargs.pop('timeout', _QUEUE_TIMEOUT) - self._nproc = kwargs.pop('processes', _NPROC) - super(TaskMappingMixin, self).__init__(*args, **kwargs) - - def _get_reader_for_worker_spec(self): - return self - - def _build_worker_spec(self, target, args, kwargs): - serialized = [] - for obj, objname in [(self._get_reader_for_worker_spec(), 'reader'), (target, 'target'), (args, 'args'), - (kwargs, 'kwargs')]: - try: - serialized.append(serializer.dumps(obj)) - except serializer.PicklingError: - msg = 'Could not serialize {0} {1} with {2.__name__}.'.format(objname, obj, serializer) - if serializer is not dill: - msg += ' Try installing `dill`.' - raise PyteomicsError(msg) - return serialized - - def _spawn_workers(self, specifications, in_queue, out_queue, processes): - reader_spec, target_spec, args_spec, kwargs_spec = specifications - workers = [] - for _ in range(processes): - worker = FileReadingProcess( - reader_spec, target_spec, in_queue, out_queue, args_spec, kwargs_spec) - workers.append(worker) - return workers - - def _spawn_feeder_thread(self, in_queue, iterator, processes): - def feeder(): - for key in iterator: - in_queue.put(key) - for _ in range(processes): - in_queue.put(None) - - feeder_thread = threading.Thread(target=feeder) - feeder_thread.daemon = True - feeder_thread.start() - return feeder_thread - - def map(self, target=None, processes=-1, args=None, kwargs=None, **_kwargs): - """Execute the ``target`` function over entries of this object across up to ``processes`` - processes. - - Results will be returned out of order. - - Parameters - ---------- - target : :class:`Callable`, optional - The function to execute over each entry. It will be given a single object yielded by - the wrapped iterator as well as all of the values in ``args`` and ``kwargs`` - processes : int, optional - The number of worker processes to use. 
If 0 or negative,
-            defaults to the number of available CPUs.
-            This parameter can also be set at reader creation.
-        args : :class:`Sequence`, optional
-            Additional positional arguments to be passed to the target function.
-        kwargs : :class:`Mapping`, optional
-            Additional keyword arguments to be passed to the target function.
-        **_kwargs
-            Additional keyword arguments to be passed to the target function.
-
-        Yields
-        ------
-        object
-            The work item returned by the target function.
-        """
-        if self._offset_index is None:
-            raise PyteomicsError('The reader needs an index for map() calls. Create the reader with `use_index=True`.')
-
-        if processes < 1:
-            processes = self._nproc
-        iterator = self._task_map_iterator()
-
-        if args is None:
-            args = tuple()
-        else:
-            args = tuple(args)
-        if kwargs is None:
-            kwargs = dict()
-        else:
-            kwargs = dict(kwargs)
-        kwargs.update(_kwargs)
-
-        serialized = self._build_worker_spec(target, args, kwargs)
-
-        in_queue = mp.Queue(self._queue_size)
-        out_queue = mp.Queue(self._queue_size)
-
-        workers = self._spawn_workers(serialized, in_queue, out_queue, processes)
-        feeder_thread = self._spawn_feeder_thread(in_queue, iterator, processes)
-        for worker in workers:
-            worker.start()
-
-        def iterate():
-            while True:
-                try:
-                    result = out_queue.get(True, self._queue_timeout)
-                    yield result
-                except Empty:
-                    if all(w.is_done() for w in workers):
-                        break
-                    else:
-                        continue
-
-            feeder_thread.join()
-            for worker in workers:
-                worker.join()
-        return iterate()
-
-    def _task_map_iterator(self):
-        """Returns the :class:`Iterable` to use when dealing out work items to the input IPC
-        queue used by :meth:`map`.
-
-        Returns
-        -------
-        :class:`Iterable`
-        """
-
-        return iter(self._offset_index.keys())
-
-
-class ChainBase(object):
-    """Chain :meth:`sequence_maker` for several sources into a
-    single iterable. Positional arguments should be sources like
-    file names or file objects. Keyword arguments are passed to
-    the :meth:`sequence_maker` function.
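As a usage sketch (assuming an indexed reader class that mixes in `TaskMappingMixin`, e.g. the mzML reader; the file name and callback are placeholders), `map` fans entries out to worker processes and yields results out of order:

    from pyteomics import mzml

    def total_ion_current(spectrum):
        return spectrum['intensity array'].sum()

    with mzml.MzML('spectra.mzML', use_index=True) as reader:
        for tic in reader.map(total_ion_current, processes=4):
            print(tic)   # one value per spectrum, in arbitrary order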
- - Parameters - ---------- - sources : :class:`Iterable` - Sources for creating new sequences from, such as paths or - file-like objects - kwargs : :class:`Mapping` - Additional arguments used to instantiate each sequence - """ - - def __init__(self, *sources, **kwargs): - self.sources = sources - self.kwargs = kwargs - self._iterator = None - - @classmethod - def from_iterable(cls, sources, **kwargs): - return cls(*sources, **kwargs) - - @classmethod - def _make_chain(cls, sequence_maker): - if isinstance(sequence_maker, type): - tp = type('%sChain' % sequence_maker.__class__.__name__, (cls,), { - 'sequence_maker': sequence_maker, - '__doc__': cls.__doc__.replace(':meth:`sequence_maker`', ':class:`{}`'.format(sequence_maker.__name__)) - }) - else: - tp = type('FunctionChain', (cls,), { - 'sequence_maker': staticmethod(sequence_maker), - '__doc__': cls.__doc__.replace(':meth:`sequence_maker`', ':func:`{}`'.format(sequence_maker.__name__)) - }) - return tp - - def sequence_maker(self, file): - raise NotImplementedError() - - def _create_sequence(self, file): - return self.sequence_maker(file, **self.kwargs) - - def _iterate_over_series(self): - for f in self.sources: - with self._create_sequence(f) as r: - for item in r: - yield item - - def __enter__(self): - self._iterator = iter(self._iterate_over_series()) - return self - - def __exit__(self, *args, **kwargs): - self._iterator = None - - def __iter__(self): - return self - - def __next__(self): - if self._iterator is None: - self._iterator = self._iterate_over_series() - return next(self._iterator) - - def next(self): - return self.__next__() - - def map(self, target=None, processes=-1, queue_timeout=_QUEUE_TIMEOUT, args=None, kwargs=None, **_kwargs): - """Execute the ``target`` function over entries of this object across up to ``processes`` - processes. - - Results will be returned out of order. - - Parameters - ---------- - target : :class:`Callable`, optional - The function to execute over each entry. It will be given a single object yielded by - the wrapped iterator as well as all of the values in ``args`` and ``kwargs`` - processes : int, optional - The number of worker processes to use. If negative, the number of processes - will match the number of available CPUs. - queue_timeout : float, optional - The number of seconds to block, waiting for a result before checking to see if - all workers are done. - args : :class:`Sequence`, optional - Additional positional arguments to be passed to the target function - kwargs : :class:`Mapping`, optional - Additional keyword arguments to be passed to the target function - **_kwargs - Additional keyword arguments to be passed to the target function - - Yields - ------ - object - The work item returned by the target function. 
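A minimal sketch of `_make_chain` wrapping a plain reader function; `read_items` and the file names are hypothetical:

    def read_items(path):
        # any callable returning an iterable context manager over entries
        return open(path)

    ItemChain = ChainBase._make_chain(read_items)

    with ItemChain('part1.txt', 'part2.txt') as chain:
        for entry in chain:   # both files, one continuous stream
            print(entry)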
- """ - for f in self.sources: - with self._create_sequence(f) as r: - for result in r.map(target, processes, queue_timeout, args, kwargs, **_kwargs): - yield result - - -class TableJoiner(ChainBase): - def concatenate(self, results): - if pd is not None and all(isinstance(a, pd.DataFrame) for a in results): - return pd.concat(results) - if isinstance(results[0], np.ndarray): - return np.concatenate(results) - else: - return np.array([b for a in results for b in a]) - - def _iterate_over_series(self): - results = [self._create_sequence(f) for f in self.sources] - return self.concatenate(results) diff --git a/pyteomics/auxiliary/math.py b/pyteomics/auxiliary/math.py deleted file mode 100644 index 1f1f46a72bba707d928b5c3d96a119aa6e1a125e..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/math.py +++ /dev/null @@ -1,97 +0,0 @@ -from .structures import PyteomicsError - - -def linear_regression_vertical(x, y=None, a=None, b=None): - """Calculate coefficients of a linear regression y = a * x + b. - The fit minimizes *vertical* distances between the points and the line. - - Requires :py:mod:`numpy`. - - Parameters - ---------- - x, y : array_like of float - 1-D arrays of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2). - a : float, optional - If specified then the slope coefficient is fixed and equals a. - b : float, optional - If specified then the free term is fixed and equals b. - - Returns - ------- - out : 4-tuple of float - The structure is (a, b, r, stderr), where - a -- slope coefficient, - b -- free term, - r -- Peason correlation coefficient, - stderr -- standard deviation. - """ - - import numpy as np - x = np.array(x, copy=False) - if y is not None: - y = np.array(y, copy=False) - else: - if len(x.shape) != 2 or x.shape[-1] != 2: - raise PyteomicsError( - 'If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) - y = x[:, 1] - x = x[:, 0] - if (a is not None and b is None): - b = (y - a * x).mean() - elif (a is not None and b is not None): - pass - else: - a, b = np.polyfit(x, y, 1) - - r = np.corrcoef(x, y)[0, 1] - stderr = (y - a * x - b).std() - - return a, b, r, stderr - - -def linear_regression(x, y=None, a=None, b=None): - """Alias of :py:func:`linear_regression_vertical`.""" - return linear_regression_vertical(x, y, a, b) - - -def linear_regression_perpendicular(x, y=None): - """Calculate coefficients of a linear regression y = a * x + b. - The fit minimizes *perpendicular* distances between the points and the line. - - Requires :py:mod:`numpy`. - - Parameters - ---------- - x, y : array_like of float - 1-D arrays of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2). - - Returns - ------- - out : 4-tuple of float - The structure is (a, b, r, stderr), where - a -- slope coefficient, - b -- free term, - r -- Peason correlation coefficient, - stderr -- standard deviation. 
- """ - - import numpy as np - x = np.array(x, copy=False) - if y is not None: - y = np.array(y, copy=False) - data = np.hstack((x.reshape((-1, 1)), y.reshape((-1, 1)))) - else: - if len(x.shape) != 2 or x.shape[-1] != 2: - raise PyteomicsError( - 'If `y` is not given, x.shape should be (N, 2), given: {}'.format(x.shape)) - data = x - mu = data.mean(axis=0) - eigenvectors, eigenvalues, V = np.linalg.svd((data - mu).T, full_matrices=False) - a = eigenvectors[0][1] / eigenvectors[0][0] - xm, ym = data.mean(axis=0) - b = ym - a * xm - - r = np.corrcoef(data[:, 0], data[:, 1])[0, 1] - stderr = ((data[:, 1] - a * data[:, 0] - b) / np.sqrt(a**2 + 1)).std() - - return a, b, r, stderr diff --git a/pyteomics/auxiliary/patch.py b/pyteomics/auxiliary/patch.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/pyteomics/auxiliary/structures.py b/pyteomics/auxiliary/structures.py deleted file mode 100644 index 56e56e15eb13a4434f6440ade7e73c8cad8d248b..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/structures.py +++ /dev/null @@ -1,504 +0,0 @@ -import re -from collections import defaultdict, Counter -import warnings - -try: - basestring - PY2 = True -except NameError: - basestring = (str, bytes) - PY2 = False - - -_UNIT_CV_INTERN_TABLE = dict() - - -def clear_unit_cv_table(): - """Clear the module-level unit name and - controlled vocabulary accession table. - """ - _UNIT_CV_INTERN_TABLE.clear() - - -def _intern_unit_or_cv(unit_or_cv): - """Intern `unit_or_cv` in :const:`~._UNIT_CV_INTERN_TABLE`, potentially - keeping a reference to the object stored for the duration of the program. - - Parameters - ---------- - unit_or_cv : object - The value to intern - - Returns - ------- - object: - The object which `unit_or_cv` hash-equals in :const:`~._UNIT_CV_INTERN_TABLE`. - """ - if unit_or_cv is None: - return None - try: - return _UNIT_CV_INTERN_TABLE[unit_or_cv] - except KeyError: - _UNIT_CV_INTERN_TABLE[unit_or_cv] = unit_or_cv - return _UNIT_CV_INTERN_TABLE[unit_or_cv] - - -class PyteomicsError(Exception): - """Exception raised for errors in Pyteomics library. - - Attributes - ---------- - message : str - Error message. - """ - - def __init__(self, msg, *values): - self.message = msg - self.values = values - - def __str__(self): - if not self.values: - return "Pyteomics error, message: %s" % (repr(self.message),) - else: - return "Pyteomics error, message: %s %r" % (repr(self.message), self.values) - - -class Charge(int): - """A subclass of :py:class:`int`. Can be constructed from strings in "N+" - or "N-" format, and the string representation of a :py:class:`Charge` is - also in that format. - """ - def __new__(cls, *args, **kwargs): - try: - return super(Charge, cls).__new__(cls, *args) - except ValueError as e: - if isinstance(args[0], basestring): - try: - num, sign = re.match(r'^(\d+)(\+|-)$', args[0]).groups() - return super(Charge, cls).__new__(cls, sign + num, *args[1:], **kwargs) - except Exception: - pass - raise PyteomicsError(*e.args) - - def __str__(self): - return str(abs(self)) + '+-'[self < 0] - - -class Ion(str): - """Represents an Ion, right now just a subclass of String. 
- """ - _pattern = r'([abcxyz]\d+(\-H2O|\-NH3)?)([\+|-]\d+)' # "y2-H2O+1" - - def __init__(self, *args, **kwargs): - if args and isinstance(args[0], basestring): - try: - self.ion_type, self.neutral_loss, self.charge = re.match(self._pattern, args[0]).groups() - except Exception: - raise PyteomicsError("Malformed ion string, must match the regex {!r}".format(self._pattern)) - - -class ChargeList(list): - """Just a list of :py:class:`Charge`s. When printed, looks like an - enumeration of the list contents. Can also be constructed from such - strings (e.g. "2+, 3+ and 4+"). - """ - - def __init__(self, *args, **kwargs): - if args and isinstance(args[0], basestring): - delim = r'(?:,\s*)|(?:\s*and\s*)' - self.extend(map(Charge, re.split(delim, args[0]))) - else: - try: - super(ChargeList, self).__init__( - sorted(set(args[0])), *args[1:], **kwargs) - except Exception: - super(ChargeList, self).__init__(*args, **kwargs) - self[:] = map(Charge, self) - - def __str__(self): - if len(self) > 1: - return ', '.join(map(str, self[:-1])) + ' and {}'.format(self[-1]) - elif self: - return str(self[0]) - return super(ChargeList, self).__str__() - - -def _parse_charge(s, list_only=False): - if not list_only: - try: - return Charge(s) - except PyteomicsError: - pass - return ChargeList(s) - - -def _parse_ion(ion_text): - try: - return Ion(ion_text) - except Exception as e: - warnings.warn('Could not parse ion string: {} ({})'.format(ion_text, e.args[0])) - - -class BasicComposition(defaultdict, Counter): - """A generic dictionary for compositions. - Keys should be strings, values should be integers. - Allows simple arithmetics.""" - - def __init__(self, *args, **kwargs): - defaultdict.__init__(self, int) - Counter.__init__(self, *args, **kwargs) - for k, v in list(self.items()): - if not v: - del self[k] - - def __str__(self): - return '{}({})'.format(type(self).__name__, dict.__repr__(self)) - - def __repr__(self): - return str(self) - - def _repr_pretty_(self, p, cycle): - if cycle: # should never happen - p.text('{} object with a cyclic reference'.format(type(self).__name__)) - p.text(str(self)) - - def __add__(self, other): - result = self.copy() - for elem, cnt in other.items(): - result[elem] += cnt - return result - - def __iadd__(self, other): - for elem, cnt in other.items(): - self[elem] += cnt - return self - - def __radd__(self, other): - return self + other - - def __sub__(self, other): - result = self.copy() - for elem, cnt in other.items(): - result[elem] -= cnt - return result - - def __isub__(self, other): - for elem, cnt in other.items(): - self[elem] -= cnt - return self - - def __rsub__(self, other): - return (self - other) * (-1) - - def __mul__(self, other): - if not isinstance(other, int): - raise PyteomicsError('Cannot multiply Composition by non-integer', - other) - return type(self)({k: v * other for k, v in self.items()}) - - def __imul__(self, other): - if not isinstance(other, int): - raise PyteomicsError('Cannot multiply Composition by non-integer', - other) - for elem in self: - self[elem] *= other - return self - - def __rmul__(self, other): - return self * other - - def __eq__(self, other): - if not isinstance(other, dict): - return False - self_items = {i for i in self.items() if i[1]} - other_items = {i for i in other.items() if i[1]} - return self_items == other_items - - # override default behavior: - # we don't want to add 0's to the dictionary - def __missing__(self, key): - return 0 - - def __setitem__(self, key, value): - if isinstance(value, float): - value = 
int(round(value))
-        elif not isinstance(value, int):
-            raise PyteomicsError('Only integers allowed as values in '
-                                 'Composition, got {}.'.format(type(value).__name__))
-        if value:  # reject 0's
-            super(BasicComposition, self).__setitem__(key, value)
-        elif key in self:
-            del self[key]
-
-    def copy(self):
-        return type(self)(self)
-
-    def __reduce__(self):
-        class_, args, state, list_iterator, dict_iterator = super(
-            BasicComposition, self).__reduce__()
-        # Override the reduce of defaultdict so we do not provide the
-        # `int` type as the first argument, which would prevent the
-        # object from being unpickled correctly
-        args = ()
-        return class_, args, state, list_iterator, dict_iterator
-
-
-class _MappingOverAttributeProxy(object):
-    '''A replacement for __dict__ for unpickling an object which now
-    has __slots__ but previously did not.'''
-
-    def __init__(self, obj):
-        self.obj = obj
-
-    def __getitem__(self, key):
-        return getattr(self.obj, key)
-
-    def __setitem__(self, key, value):
-        setattr(self.obj, key, value)
-
-    def __contains__(self, key):
-        return hasattr(self.obj, key)
-
-    def __repr__(self):
-        return "{self.__class__.__name__}({self.obj})".format(self=self)
-
-
-class unitint(int):
-    '''Represents an integer value with a unit name.
-
-    Behaves identically to a built-in :class:`int` type.
-
-    Attributes
-    ----------
-    unit_info : :class:`str`
-        The name of the unit this value possesses.
-    '''
-    def __new__(cls, value, unit_info=None):
-        inst = int.__new__(cls, value)
-        inst.unit_info = unit_info
-        return inst
-
-    def __reduce__(self):
-        return self.__class__, (int(self), self.unit_info)
-
-    def _repr_pretty_(self, p, cycle):
-        base = super(unitint, self).__repr__()
-        if self.unit_info:
-            string = "%s %s" % (base, self.unit_info)
-        else:
-            string = base
-        p.text(string)
-
-
-class unitfloat(float):
-    '''Represents a float value with a unit name.
-
-    Behaves identically to a built-in :class:`float` type.
-
-    Attributes
-    ----------
-    unit_info : :class:`str`
-        The name of the unit this value possesses.
-    '''
-    __slots__ = ('unit_info', )
-
-    def __new__(cls, value, unit_info=None):
-        inst = float.__new__(cls, value)
-        inst.unit_info = unit_info
-        return inst
-
-    @property
-    def __dict__(self):
-        return _MappingOverAttributeProxy(self)
-
-    def __reduce__(self):
-        return self.__class__, (float(self), self.unit_info)
-
-    def _repr_pretty_(self, p, cycle):
-        base = super(unitfloat, self).__repr__()
-        if self.unit_info:
-            string = "%s %s" % (base, self.unit_info)
-        else:
-            string = base
-        p.text(string)
-
-
-class unitstr(str):
-    '''Represents a string value with a unit name.
-
-    Behaves identically to a built-in :class:`str` type.
-
-    Attributes
-    ----------
-    unit_info : :class:`str`
-        The name of the unit this value possesses.
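A sketch of the multiset arithmetic `BasicComposition` supports (keys illustrative):

    water = BasicComposition(H=2, O=1)
    water + BasicComposition(O=1)    # BasicComposition({'H': 2, 'O': 2})
    water - BasicComposition(H=2)    # zero counts are dropped: BasicComposition({'O': 1})
    water * 3                        # BasicComposition({'H': 6, 'O': 3})
    water['N']                       # 0: missing keys read as zero and are never stored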
-    '''
-    if not PY2:
-        __slots__ = ("unit_info", )
-
-    def __new__(cls, value, unit_info=None):
-        if PY2 and isinstance(value, unicode):
-            value = value.encode('utf-8')
-        inst = str.__new__(cls, value)
-        inst.unit_info = unit_info
-        return inst
-
-    @property
-    def __dict__(self):
-        return _MappingOverAttributeProxy(self)
-
-    def __reduce__(self):
-        return self.__class__, (str(self), self.unit_info)
-
-    def _repr_pretty_(self, p, cycle):
-        base = super(unitstr, self).__repr__()
-        if self.unit_info:
-            string = "%s %s" % (base, self.unit_info)
-        else:
-            string = base
-        p.text(string)
-
-
-class cvstr(str):
-    '''A helper class to associate a controlled vocabulary accession
-    number with an otherwise plain :class:`str` object
-
-    Attributes
-    ----------
-    accession : str
-        The accession number for this parameter, e.g. MS:1000040
-    unit_accession : str
-        The accession number for the unit of the value, if any
-    '''
-
-    if not PY2:
-        __slots__ = ('accession', 'unit_accession')
-
-    _cache = {}
-
-    def __new__(cls, value, accession=None, unit_accession=None):
-        try:
-            inst = cls._cache[value]
-            if inst.accession == accession and inst.unit_accession == unit_accession:
-                return inst
-        except KeyError:
-            pass
-
-        if PY2 and isinstance(value, unicode):
-            value = value.encode('utf-8')
-        inst = str.__new__(cls, value)
-        inst.accession = _intern_unit_or_cv(accession)
-        inst.unit_accession = _intern_unit_or_cv(unit_accession)
-        cls._cache[value] = inst
-        return inst
-
-    @property
-    def __dict__(self):
-        return _MappingOverAttributeProxy(self)
-
-    def __reduce__(self):
-        return self.__class__, (str(self), self.accession, self.unit_accession)
-
-
-class CVQueryEngine(object):
-    '''Traverse an arbitrarily nested dictionary looking
-    for keys which are :class:`cvstr` instances, or objects
-    with an attribute called ``accession``.
-    '''
-
-    def _accession(self, key):
-        return getattr(key, 'accession', None)
-
-    def _query_dict(self, data, accession):
-        for key, value in data.items():
-            if self._accession(key) == accession:
-                if not isinstance(value, str) or value != '':
-                    return value
-                else:
-                    return key
-            elif isinstance(value, dict):
-                inner = self._query_dict(value, accession)
-                if inner is not None:
-                    return inner
-            elif isinstance(value, (list, tuple)):
-                inner = self._query_sequence(value, accession)
-                if inner is not None:
-                    return inner
-            elif self._accession(value) == accession:
-                return value
-
-    def _query_sequence(self, data, accession):
-        for value in data:
-            if isinstance(value, dict):
-                inner = self._query_dict(value, accession)
-                if inner is not None:
-                    return inner
-            elif isinstance(value, (list, tuple)):
-                inner = self._query_sequence(value, accession)
-                if inner is not None:
-                    return inner
-            elif self._accession(value) == accession:
-                return value
-
-    def query(self, data, accession):
-        '''Search ``data`` for a key with the accession
-        number ``accession``. Returns :const:`None` if
-        not found.
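A short sketch of `cvstr` (accessions illustrative):

    key = cvstr('scan start time', accession='MS:1000016', unit_accession='UO:0000010')
    key == 'scan start time'    # True: behaves as a plain str
    key.accession               # 'MS:1000016'
    cvstr('scan start time', 'MS:1000016', 'UO:0000010') is key   # True: served from the per-value cache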
- ''' - if accession is None: - raise TypeError("`accession` cannot be None") - return self._query_dict(data, accession) - - def _is_empty(self, value): - if isinstance(value, basestring): - return value == '' - return False - - def _walk_dict(self, data, index): - for key, value in data.items(): - accession = self._accession(key) - if accession: - if not self._is_empty(value): - index[accession] = value - else: - index[accession] = key - elif isinstance(value, dict): - self._walk_dict(value, index) - elif isinstance(value, (list, tuple)): - self._walk_sequence(value, index) - accession = self._accession(value) - if accession: - index[accession] = value - return index - - def _walk_sequence(self, data, index): - for value in data: - if isinstance(value, dict): - self._walk_dict(value, index) - elif isinstance(value, (list, tuple)): - self._walk_sequence(value, index) - else: - accession = self._accession(value) - if accession: - index[accession] = value - - def index(self, data): - '''Construct a flat :class:`dict` whose keys are the - accession numbers for all qualified keys in ``data`` - and whose values are the mapped values from ``data``. - ''' - index = self._walk_dict(data, {}) - return index - - def __call__(self, data, accession=None): - '''If ``accession`` is :const:`None`, calls - :meth:`index` on ``data``, otherwise calls - :meth:`query` with ``data`` and ``accession``. - ''' - if accession is None: - return self.index(data) - else: - return self.query(data, accession) - -'''A ready-to-use instance of :class:`~.CVQueryEngine`''' -cvquery = CVQueryEngine() diff --git a/pyteomics/auxiliary/target_decoy.py b/pyteomics/auxiliary/target_decoy.py deleted file mode 100644 index 3c563deafdc810b67d2d084f8b105b846dbea932..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/target_decoy.py +++ /dev/null @@ -1,997 +0,0 @@ -from __future__ import absolute_import -import re -import operator as op -import math - -try: - basestring -except NameError: - basestring = (str, bytes) - -try: - from collections.abc import Container, Sized -except ImportError: - from collections import Container, Sized -from bisect import bisect_right -from contextlib import contextmanager -try: - import pandas as pd -except ImportError: - pd = None - -from .structures import PyteomicsError -from .file_helpers import _keepstate, IteratorContextManager, _make_chain, ChainBase, TableJoiner - - -def _fix_docstring(f, **defaults): - for argname, v in defaults.items(): - if v is not None: - f.__doc__ = re.sub('{} : .*'.format(argname), - lambda m: m.group() + ', optional', f.__doc__) - - -def _calculate_qvalues(scores, isdecoy, peps=False, **kwargs): - """Actual q-value calculation. - - Parameters - ---------- - scores : numpy.ndarray - Sorted array of PSMs. - isdecoy : numpy.ndarray - Sorted array of bools (decoy/target) or floats (PEPs). - - Returns - ------- - out : numpy.ndarray - Calculated q-values. 
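A sketch of `CVQueryEngine` traversal via the module-level `cvquery` instance (structure illustrative):

    scan = {
        cvstr('ms level', accession='MS:1000511'): 2,
        'scanList': {'scan': [{cvstr('scan start time', accession='MS:1000016'): 25.7}]},
    }
    cvquery(scan, 'MS:1000016')   # -> 25.7, found at any nesting depth
    cvquery(scan)                 # -> {'MS:1000511': 2, 'MS:1000016': 25.7}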
- """ - correction = kwargs.pop('correction', 0) - ratio = kwargs.pop('ratio', 1) - if ratio == 0: - raise PyteomicsError('Size ratio cannot be zero!') - remove_decoy = kwargs.get('remove_decoy', False) - formula = kwargs.pop('formula', (2, 1)[bool(remove_decoy)]) - if formula not in {1, 2}: - raise PyteomicsError('`formula` must be either 1 or 2') - - # score_label = kwargs['score_label'] - cumsum = isdecoy.cumsum(dtype=np.float64) - tfalse = cumsum.copy() - ind = np.arange(1., scores.shape[0] + 1., dtype=np.float64) - - if peps: - q = cumsum / ind - else: - if isinstance(correction, int): - if correction == 1: - tfalse += 1 - elif correction == 2: - p = 1. / (1. + ratio) - targ = ind - cumsum - for i in range(tfalse.size): - tfalse[i] = _expectation(cumsum[i], targ[i], p) - elif 0 < correction < 1: - p = 1. / (1. + ratio) - targ = ind - cumsum - for i in range(tfalse.size): - tfalse[i] = _confidence_value( - correction, cumsum[i], targ[i], p) - elif correction: - raise PyteomicsError('Invalid value for `correction`: {}.'.format(correction)) - - if formula == 1: - q = tfalse / (ind - cumsum) / ratio - else: - q = (cumsum + tfalse / ratio) / ind - - # Make sure that q-values are equal for equal scores (conservatively) - # and that q-values are monotonic - for i in range(scores.size - 1, 0, -1): - if (scores[i] == scores[i - 1] or q[i - 1] > q[i]): - q[i - 1] = q[i] - - return q - - -def _qvalues_df(psms, keyf, isdecoy, **kwargs): - full = kwargs.get('full_output', False) - remove_decoy = kwargs.get('remove_decoy', False) - peps = kwargs.get('pep') - decoy_or_pep_label = _decoy_or_pep_label(**kwargs) - q_label = kwargs.setdefault('q_label', 'q') - score_label = kwargs.setdefault('score_label', 'score') - if callable(keyf): - keyf = psms.apply(keyf, axis=1) - if callable(isdecoy): - isdecoy = psms.apply(isdecoy, axis=1) - if not isinstance(keyf, basestring): - if psms.shape[0]: - psms[score_label] = keyf - else: - psms[score_label] = [] - keyf = kwargs['score_label'] - if not isinstance(isdecoy, basestring): - if psms.shape[0]: - psms[decoy_or_pep_label] = isdecoy - else: - psms[decoy_or_pep_label] = [] - isdecoy = decoy_or_pep_label - reverse = kwargs.get('reverse', False) - - if not full: # create fields early - if peps is None: - fields = [(keyf, np.float64), (isdecoy, np.bool_), - (q_label, np.float64)] - else: - fields = [(isdecoy, np.float64), (q_label, np.float64)] - dtype = np.dtype(fields) - - psms.sort_values([keyf, isdecoy], ascending=[ - not reverse, True], inplace=True) - - if not psms.shape[0]: - if full: - psms[q_label] = [] - return psms - else: - return np.array([], dtype=dtype) - - q = _calculate_qvalues(psms[keyf].values, psms[ - isdecoy].values, peps is not None, **kwargs) - if remove_decoy: - q = q[~psms[isdecoy].values] - psms = psms[~psms[isdecoy]].copy() - if not full: - psms_ = np.empty_like(q, dtype=dtype) - if peps is None: - psms_[keyf] = psms[keyf] - psms_[isdecoy] = psms[isdecoy] - psms_[q_label] = q - psms = psms_ - else: - q_label = kwargs['q_label'] - psms[q_label] = q - return psms - - -def _decoy_or_pep_label(**kwargs): - peps = kwargs.get('pep') - return kwargs.get('decoy_label', 'is decoy') if peps is None else kwargs.get( - 'pep_label', peps if isinstance(peps, basestring) else 'PEP') - - -def _construct_dtype(*args, **kwargs): - full = kwargs.pop('full_output', False) - peps = kwargs.get('pep') - q_label = kwargs.setdefault('q_label', 'q') - score_label = kwargs.setdefault('score_label', 'score') - - fields = [(score_label, np.float64), - 
(_decoy_or_pep_label(**kwargs), - np.bool_ if peps is None else np.float64), - (q_label, np.float64)] - # if all args are NumPy arrays with common dtype, use it in the output - if full: - dtypes = {getattr(arg, 'dtype', None) for arg in args} - if len(dtypes) == 1 and None not in dtypes: - psm_dtype = dtypes.pop() - else: - psm_dtype = np.object_ - dtype = np.dtype(fields + [('psm', psm_dtype)]) - else: - dtype = np.dtype(fields) - return dtype - - -def _make_qvalues(read, is_decoy_prefix, is_decoy_suffix, key): - """Create a function that reads PSMs from a file and calculates q-values - for each value of `key`.""" - - def qvalues(*args, **kwargs): - """Read `args` and return a NumPy array with scores and q-values. - q-values are calculated either using TDA or based on provided values of PEP. - - Requires :py:mod:`numpy` (and optionally :py:mod:`pandas`). - - Parameters - ---------- - - positional args : file or str - Files to read PSMs from. All positional arguments are treated as - files. The rest of the arguments must be named. - - key : callable / array-like / iterable / str, keyword only - If callable, a function used for sorting of PSMs. Should accept - exactly one argument (PSM) and return a number (the smaller the better). - If array-like, should contain scores for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`DataFrame`). - - .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - reverse : bool, keyword only, optional - If :py:const:`True`, then PSMs are sorted in descending order, - i.e. the value of the key function is higher for better PSMs. - Default is :py:const:`False`. - - is_decoy : callable / array-like / iterable / str, keyword only - If callable, a function used to determine if the PSM is decoy or not. - Should accept exactly one argument (PSM) and return a truthy value if the - PSM should be considered decoy. - If array-like, should contain boolean values for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`DataFrame`). - - .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. - Default is `"DECOY_"`. - - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. - - pep : callable / array-like / iterable / str, keyword only, optional - If callable, a function used to determine the posterior error probability (PEP). - Should accept exactly one argument (PSM) and return a float. - If array-like, should contain float values for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`DataFrame`). - - .. note:: If this parameter is given, then PEP values will be used to calculate - q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with: - `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`. - `key` can still be provided. Without `key`, PSMs will be sorted by PEP. 
- - remove_decoy : bool, keyword only, optional - Defines whether decoy matches should be removed from the output. - Default is :py:const:`False`. - - .. note:: If set to :py:const:`False`, then by default the decoy - PSMs will be taken into account when estimating FDR. Refer to the - documentation of :py:func:`fdr` for math; basically, if - `remove_decoy` is :py:const:`True`, then formula 1 is used - to control output FDR, otherwise it's formula 2. This can be - changed by overriding the `formula` argument. - - formula : int, keyword only, optional - Can be either 1 or 2, defines which formula should be used for FDR - estimation. Default is 1 if `remove_decoy` is :py:const:`True`, - else 2 (see :py:func:`fdr` for definitions). - - ratio : float, keyword only, optional - The size ratio between the decoy and target databases. Default is - 1. In theory, the "size" of the database is the number of - theoretical peptides eligible for assignment to spectra that are - produced by *in silico* cleavage of that database. - - correction : int or float, keyword only, optional - Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. - - 0 (default): no correction; - - 1: enable "+1" correction. This accounts for the probability that a false - positive scores better than the first excluded decoy PSM; - - 2: this also corrects that probability for finite size of the sample, - so the correction will be slightly less than "+1". - - If a floating point number - is given, then instead of the expectation value for the number of false PSMs, - the confidence value is used. The value of `correction` is then interpreted as - desired confidence level. E.g., if correction=0.95, then the calculated q-values - do not exceed the "real" q-values with 95% probability. - - See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. - - q_label : str, optional - Field name for q-value in the output. Default is ``'q'``. - - score_label : str, optional - Field name for score in the output. Default is ``'score'``. - - decoy_label : str, optional - Field name for the decoy flag in the output. Default is ``'is decoy'``. - - pep_label : str, optional - Field name for PEP in the output. Default is ``'PEP'``. - - full_output : bool, keyword only, optional - If :py:const:`True`, then the returned array has PSM objects along - with scores and q-values. Default is :py:const:`False`. - - **kwargs : passed to the :py:func:`chain` function. 
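To make the options concrete, a minimal sketch with an in-memory record array (data illustrative):

    import numpy as np

    psms = np.array(
        [(0.01, False), (0.02, False), (0.03, True), (0.05, False)],
        dtype=[('score', np.float64), ('is decoy', np.bool_)])

    out = qvalues(psms, key='score', is_decoy='is decoy')
    # `out` is sorted by score; out['q'] is monotonic, and with the default
    # formula 2 each q-value is (decoys + decoys / ratio) / PSMs counted so far,
    # before the final monotonic adjustment.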
- - Returns - ------- - out : numpy.ndarray - A sorted array of records with the following fields: - - - 'score': :py:class:`np.float64` - - 'is decoy': :py:class:`np.bool_` - - 'q': :py:class:`np.float64` - - 'psm': :py:class:`np.object_` (if `full_output` is :py:const:`True`) - """ - import numpy as np - - @_keepstate - def get_scores(*args, **kwargs): - scores = [] - with read(*args, **kwargs) as f: - for i, psm in enumerate(f): - row = [] - for func in (keyf, isdecoy): - if callable(func): - row.append(func(psm)) - elif isinstance(func, basestring): - row.append(psm[func]) - else: - row.append(func[i]) - row.append(None) - if full: - row.append(psm) - scores.append(tuple(row)) - return scores - - peps = kwargs.get('pep', None) - if peps is not None: - x = {'is_decoy', 'remove_decoy', 'formula', - 'ratio', 'correction'}.intersection(kwargs) - if x: - raise PyteomicsError( - "Can't use these parameters with `pep`: " + ', '.join(x)) - keyf = kwargs.pop('key', key) - reverse = kwargs.get('reverse', False) - if keyf is None: - keyf = peps - if reverse: - raise PyteomicsError( - 'reverse = True when using PEPs for sorting') - - if not callable(keyf) and not isinstance(keyf, (Sized, Container)): - keyf = np.array(list(keyf)) - - if peps is None: - if 'is_decoy' not in kwargs: - if 'decoy_suffix' in kwargs: - isdecoy = lambda x: is_decoy_suffix(x, kwargs['decoy_suffix']) - elif 'decoy_prefix' in kwargs: - isdecoy = lambda x: is_decoy_prefix(x, kwargs['decoy_prefix']) - else: - isdecoy = is_decoy_prefix - else: - isdecoy = kwargs['is_decoy'] - else: - isdecoy = peps - - if not callable(isdecoy) and not isinstance(isdecoy, (Sized, Container)): - isdecoy = np.array(list(isdecoy)) - - remove_decoy = kwargs.get('remove_decoy', False) - decoy_or_pep_label = _decoy_or_pep_label(**kwargs) - score_label = kwargs.setdefault('score_label', 'score') - q_label = kwargs.setdefault('q_label', 'q') - dtype = _construct_dtype(*args, **kwargs) - full = kwargs.get('full_output', False) - arr_flag = False - psms = None - - # time to check arg type - if pd is not None and all(isinstance(arg, pd.DataFrame) for arg in args): - psms = pd.concat(args) - return _qvalues_df(psms, keyf, isdecoy, **kwargs) - - if not all(isinstance(arg, np.ndarray) for arg in args): - if isinstance(keyf, basestring): - keyf = op.itemgetter(keyf) - if isinstance(isdecoy, basestring): - isdecoy = op.itemgetter(isdecoy) - if isinstance(peps, basestring): - peps = op.itemgetter(peps) - - if callable(keyf) or callable(isdecoy): - kwargs.pop('full_output', None) - scores = np.array(get_scores(*args, **kwargs), dtype=dtype) - else: - if all(isinstance(arg, np.ndarray) for arg in args): - psms = np.concatenate(args) - - if not isinstance(keyf, basestring): - keyf = np.array(keyf) - arr_flag = True - if not isinstance(isdecoy, basestring): - isdecoy = np.array(isdecoy) - arr_flag = True - - if arr_flag: - scores = np.empty(keyf.size if hasattr( - keyf, 'size') else isdecoy.size, dtype=dtype) - for func, label in zip((keyf, isdecoy), (score_label, decoy_or_pep_label)): - if not isinstance(func, basestring): - scores[label] = func - else: - scores[label] = psms[func] - else: - scores = np.empty(psms.shape[0], dtype=dtype) - scores[score_label] = psms[keyf] - scores[decoy_or_pep_label] = psms[isdecoy] - - if not scores.size: - if full and psms is not None: - return psms - return scores - - if not reverse: - keys = scores[decoy_or_pep_label], scores[score_label] - else: - keys = scores[decoy_or_pep_label], -scores[score_label] - lexsort = 
np.lexsort(keys) - scores = scores[lexsort] - if psms is not None: - psms = psms[lexsort] - - scores[q_label] = _calculate_qvalues(scores[score_label], scores[ - decoy_or_pep_label], peps is not None, **kwargs) - if remove_decoy: - if psms is not None: - psms = psms[~scores[decoy_or_pep_label]] - scores = scores[~scores[decoy_or_pep_label]] - - if full and psms is not None: - if isinstance(psms, np.ndarray): - fields = sorted(psms.dtype.fields, - key=lambda x: psms.dtype.fields[x][1]) - extra = [] - for func, label in zip((keyf, isdecoy), ('score', decoy_or_pep_label)): - if not (isinstance(func, basestring) or label in psms.dtype.fields): - extra.append(label) - elif label in psms.dtype.fields: - psms[label] = scores[label] - newdt = [(name, psms.dtype.fields[name][0]) for name in fields] + [ - (name, np.float64) for name in extra] + [(q_label, np.float64)] - psms_ = psms - psms = np.empty_like(psms_, dtype=newdt) - for f in fields: - psms[f] = psms_[f] - for f in extra: - psms[f] = scores[f] - else: - for func, label in zip((keyf, isdecoy), ('score', decoy_or_pep_label)): - if not isinstance(label, basestring): - psms[label] = scores[label] - psms[q_label] = scores[q_label] - return psms - return scores - - _fix_docstring(qvalues, is_decoy=is_decoy_prefix, key=key) - if read is _iter: - qvalues.__doc__ = qvalues.__doc__.replace("""positional args : file or str - Files to read PSMs from. All positional arguments are treated as - files.""", """positional args : iterables - Iterables to read PSMs from. All positional arguments are chained.""" - ).replace("""\n .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. - Default is `"DECOY_"`. - - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "") - - return qvalues - - -def _make_filter(read, is_decoy_prefix, is_decoy_suffix, key, qvalues): - """Create a function that reads PSMs from a file and filters them to - the desired FDR level (estimated by TDA), returning the top PSMs - sorted by `key`. 
- """ - def filter(*args, **kwargs): - try: - fdr = kwargs.pop('fdr') - except KeyError: - raise PyteomicsError('Keyword argument required: fdr') - - args = [list(arg) if not isinstance( - arg, (Container, Sized)) else arg for arg in args] - peps = kwargs.get('pep') - if peps is None: - remove_decoy = kwargs.pop('remove_decoy', True) - scores = qvalues(*args, remove_decoy=remove_decoy, **kwargs) - else: - scores = qvalues(*args, **kwargs) - keyf = kwargs.pop('key', key) - if keyf is None: - keyf = peps - reverse = kwargs.pop('reverse', False) - better = [op.lt, op.gt][bool(reverse)] - if 'is_decoy' not in kwargs: - if 'decoy_suffix' in kwargs: - isdecoy = lambda x: is_decoy_suffix(x, kwargs['decoy_suffix']) - elif 'decoy_prefix' in kwargs: - isdecoy = lambda x: is_decoy_prefix(x, kwargs['decoy_prefix']) - else: - isdecoy = is_decoy_prefix - else: - isdecoy = kwargs['is_decoy'] - kwargs.pop('formula', None) - decoy_or_pep_label = _decoy_or_pep_label(**kwargs) - score_label = kwargs.setdefault('score_label', 'score') - q_label = kwargs.get('q_label', 'q') - - try: - i = scores[q_label].searchsorted(fdr, side='right') - if isinstance(i, Sized): - i = i[0] - except AttributeError: - i = bisect_right(scores['q'], fdr) - if kwargs.pop('full_output', False): - if pd is not None and isinstance(scores, pd.DataFrame): - return scores.iloc[:i] - elif callable(keyf) or callable(isdecoy): - return scores['psm'][:i] - else: - return scores[:i] - elif not scores.size: - return (_ for _ in ()) - if peps is None: - label = score_label - else: - label = decoy_or_pep_label - cutoff = scores[label][i] if i < scores.size else ( - scores[label][-1] + (1, -1)[bool(reverse)]) - - def out(): - with read(*args, **kwargs) as f: - for p, s in zip(f, scores): - if peps is not None or not remove_decoy or not s[decoy_or_pep_label]: - if better(s[label], cutoff): - yield p - return out() - - def _filter(*args, **kwargs): - """Read `args` and yield only the PSMs that form a set with - estimated false discovery rate (FDR) not exceeding `fdr`. - - Requires :py:mod:`numpy` and, optionally, :py:mod:`pandas`. - - Parameters - ---------- - positional args : file or str - Files to read PSMs from. All positional arguments are treated as - files. The rest of the arguments must be named. - - fdr : float, keyword only, 0 <= fdr <= 1 - Desired FDR level. - - key : callable / array-like / iterable / str, keyword only - A function used for sorting of PSMs. Should accept exactly one - argument (PSM) and return a number (the smaller the better). The - default is a function that tries to extract e-value from the PSM. - - .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - reverse : bool, keyword only, optional - If :py:const:`True`, then PSMs are sorted in descending order, - i.e. the value of the key function is higher for better PSMs. - Default is :py:const:`False`. - - is_decoy : callable / array-like / iterable / str, keyword only - A function used to determine if the PSM is decoy or not. Should - accept exactly one argument (PSM) and return a truthy value if the - PSM should be considered decoy. - - .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. 
- Default is `"DECOY_"`. - - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. - - remove_decoy : bool, keyword only, optional - Defines whether decoy matches should be removed from the output. - Default is :py:const:`True`. - - .. note:: If set to :py:const:`False`, then by default the decoy - PSMs will be taken into account when estimating FDR. Refer to the - documentation of :py:func:`fdr` for math; basically, if - `remove_decoy` is :py:const:`True`, then formula 1 is used - to control output FDR, otherwise it's formula 2. This can be - changed by overriding the `formula` argument. - - formula : int, keyword only, optional - Can be either 1 or 2, defines which formula should be used for FDR - estimation. Default is 1 if `remove_decoy` is :py:const:`True`, - else 2 (see :py:func:`fdr` for definitions). - - ratio : float, keyword only, optional - The size ratio between the decoy and target databases. Default is - 1. In theory, the "size" of the database is the number of - theoretical peptides eligible for assignment to spectra that are - produced by *in silico* cleavage of that database. - - correction : int or float, keyword only, optional - Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. - - 0 (default): no correction; - - 1: enable "+1" correction. This accounts for the probability that a false - positive scores better than the first excluded decoy PSM; - - 2: this also corrects that probability for finite size of the sample, - so the correction will be slightly less than "+1". - - If a floating point number - is given, then instead of the expectation value for the number of false PSMs, - the confidence value is used. The value of `correction` is then interpreted as - desired confidence level. E.g., if correction=0.95, then the calculated q-values - do not exceed the "real" q-values with 95% probability. - - See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. - - pep : callable / array-like / iterable / str, keyword only, optional - If callable, a function used to determine the posterior error probability (PEP). - Should accept exactly one argument (PSM) and return a float. - If array-like, should contain float values for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`DataFrame`). - - .. note:: If this parameter is given, then PEP values will be used to calculate - q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with: - `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`. - `key` can still be provided. Without `key`, PSMs will be sorted by PEP. - - full_output : bool, keyword only, optional - If :py:const:`True`, then an array of PSM objects is returned. - Otherwise, an iterator / context manager object is returned, and the - files are parsed twice. This saves some RAM, but is ~2x slower. - Default is :py:const:`True`. - - .. note:: The name for the parameter comes from the fact that it is - internally passed to :py:func:`qvalues`. - - q_label : str, optional - Field name for q-value in the output. Default is ``'q'``. - - score_label : str, optional - Field name for score in the output. Default is ``'score'``. - - decoy_label : str, optional - Field name for the decoy flag in the output. 
Default is ``'is decoy'``. - - pep_label : str, optional - Field name for PEP in the output. Default is ``'PEP'``. - - **kwargs : passed to the :py:func:`chain` function. - - Returns - ------- - out : iterator or :py:class:`numpy.ndarray` or :py:class:`pandas.DataFrame` - """ - if kwargs.pop('full_output', True): - return filter(*args, full_output=True, **kwargs) - return IteratorContextManager(*args, parser_func=filter, **kwargs) - - _fix_docstring(_filter, is_decoy=is_decoy_prefix, key=key) - if read is _iter: - _filter.__doc__ = _filter.__doc__.replace("""positional args : file or str - Files to read PSMs from. All positional arguments are treated as - files.""", """positional args : iterables - Iterables to read PSMs from. All positional arguments are chained.""").replace( - """\n .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. - Default is `"DECOY_"`. - - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "") - return _filter - - -@contextmanager -def _itercontext(x, **kw): - try: - yield (row for i, row in x.iterrows()) - except AttributeError: - yield x - - -# _iter = _make_chain(_itercontext, 'iter') -_iter = ChainBase._make_chain(_itercontext) -qvalues = _make_qvalues(_iter, None, None, None) - -filter = _make_filter(_iter, None, None, None, qvalues) -filter.chain = _make_chain(filter, 'filter', True) -# filter.chain = TableJoiner._make_chain(filter) - - -try: - import numpy as np - _precalc_fact = np.log([math.factorial(n) for n in range(20)]) - - def log_factorial(x): - x = np.array(x) - pf = _precalc_fact - m = (x >= pf.size) - out = np.empty(x.shape) - out[~m] = pf[x[~m].astype(int)] - x = x[m] - out[m] = x * np.log(x) - x + 0.5 * np.log(2 * np.pi * x) - return out - - def _expectation(d, T, p=0.5): - if T is None: - return d + 1 - T = np.array(T, dtype=int) - m = np.arange(T.max() + 1, dtype=int) - pi = np.exp(_log_pi(d, m, p)) - return ((m * pi).cumsum() / pi.cumsum())[T] - - def _confidence_value(conf, d, T, p=0.5): - if T is not None: - T = np.array(T, dtype=int) - m = np.arange(T.max() + 1, dtype=int) - else: - m = np.arange(max(50 * d, 10000)) - log_pi = _log_pi(d, m, p) - pics = np.exp(log_pi).cumsum() - return np.searchsorted(pics, conf * (pics[T] if T is not None else 1)) - -except ImportError: - def log_factorial(n): - if n > 10: - return n * math.log(n) - n + 0.5 * math.log(2 * math.pi * n) - else: - return math.log(math.factorial(n)) - - def _expectation(*a, **k): - raise NotImplementedError('NumPy required') - - def _confidence_value(*a, **k): - raise NotImplementedError('NumPy required') - - -def _log_pi_r(d, k, p=0.5): - return k * math.log(p) + log_factorial(k + d) - log_factorial(k) - log_factorial(d) - - -def _log_pi(d, k, p=0.5): - return _log_pi_r(d, k, p) + (d + 1) * math.log(1 - p) - - -def _count_psms(psms, is_decoy, pep, decoy_prefix, decoy_suffix, is_decoy_prefix, is_decoy_suffix): - total, decoy = 0, 0 - if pep is not None: - is_decoy = pep - elif is_decoy is None: - if decoy_suffix is not None: - 
is_decoy = lambda x: is_decoy_suffix(x, decoy_suffix) - else: - is_decoy = lambda x: is_decoy_prefix(x, decoy_prefix) - if isinstance(is_decoy, basestring): - decoy = psms[is_decoy].sum() - total = psms.shape[0] - elif callable(is_decoy): - for psm in psms: - total += 1 - d = is_decoy(psm) - decoy += d if pep is not None else bool(d) - else: - if not isinstance(is_decoy, (Sized, Container)): - is_decoy = list(is_decoy) - if pep is not None: - decoy = sum(is_decoy) - else: - decoy = sum(map(bool, is_decoy)) - total = len(is_decoy) - return decoy, total - - -def _make_fdr(is_decoy_prefix, is_decoy_suffix): - def fdr(psms=None, formula=1, is_decoy=None, ratio=1, correction=0, pep=None, decoy_prefix='DECOY_', decoy_suffix=None): - """Estimate FDR of a data set using TDA or given PEP values. - Two formulas can be used. The first one (default) is: - - .. math:: - - FDR = \\frac{N_{decoy}}{N_{target} * ratio} - - The second formula is: - - .. math:: - - FDR = \\frac{N_{decoy} * (1 + \\frac{1}{ratio})}{N_{total}} - - .. note:: - This function is less versatile than :py:func:`qvalues`. To obtain FDR, - you can call :py:func:`qvalues` and take the last q-value. This function - can be used (with `correction = 0` or `1`) when :py:mod:`numpy` is not available. - - Parameters - ---------- - psms : iterable, optional - An iterable of PSMs, e.g. as returned by :py:func:`read`. - Not needed if `is_decoy` is an iterable. - - formula : int, optional - Can be either 1 or 2, defines which formula should be used for FDR - estimation. Default is 1. - - is_decoy : callable, iterable, or str - If callable, should accept exactly one argument (PSM) and return a truthy value - if the PSM is considered decoy. Default is :py:func:`is_decoy`. - If array-like, should contain float values for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`pandas.DataFrame`). - - .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. - Default is `"DECOY_"`. - - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. - - pep : callable, iterable, or str, optional - If callable, a function used to determine the posterior error probability (PEP). - Should accept exactly one argument (PSM) and return a float. - If array-like, should contain float values for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`pandas.DataFrame`). - - .. note:: If this parameter is given, then PEP values will be used to calculate FDR. - Otherwise, decoy PSMs will be used instead. This option conflicts with: - `is_decoy`, `formula`, `ratio`, `correction`. - - ratio : float, optional - The size ratio between the decoy and target databases. Default is 1. - In theory, the "size" of the database is the number of - theoretical peptides eligible for assignment to spectra that are - produced by *in silico* cleavage of that database. 
- - correction : int or float, optional - Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. - - 0 (default): no correction; - - 1: enable "+1" correction. This accounts for the probability that a false - positive scores better than the first excluded decoy PSM; - - 2: this also corrects that probability for finite size of the sample, - so the correction will be slightly less than "+1". - - If a floating point number - is given, then instead of the expectation value for the number of false PSMs, - the confidence value is used. The value of `correction` is then interpreted as - desired confidence level. E.g., if correction=0.95, then the calculated q-values - do not exceed the "real" q-values with 95% probability. - - See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. - - .. note:: - Requires :py:mod:`numpy`, if `correction` is a float or 2. - - .. note:: - Correction is only needed if the PSM set at hand was obtained using TDA - filtering based on decoy counting (as done by using :py:func:`!filter` without - `correction`). - - Returns - ------- - out : float - The estimation of FDR, (roughly) between 0 and 1. - """ - if formula not in {1, 2}: - raise PyteomicsError('`formula` must be either 1 or 2.') - if ratio == 0: - raise PyteomicsError('Size ratio cannot be zero!') - - decoy, total = _count_psms(psms, is_decoy, pep, decoy_prefix, decoy_suffix, is_decoy_prefix, is_decoy_suffix) - if pep is not None: - return float(decoy) / total - tfalse = decoy - if correction == 1 or (correction == 2 and total / decoy > 10): - tfalse += 1 - elif correction == 2: - p = 1. / (1. + ratio) - tfalse = _expectation(decoy, total - decoy, p) - elif 0 < correction < 1: - p = 1. / (1. + ratio) - tfalse = _confidence_value(correction, decoy, total - decoy, p) - if formula == 1: - if total == decoy: - raise PyteomicsError('Cannot compute FDR using formula 1: no target IDs found.') - return float(tfalse) / (total - decoy) / ratio - return (decoy + tfalse / ratio) / total - - _fix_docstring(fdr, is_decoy=is_decoy_prefix) - if is_decoy_prefix is None: - fdr.__doc__ = fdr.__doc__.replace( - """\n .. warning:: - The default function may not work - with your files, because format flavours are diverse. - - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. - Default is `"DECOY_"`. - - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`.\n""", "") - return fdr - - -fdr = _make_fdr(None, None) - -def _sigma_T(decoy, ratio): - return math.sqrt((decoy + 1) * (ratio + 1) / (ratio * ratio)) - -def sigma_T(psms, is_decoy, ratio=1): - """Calculates the standard error for the number of false positive target PSMs. - - The formula is:: - - .. math :: - - \\sigma(T) = \\sqrt{\\frac{(d + 1) \\cdot {p}}{(1 - p)^{2}}} = \\sqrt{\\frac{d+1}{r^{2}} \\cdot (r+1)} - - This estimation is accurate for low FDRs. - See the `article <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for more details. 
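A worked example of the two formulas with `correction=0` (counts illustrative):

    import numpy as np

    flags = np.zeros(100, dtype=bool)
    flags[[10, 50]] = True             # 2 decoy hits among 100 PSMs

    fdr(is_decoy=flags, formula=1)     # 2 / (100 - 2) / 1  ~= 0.0204
    fdr(is_decoy=flags, formula=2)     # (2 + 2 / 1) / 100  = 0.04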
- """ - decoy, total = _count_psms(psms, is_decoy, None, None, None, None, None) - return _sigma_T(decoy, ratio) - -def sigma_fdr(psms=None, formula=1, is_decoy=None, ratio=1): - """Calculates the standard error of FDR using the formula for negative binomial distribution. - See :py:func:`sigma_T` for math. This estimation is accurate for low FDRs. - See also the `article <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for more details. - """ - - if formula not in {1, 2}: - raise PyteomicsError('`formula` must be either 1 or 2.') - decoy, total = _count_psms(psms, is_decoy, None, None, None, None, None) - sigmaT = _sigma_T(decoy, ratio) - if formula == 1: - return sigmaT / (total - decoy) / ratio - return sigmaT / total / ratio diff --git a/pyteomics/auxiliary/utils.py b/pyteomics/auxiliary/utils.py deleted file mode 100644 index 728b38507f3845b2923833fe4f50c4e9e5b4de1b..0000000000000000000000000000000000000000 --- a/pyteomics/auxiliary/utils.py +++ /dev/null @@ -1,317 +0,0 @@ -from __future__ import print_function - -import base64 -import zlib -from functools import wraps -from collections import namedtuple - - -try: - basestring -except NameError: - basestring = (str, bytes) - -try: - import numpy as np -except ImportError: - np = None - -try: - import pynumpress -except ImportError: - pynumpress = None - -from .structures import PyteomicsError - -def print_tree(d, indent_str=' -> ', indent_count=1): - """Read a nested dict (with strings as keys) and print its structure. - """ - def structure(d): - out = {} - for k, v in d.items(): - if isinstance(v, dict): - out[k] = structure(v) - elif isinstance(v, list) and v and isinstance(v[0], dict): - out['{} [list]'.format(k)] = structure(v[0]) - else: - out[k] = None - return out - - def _print(d, level=0): - for k, v in d.items(): - print('{}{}'.format(indent_str * indent_count * level, k)) - if v is not None: - _print(v, level + 1) - _print(structure(d)) - - -def memoize(maxsize=1000): - """Make a memoization decorator. A negative value of `maxsize` means - no size limit.""" - def deco(f): - """Memoization decorator. Items of `kwargs` must be hashable.""" - memo = {} - - @wraps(f) - def func(*args, **kwargs): - key = (args, frozenset(kwargs.items())) - if key not in memo: - if len(memo) == maxsize: - memo.popitem() - memo[key] = f(*args, **kwargs) - return memo[key] - return func - return deco - - -def _decode_base64_data_array(source, dtype, is_compressed): - """Read a base64-encoded binary array. - - Parameters - ---------- - source : str - A binary array encoded with base64. - dtype : dtype - The type of the array in numpy dtype notation. - is_compressed : bool - If True then the array will be decompressed with zlib. 
- - Returns - ------- - out : numpy.array - """ - - decoded_source = base64.b64decode(source.encode('ascii')) - if is_compressed: - decoded_source = zlib.decompress(decoded_source) - output = np.frombuffer(bytearray(decoded_source), dtype=dtype) - return output - - -_default_compression_map = { - 'no compression': lambda x: x, - 'zlib compression': zlib.decompress, -} - - -def _pynumpressDecompress(decoder): - def decode(data): - return decoder(np.frombuffer(data, dtype=np.uint8)) - return decode - - -def _zlibNumpress(decoder): - def decode(data): - return decoder(np.frombuffer(zlib.decompress(data), dtype=np.uint8)) - return decode - - -if pynumpress: - _default_compression_map.update( - { - 'MS-Numpress short logged float compression': _pynumpressDecompress(pynumpress.decode_slof), - 'MS-Numpress positive integer compression': _pynumpressDecompress(pynumpress.decode_pic), - 'MS-Numpress linear prediction compression': _pynumpressDecompress(pynumpress.decode_linear), - 'MS-Numpress short logged float compression followed by zlib compression': _zlibNumpress(pynumpress.decode_slof), - 'MS-Numpress positive integer compression followed by zlib compression': _zlibNumpress(pynumpress.decode_pic), - 'MS-Numpress linear prediction compression followed by zlib compression': _zlibNumpress(pynumpress.decode_linear), - }) - - -class ArrayConversionMixin(object): - _dtype_dict = {} - _array_keys = ['m/z array', 'intensity array'] - - def __init__(self, *args, **kwargs): - self._dtype_dict = {None: None} - dtype = kwargs.pop('dtype', None) - if isinstance(dtype, dict): - self._dtype_dict.update(dtype) - elif dtype: - self._dtype_dict = {k: dtype for k in self._array_keys} - self._dtype_dict[None] = dtype - self._convert_arrays = kwargs.pop('convert_arrays', 1) - if self._convert_arrays and np is None: - raise PyteomicsError('numpy is required for array conversion') - super(ArrayConversionMixin, self).__init__(*args, **kwargs) - - def __getstate__(self): - state = super(ArrayConversionMixin, self).__getstate__() - state['_dtype_dict'] = self._dtype_dict - state['_convert_arrays'] = self._convert_arrays - state['_array_keys'] = self._array_keys - return state - - def __setstate__(self, state): - super(ArrayConversionMixin, self).__setstate__(state) - self._dtype_dict = state['_dtype_dict'] - self._convert_arrays = state['_convert_arrays'] - self._array_keys = state['_array_keys'] - - def _build_array(self, k, data): - dtype = self._dtype_dict.get(k) - return np.array(data, dtype=dtype) - - def _convert_array(self, k, array): - dtype = self._dtype_dict.get(k) - if dtype is not None: - return array.astype(dtype) - return array - - def _build_all_arrays(self, info): - if self._convert_arrays: - for k in self._array_keys: - if k in info: - info[k] = self._build_array(k, info[k]) - - -class MaskedArrayConversionMixin(ArrayConversionMixin): - _masked_array_keys = ['charge array'] - _mask_value = 0 - - def __init__(self, *args, **kwargs): - self._convert_arrays = kwargs.pop('convert_arrays', 2) - kwargs['convert_arrays'] = self._convert_arrays - super(MaskedArrayConversionMixin, self).__init__(*args, **kwargs) - - def __getstate__(self): - state = super(MaskedArrayConversionMixin, self).__getstate__() - state['_masked_array_keys'] = self._masked_array_keys - state['_mask_value'] = self._mask_value - return state - - def __setstate__(self, state): - super(MaskedArrayConversionMixin, self).__setstate__(state) - self._masked_array_keys = state['_masked_array_keys'] - self._mask_value = state['_mask_value'] - - 
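-    # Note on the helpers below (sketch of the intended behavior): masked
-    # arrays hide entries equal to self._mask_value, so a charge array
-    # [2, 0, 3] with the default mask value 0 behaves like
-    # np.ma.masked_equal([2, 0, 3], 0), where the unknown (zero) charge
-    # is masked out.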
def _build_masked_array(self, k, data):
-        array = self._build_array(k, data)
-        return self._convert_masked_array(k, array)
-
-    def _convert_masked_array(self, k, array):
-        return np.ma.masked_equal(array, self._mask_value)
-
-    def _ensure_masked_array(self, k, data):
-        if isinstance(data, np.ndarray):
-            return self._convert_masked_array(k, data)
-        return self._build_masked_array(k, data)
-
-    def _build_all_arrays(self, info):
-        super(MaskedArrayConversionMixin, self)._build_all_arrays(info)
-        if self._convert_arrays == 2:
-            for k in self._masked_array_keys:
-                if k in info:
-                    info[k] = self._ensure_masked_array(k, info[k])
-
-
-if np is not None:
-    class BinaryDataArrayTransformer(object):
-        """A base class that provides methods for reading
-        base64-encoded binary arrays.
-
-        Attributes
-        ----------
-        compression_type_map : dict
-            Maps compressor type name to decompression function
-        """
-
-        compression_type_map = _default_compression_map
-
-        class binary_array_record(namedtuple(
-                "binary_array_record", ("data", "compression", "dtype", "source", "key"))):
-            """Hold all of the information about a base64 encoded array needed to
-            decode the array.
-            """
-
-            def decode(self):
-                """Decode :attr:`data` into a numerical array
-
-                Returns
-                -------
-                np.ndarray
-                """
-                return self.source._decode_record(self)
-
-        def _make_record(self, data, compression, dtype, key=None):
-            return self.binary_array_record(data, compression, dtype, self, key)
-
-        def _decode_record(self, record):
-            array = self.decode_data_array(
-                record.data, record.compression, record.dtype)
-            return self._finalize_record_conversion(array, record)
-
-        def _finalize_record_conversion(self, array, record):
-            return array
-
-        def _base64_decode(self, source):
-            decoded_source = base64.b64decode(source.encode('ascii'))
-            return decoded_source
-
-        def _decompress(self, source, compression_type=None):
-            if compression_type is None:
-                return source
-            decompressor = self.compression_type_map.get(compression_type)
-            decompressed_source = decompressor(source)
-            return decompressed_source
-
-        def _transform_buffer(self, binary, dtype):
-            if isinstance(binary, np.ndarray):
-                return binary.astype(dtype, copy=False)
-            return np.frombuffer(binary, dtype=dtype)
-
-        def decode_data_array(self, source, compression_type=None, dtype=np.float64):
-            """Decode a base64-encoded, compressed bytestring into a numerical
-            array.
-
-            Parameters
-            ----------
-            source : bytes
-                A base64 string encoding a potentially compressed numerical
-                array.
-            compression_type : str, optional
-                The name of the compression method used before encoding the
-                array into base64.
-            dtype : type, optional
-                The data type to use to decode the binary array from the
-                decompressed bytes.
-
-            Returns
-            -------
-            np.ndarray
-            """
-            binary = self._base64_decode(source)
-            binary = self._decompress(binary, compression_type)
-            if isinstance(binary, bytes):
-                binary = bytearray(binary)
-            array = self._transform_buffer(binary, dtype)
-            return array
-
-
-    class BinaryArrayConversionMixin(ArrayConversionMixin, BinaryDataArrayTransformer):
-        def _finalize_record_conversion(self, array, record):
-            key = record.key
-            return self._convert_array(key, array)
-
-
-else:
-    BinaryDataArrayTransformer = None
-    BinaryArrayConversionMixin = None
-
-
-def add_metaclass(metaclass):
-    """Class decorator for creating a class with a metaclass."""
-    def wrapper(cls):
-        orig_vars = cls.__dict__.copy()
-        slots = orig_vars.get('__slots__')
-        if slots is not None:
-            if isinstance(slots, str):
-                slots = [slots]
-            for slots_var in slots:
-                orig_vars.pop(slots_var)
-        orig_vars.pop('__dict__', None)
-        orig_vars.pop('__weakref__', None)
-        if hasattr(cls, '__qualname__'):
-            orig_vars['__qualname__'] = cls.__qualname__
-        return metaclass(cls.__name__, cls.__bases__, orig_vars)
-    return wrapper
diff --git a/pyteomics/electrochem.py b/pyteomics/electrochem.py
deleted file mode 100644
index 4fa2937d0cc3f7b96639a749054368d4e969295b..0000000000000000000000000000000000000000
--- a/pyteomics/electrochem.py
+++ /dev/null
@@ -1,499 +0,0 @@
-"""
-electrochem - electrochemical properties of polypeptides
-========================================================
-
-Summary
--------
-
-This module is used to calculate the
-electrochemical properties of polypeptide molecules.
-
-The theory behind most of this module is based on the Henderson-Hasselbalch
-equation and was thoroughly described in a number of sources [#Aronson]_,
-[#Moore]_.
-
-Briefly, the formula for the charge of a polypeptide at a given pH is the following:
-
-.. math::
-
-    Q_{peptide} = \\sum{\\frac{Q_i}{1+10^{Q_i(pH-pK_i)}}},
-
-where the sum is taken over all ionizable groups of the polypeptide, and
-:math:`Q_i` is -1 and +1 for acidic and basic functional groups,
-respectively.
-
-Charge and pI functions
------------------------
-
-    :py:func:`charge` - calculate the charge of a polypeptide
-
-    :py:func:`pI` - calculate the isoelectric point of a polypeptide
-
-
-GRand AVerage of hYdropathicity (GRAVY)
----------------------------------------
-
-    :py:func:`gravy` - calculate the GRAVY index of a polypeptide
-
-
-Data
-----
-
-    :py:data:`pK_lehninger` - a set of pK from [#Lehninger]_.
-
-    :py:data:`pK_sillero` - a set of pK from [#Sillero]_.
-
-    :py:data:`pK_dawson` - a set of pK from [#Dawson]_, the pK values for NH2-
-    and -OH are taken from [#Sillero]_.
-
-    :py:data:`pK_rodwell` - a set of pK from [#Rodwell]_.
-
-    :py:data:`pK_bjellqvist` - a set of pK from [#Bjellqvist]_.
-
-    :py:data:`pK_nterm_bjellqvist` - a set of N-terminal pK from [#Bjellqvist]_.
-
-    :py:data:`pK_cterm_bjellqvist` - a set of C-terminal pK from [#Bjellqvist]_.
-
-    :py:data:`hydropathicity_KD` - a set of hydropathicity indexes from [#Kyte]_.
-
-
-References
-----------
-
-.. [#Aronson] Aronson, J. N. The Henderson-Hasselbalch equation
-    revisited. Biochemical Education, 1983, 11 (2), 68.
-    `Link. <http://dx.doi.org/10.1016/0307-4412(83)90046-8>`_
-
-.. [#Moore] Moore, D. S. Amino acid and peptide net charges: A
-    simple calculational procedure. Biochemical Education, 1986, 13 (1), 10-12.
-    `Link. <http://dx.doi.org/10.1016/0307-4412(85)90114-1>`_
-
-.. [#Lehninger] Nelson, D. L.; Cox, M. M. Lehninger Principles of
-    Biochemistry, Fourth Edition; W. H. Freeman, 2004; p. 1100.
-
-.. [#Sillero] Sillero, A.; Ribeiro, J. Isoelectric points of proteins:
-    Theoretical determination. Analytical Biochemistry, 1989, 179 (2), 319-325.
-    `Link. <http://dx.doi.org/10.1016/0003-2697(89)90136-X>`_
-
-.. [#Dawson] Dawson, R. M. C.; Elliot, D. C.; Elliot, W. H.; Jones, K. M.
-    Data for biochemical research. Oxford University Press, 1989; p. 592.
-
-.. [#Rodwell] Rodwell, J. Heterogeneity of component bands in isoelectric
-    focusing patterns. Analytical Biochemistry, 1982, 119 (2), 440-449.
-    `Link. <http://dx.doi.org/10.1016/0003-2697(82)90611-X>`_
-
-.. [#Bjellqvist] Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E.
-    Reference points for comparisons of two-dimensional maps of proteins from
-    different human cell types defined in a pH scale where isoelectric points
-    correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539.
-    `Link. <http://dx.doi.org/10.1002/elps.1150150171>`_
-
-.. [#Kyte] Kyte, J.; Doolittle, R. F.
-    A simple method for displaying the hydropathic character of a protein.
-    Journal of molecular biology 1982, 157 (1), 105-32.
-    `Link. <https://doi.org/10.1016/0022-2836(82)90515-0>`_
-
--------------------------------------------------------------------------------
-
-"""
-
-# Copyright 2012 Anton Goloborodko, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-from . import parser
-from .auxiliary import PyteomicsError
-from collections import Counter
-try:
-    from collections.abc import Iterable
-except ImportError:
-    from collections import Iterable
-
-
-def charge(sequence, pH, **kwargs):
-    """Calculate the charge of a polypeptide at a given pH or list of pHs using
-    a given list of amino acid electrochemical properties.
-
-    .. warning::
-
-        Be careful when supplying a list with a parsed sequence or a dict with
-        amino acid composition as `sequence`. Such values must be obtained
-        with enabled `show_unmodified_termini` option.
-
-    .. warning::
-
-        If you provide `pK_nterm` or `pK_cterm` and provide `sequence` as a dict,
-        it is assumed that it was obtained with ``term_aa=True`` (see
-        :py:func:`pyteomics.parser.amino_acid_composition` for details).
-
-    Parameters
-    ----------
-    sequence : str or list or dict
-        A string with a polypeptide sequence, a list with a parsed
-        sequence or a dict of amino acid composition.
-    pH : float or iterable of floats
-        pH or iterable of pHs for which the charge is calculated.
-    pK : dict {str: [(float, int), ...]}, optional
-        A set of pK of amino acids' ionizable groups. It is a dict, where keys
-        are amino acid labels and the values are lists of tuples (pK,
-        charge_in_ionized_state), a tuple per ionizable group. The default
-        value is `pK_lehninger`.
-
-    pK_nterm : dict {str: [(float, int),]}, optional
-    pK_cterm : dict {str: [(float, int),]}, optional
-        Sets of pK of N-terminal and C-terminal (respectively) amino acids'
-        ionizable groups. Dicts with the same structure as ``pK``. These
-        values (if present) are used for N-terminal and C-terminal residues,
-        respectively. If given, `sequence` must be a :py:class:`str` or a
-        :py:class:`list`. The default value is an empty dict.
-
-    Returns
-    -------
-    out : float or list of floats
-        A single value of charge or a list of charges.
-    """
-
-    peptide_dict, pK = _prepare_charge_dict(sequence, **kwargs)
-
-    # Process the case when pH is a single float.
-    pH_list = pH if isinstance(pH, Iterable) else [pH,]
-
-    charge_list = _charge_for_dict(peptide_dict, pH_list, pK)
-    return charge_list[0] if not isinstance(pH, Iterable) else charge_list
-
-
-def _prepare_charge_dict(sequence, **kwargs):
-    nterm = cterm = n_aa = c_aa = None
-    pK = kwargs.get('pK', pK_lehninger).copy()
-    pK_nterm = kwargs.get('pK_nterm', {})
-    pK_cterm = kwargs.get('pK_cterm', {})
-
-    if isinstance(sequence, dict):
-        peptide_dict = sequence.copy()
-        for k, v in sequence.items():
-            if k[-1] == '-':
-                if v > 1 or nterm:
-                    raise PyteomicsError(
-                        'More than one N-terminal group in {}'.format(
-                            sequence))
-                nterm = k
-            if k[0] == '-':
-                if v > 1 or cterm:
-                    raise PyteomicsError(
-                        'More than one C-terminal group in {}'.format(
-                            sequence))
-                cterm = k
-            if k[:5] == 'nterm':
-                if v > 1 or n_aa:
-                    raise PyteomicsError(
-                        'More than one N-terminal residue in {}'.format(
-                            sequence))
-                n_aa = k[5:]
-                peptide_dict[n_aa] = peptide_dict.get(n_aa, 0) + 1
-            if k[:5] == 'cterm':
-                if v > 1 or c_aa:
-                    raise PyteomicsError(
-                        'More than one C-terminal residue in {}'.format(
-                            sequence))
-                c_aa = k[5:]
-                peptide_dict[c_aa] = peptide_dict.get(c_aa, 0) + 1
-
-        if nterm is None or cterm is None:
-            raise PyteomicsError('Peptide must have two explicit terminal groups')
-        if (n_aa is None or c_aa is None) and (pK_nterm or pK_cterm):
-            raise PyteomicsError('Two terminal residues must be present in '
-                'peptide (designated as "ntermX" and "ctermX", where "X" is '
-                'the one-letter residue label). Use '
-                '``term_aa=True`` when calling '
-                '`parser.amino_acid_composition`.')
-
-    elif isinstance(sequence, (str, list)):
-        if isinstance(sequence, str):
-            if sequence.isupper() and sequence.isalpha():
-                parsed_sequence = [parser.std_nterm] + list(sequence) + [parser.std_cterm]
-            else:
-                parsed_sequence = parser.parse(sequence, show_unmodified_termini=True)
-        elif isinstance(sequence, list):
-            if sequence[0][-1] != '-' or sequence[-1][0] != '-':
-                raise PyteomicsError('Parsed sequences must contain terminal '
-                                     'groups at 0-th and last positions.')
-            parsed_sequence = sequence
-
-        n_aa = parsed_sequence[1]
-        c_aa = parsed_sequence[-2]
-        nterm = parsed_sequence[0]
-        cterm = parsed_sequence[-1]
-        peptide_dict = Counter(parsed_sequence)
-
-    else:
-        raise PyteomicsError('Unsupported type of sequence: %s' % type(sequence))
-
-    if nterm in pK_nterm:
-        if n_aa in pK_nterm[nterm]:
-            pK[nterm] = pK_nterm[nterm][n_aa]
-    if cterm in pK_cterm:
-        if c_aa in pK_cterm[cterm]:
-            pK[cterm] = pK_cterm[cterm][c_aa]
-
-    return peptide_dict, pK
-
-
-def _charge_for_dict(peptide_dict, pH_list, pK):
-    # Calculate the charge for each value of pH.
-    charge_list = []
-    for pH_value in pH_list:
-        charge = 0
-        for aa in peptide_dict:
-            for ionizable_group in pK.get(aa, []):
-                charge += peptide_dict[aa] * ionizable_group[1] * (
-                    1. / (1. + 10 ** (ionizable_group[1] * (pH_value - ionizable_group[0]))))
-        charge_list.append(charge)
-
-    return charge_list
-
-
-def pI(sequence, pI_range=(0.0, 14.0), precision_pI=0.01, **kwargs):
-    """Calculate the isoelectric point of a polypeptide using a given set
-    of amino acids' electrochemical properties.
-
-    .. warning::
-
-        Be careful when supplying a list with a parsed sequence or a dict with
-        amino acid composition as `sequence`. Such values must be obtained
-        with enabled `show_unmodified_termini` option.
-
-    Parameters
-    ----------
-    sequence : str or list or dict
-        A string with a polypeptide sequence, a list with a parsed
-        sequence or a dict of amino acid composition.
-    pI_range : tuple (float, float)
-        The range of allowable pI values. Default is (0.0, 14.0).
-    precision_pI : float
-        The precision of the calculated pI. Default is 0.01.
-    pK : dict {str: [(float, int), ...]}, optional
-        A set of pK of amino acids' ionizable groups. It is a dict, where keys
-        are amino acid labels and the values are lists of tuples (pK,
-        charge_in_ionized_state), a tuple per ionizable group. The default
-        value is `pK_lehninger`.
-    pK_nterm : dict {str: [(float, int),]}, optional
-    pK_cterm : dict {str: [(float, int),]}, optional
-        Sets of pK of N-terminal and C-terminal (respectively) amino acids'
-        ionizable groups. Dicts with the same structure as ``pK``. These
-        values (if present) are used for N-terminal and C-terminal residues,
-        respectively. If given, `sequence` must be a :py:class:`str` or a
-        :py:class:`list`. The default value is an empty dict.
-
-    Returns
-    -------
-    out : float
-    """
-
-    pK = kwargs.get('pK', pK_lehninger.copy())
-    pK_nterm = {}
-    pK_cterm = {}
-    if isinstance(sequence, str) or isinstance(sequence, list):
-        pK_nterm = kwargs.get('pK_nterm', {})
-        pK_cterm = kwargs.get('pK_cterm', {})
-    elif isinstance(sequence, dict) and (('pK_nterm' in kwargs) or ('pK_cterm' in kwargs)):
-        raise PyteomicsError('Can not use terminal features for %s' % type(sequence))
-
-    peptide_dict, pK = _prepare_charge_dict(sequence, pK=pK, pK_cterm=pK_cterm, pK_nterm=pK_nterm)
-    # The algorithm is based on the fact that charge(pH) is a monotonic function.
-    left_x, right_x = pI_range
-    left_y = _charge_for_dict(peptide_dict, [left_x], pK)[0]
-    right_y = _charge_for_dict(peptide_dict, [right_x], pK)[0]
-    while (right_x - left_x) > precision_pI:
-        if left_y * right_y > 0:
-            return left_x if abs(left_y) < abs(right_y) else right_x
-        middle_x = (left_x + right_x) / 2.0
-        middle_y = _charge_for_dict(peptide_dict, [middle_x], pK)[0]
-        if middle_y * left_y < 0:
-            right_x = middle_x
-            right_y = middle_y
-        else:
-            left_x = middle_x
-            left_y = middle_y
-    return (left_x + right_x) / 2.0
-
-
-pK_lehninger = {
-    'E': [(4.25, -1)],
-    'R': [(12.48, 1)],
-    'Y': [(10.07, -1)],
-    'D': [(3.65, -1)],
-    'H': [(6.00, +1)],
-    'K': [(10.53, +1)],
-    'C': [(8.18, -1)],
-    'H-': [(9.69, +1)],
-    '-OH': [(2.34, -1)],
-    }
-"""A set of pK from Nelson, D. L.; Cox, M. M. Lehninger Principles of
-Biochemistry, Fourth Edition; W. H. Freeman, 2004; p. 1100.
-"""
-
-pK_sillero = {
-    'E': [(4.5, -1)],
-    'R': [(12.0, +1)],
-    'Y': [(10.0, -1)],
-    'D': [(4.0, -1)],
-    'H': [(6.4, +1)],
-    'K': [(10.4, +1)],
-    'C': [(9.0, -1)],
-    'H-': [(8.2, +1)],
-    '-OH': [(3.2, -1)],
-    }
-"""A set of pK from Sillero, A.; Ribeiro, J. Isoelectric points of proteins:
-Theoretical determination. Analytical Biochemistry, vol. 179 (2), pp. 319-325,
-1989.
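-
-For example (sketch), the charge at pH 7.0 with this pK set can be computed
-as ``charge('PEPTIDE', 7.0, pK=pK_sillero)``.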
-""" - -pK_dawson = { - 'E': [(4.3, -1)], - 'R': [(12.0, +1)], - 'Y': [(10.1, -1)], - 'D': [(3.9, -1)], - 'H': [(6.0, +1)], - 'K': [(10.5, +1)], - 'C': [(8.3, -1)], - 'H-': [(8.2, +1)], - '-OH': [(3.2, -1)], - } -"""A set of pK from Dawson, R. M. C.; Elliot, D. C.; Elliot, W. H.; Jones, -K. M. Data for biochemical research. Oxford University Press, 1989; p. 592. -pKs for NH2- and -OH are taken from `pK_sillero`. -""" - -pK_rodwell = { - 'E': [(4.25, -1)], - 'R': [(11.5, +1)], - 'Y': [(10.7, -1)], - 'D': [(3.86, -1)], - 'H': [(6.0, +1)], - 'K': [(11.5, +1)], - 'C': [(8.33, -1)], - 'H-': [(8.0, +1)], - '-OH': [(3.1, -1)], -} -"""A set of pK from Rodwell, J. Heterogeneity of component bands in -isoelectric focusing patterns. Analytical Biochemistry, vol. 119 (2), -pp. 440-449, 1982. -""" - -pK_bjellqvist = { - 'E': [(4.45, -1)], - 'R': [(12.0, +1)], - 'Y': [(10.0, -1)], - 'D': [(4.05, -1)], - 'H': [(5.98, +1)], - 'K': [(10.0, +1)], - 'C': [(9.0, -1)], - 'H-': [(7.5, +1)], - '-OH': [(3.55, -1)], -} -""" -A set of pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E. -Reference points for comparisons of two-dimensional maps of proteins from -different human cell types defined in a pH scale where isoelectric points -correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539. -""" - -pK_nterm_bjellqvist = { - 'H-': { - 'A': [(7.59, +1)], - 'M': [(7.0, +1)], - 'S': [(6.93, +1)], - 'P': [(8.36, +1)], - 'T': [(6.82, +1)], - 'V': [(7.44, +1)], - 'E': [(7.7, +1)] - } - } -""" -A set of N-terminal pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E. -Reference points for comparisons of two-dimensional maps of proteins from -different human cell types defined in a pH scale where isoelectric points -correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539. -""" - -pK_cterm_bjellqvist = { - '-OH': { - 'D': [(4.55, -1)], - 'E': [(4.75, -1)] - } - } -""" -A set of C-terminal pK from Bjellqvist, B., Basse, B., Olsen, E. and Celis, J.E. -Reference points for comparisons of two-dimensional maps of proteins from -different human cell types defined in a pH scale where isoelectric points -correlate with polypeptide compositions. Electrophoresis 1994, 15, 529-539. -""" - -hydropathicity_KD = { - "A": 1.800, - "R": -4.500, - "N": -3.500, - "D": -3.500, - "C": 2.500, - "Q": -3.500, - "E": -3.500, - "G": -0.400, - "H": -3.200, - "I": 4.500, - "L": 3.800, - "K": -3.900, - "M": 1.900, - "F": 2.800, - "P": -1.600, - "S": -0.800, - "T": -0.700, - "W": -0.900, - "Y": -1.300, - "V": 4.200, -} -""" -A set of hydropathicity indexes obtained from Kyte J., Doolittle F. J. Mol. Biol. 157:105-132 (1982). -""" - - -def gravy(sequence, hydropathicity=hydropathicity_KD): - """ - Calculate GRand AVerage of hYdropathicity (GRAVY) index for amino acid sequence. - - Parameters - ---------- - sequence : str - Polypeptide sequence in one-letter format. - hydropathicity : dict, optional - Hydropathicity indexes of amino acids. Default is :py:data:`hydropathicity_KD`. - - Returns - ------- - out : float - GRand AVerage of hYdropathicity (GRAVY) index. 
- - Examples - >>> gravy('PEPTIDE') - -1.4375 - """ - try: - return sum(hydropathicity[aa] for aa in sequence) / len(sequence) - except KeyError as e: - raise PyteomicsError("Hydropathicity for amino acid {} not provided.".format(e.args[0])) - - -if __name__ == "__main__": - import doctest - - doctest.testmod() diff --git a/pyteomics/fasta.py b/pyteomics/fasta.py deleted file mode 100644 index 61d38c3e36da9eb2b748ae086caf006f9639bed7..0000000000000000000000000000000000000000 --- a/pyteomics/fasta.py +++ /dev/null @@ -1,1072 +0,0 @@ -""" -fasta - manipulations with FASTA databases -========================================== - -FASTA is a simple file format for protein sequence databases. Please refer to -`the NCBI website <http://www.ncbi.nlm.nih.gov/blast/fasta.shtml>`_ -for the most detailed information on the format. - -Data manipulation ------------------ - -Classes -....... - -Several classes of FASTA parsers are available. All of them have common features: - - - context manager support; - - - header parsing; - - - direct iteration. - -Available classes: - - :py:class:`FASTABase` - common ancestor, suitable for type checking. - Abstract class. - - :py:class:`FASTA` - text-mode, sequential parser. - Good for iteration over database entries. - - :py:class:`IndexedFASTA` - binary-mode, indexing parser. - Supports direct indexing by header string. - - :py:class:`TwoLayerIndexedFASTA` - additionally supports - indexing by extracted header fields. - - :py:class:`UniProt` and :py:class:`IndexedUniProt`, - :py:class:`UniParc` and :py:class:`IndexedUniParc`, - :py:class:`UniMes` and :py:class:`IndexedUniMes`, - :py:class:`UniRef` and :py:class:`IndexedUniRef`, - :py:class:`SPD` and :py:class:`IndexedSPD`, - :py:class:`NCBI` and :py:class:`IndexedNCBI`, - :py:class:`RefSeq` and :py:class:`IndexedRefSeq`, - format-specific parsers. - -Functions -......... - - :py:func:`read` - returns an instance of the appropriate reader class, - for sequential iteration or random access. - - :py:func:`chain` - read multiple files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - - :py:func:`write` - write entries to a FASTA database. - - :py:func:`parse` - parse a FASTA header. - -Decoy sequence generation -------------------------- - -:py:func:`decoy_sequence` - generate a decoy sequence from a given sequence, using -one of the other functions listed in this section or any other callable. - -:py:func:`reverse` - generate a reversed decoy sequence. - -:py:func:`shuffle` - generate a shuffled decoy sequence. - -:py:func:`fused_decoy` - generate a "fused" decoy sequence. - - -Decoy database generation -------------------------- - - :py:func:`write_decoy_db` - generate a decoy database and write it to a file. - - :py:func:`decoy_db` - generate entries for a decoy database from a given FASTA - database. - - :py:func:`decoy_entries` - generate decoy entries for an iterator. - - :py:func:`decoy_chain` - a version of :py:func:`decoy_db` for multiple files. - - :py:func:`decoy_chain.from_iterable` - like :py:func:`decoy_chain`, but with - an iterable of files. - -Auxiliary ---------- - - :py:data:`std_parsers` - a dictionary with parsers for known FASTA header - formats. - -------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import itertools
-import random
-from collections import namedtuple
-import re
-import abc
-from . import auxiliary as aux
-from .auxiliary.utils import add_metaclass
-
-
-Protein = namedtuple('Protein', ('description', 'sequence'))
-DECOY_PREFIX = 'DECOY_'
-RAW_HEADER_KEY = '__raw__'
-
-
-def _add_raw_field(parser):
-    """
-    Add :py:const:`RAW_HEADER_KEY` field to the parsed dictionary.
-
-    Parameters
-    ----------
-    parser : func
-        parser function.
-
-    Returns
-    -------
-    None.
-
-    """
-    def _new_parser(instance, descr):
-        parsed = parser(instance, descr)
-        if RAW_HEADER_KEY not in parsed:
-            parsed[RAW_HEADER_KEY] = descr
-        elif parsed[RAW_HEADER_KEY] != descr:
-            raise aux.PyteomicsError('Cannot save raw protein header, since the corresponding '
-                                     'key ({}) already exists.'.format(RAW_HEADER_KEY))
-        return parsed
-
-    return _new_parser
-
-
-class FASTABase(object):
-    """Abstract base class for FASTA file parsers.
-    Can be used for type checking.
-    """
-    parser = None
-    _ignore_comments = False
-    _comments = set('>;')
-
-    def __init__(self, source, **kwargs):
-        self._ignore_comments = kwargs.pop('ignore_comments', False)
-        parser = kwargs.pop('parser', None)
-        if parser is not None:
-            self.parser = parser
-        super(FASTABase, self).__init__(source, **kwargs)
-
-    def _is_comment(self, line):
-        return line[0] in self._comments
-
-    def get_entry(self, key):
-        raise NotImplementedError
-
-
-class FASTA(FASTABase, aux.FileReader):
-    """Text-mode, sequential FASTA parser.
-    Suitable for iteration over the file to obtain all entries in order.
-    """
-    def __init__(self, source, ignore_comments=False, parser=None, encoding=None):
-        """Create a new FASTA parser object. Supports iteration,
-        yields `(description, sequence)` tuples. Supports `with` syntax.
-
-        Parameters
-        ----------
-
-        source : str or file-like
-            File to read. If file object, it must be opened in *text* mode.
-        ignore_comments : bool, optional
-            If :py:const:`True` then ignore the second and subsequent lines of description.
-            Default is :py:const:`False`, which concatenates multi-line descriptions into
-            a single string.
-        parser : function or None, optional
-            Defines whether the FASTA descriptions should be parsed. If it is a
-            function, that function will be given the description string, and
-            the returned value will be yielded together with the sequence.
-            The :py:data:`std_parsers` dict has parsers for several formats.
-            Hint: specify :py:func:`parse` as the parser to apply automatic
-            format recognition.
-            Default is :py:const:`None`, which means return the header "as is".
-        encoding : str or None, optional
-            File encoding (if it is given by name).
-        """
-        super(FASTA, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={},
-                                    encoding=encoding, ignore_comments=ignore_comments, parser=parser)
-
-    def _read(self):
-        accumulated_strings = []
-
-        # Chain an extra '>' after the file ends so the last entry is flushed.
-        for string in itertools.chain(self._source, '>'):
-            stripped_string = string.strip()
-
-            # Skip empty lines.
- if not stripped_string: - continue - - is_comment = self._is_comment(stripped_string) - if is_comment: - # If it is a continuing comment - if len(accumulated_strings) == 1: - if not self._ignore_comments: - accumulated_strings[0] += (' ' + stripped_string[1:]) - else: - continue - - elif accumulated_strings: - description = accumulated_strings[0] - sequence = ''.join(accumulated_strings[1:]) - - # Drop the translation stop sign. - if sequence and sequence[-1] == '*': - sequence = sequence[:-1] - if self.parser is not None: - description = self.parser(description) - yield Protein(description, sequence) - accumulated_strings = [stripped_string[1:]] - else: - # accumulated_strings is empty; we're probably reading - # the very first line of the file - accumulated_strings.append(stripped_string[1:]) - else: - accumulated_strings.append(stripped_string) - - def get_entry(self, key): - raise aux.PyteomicsError('Direct indexing is not supported. ' - 'Use IndexedFASTA and its subclasses') - - -def _reconstruct(cls, args, kwargs): - kwargs['_skip_index'] = True - return cls(*args, **kwargs) - - -class IndexedFASTA(FASTABase, aux.TaskMappingMixin, aux.IndexedTextReader): - """Indexed FASTA parser. Supports direct indexing by matched labels.""" - delimiter = '\n>' - label = r'^[\n]?>(.*)\s*' - - def __init__(self, source, ignore_comments=False, parser=None, **kwargs): - """Create an indexed FASTA parser object. - - Parameters - ---------- - source : str or file-like - File to read. If file object, it must be opened in *binary* mode. - ignore_comments : bool, optional - If :py:const:`True` then ignore the second and subsequent lines of description. - Default is :py:const:`False`, which concatenates multi-line descriptions into - a single string. - parser : function or None, optional - Defines whether the FASTA descriptions should be parsed. If it is a - function, that function will be given the description string, and - the returned value will be yielded together with the sequence. - The :py:data:`std_parsers` dict has parsers for several formats. - Hint: specify :py:func:`parse` as the parser to apply automatic - format recognition. - Default is :py:const:`None`, which means return the header "as is". - encoding : str or None, optional, keyword only - File encoding. Default is UTF-8. - block_size : int or None, optional, keyword only - Number of bytes to consume at once. - delimiter : str or None, optional, keyword only - Overrides the FASTA record delimiter (default is ``'\\n>'``). - label : str or None, optional, keyword only - Overrides the FASTA record label pattern. Default is ``'^[\\n]?>(.*)'``. - label_group : int or str, optional, keyword only - Overrides the matched group used as key in the byte offset index. - This in combination with `label` can be used to extract fields from headers. - However, consider using :py:class:`TwoLayerIndexedFASTA` for this purpose. 
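-
-        A usage sketch (``db.fasta`` is a placeholder path; when a file name
-        is given, the file is opened for indexing internally)::
-
-            with IndexedFASTA('db.fasta') as db:
-                entry = db.get_entry('full header string')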
- """ - super(IndexedFASTA, self).__init__(source, ignore_comments=ignore_comments, parser=parser, - parser_func=self._read, pass_file=False, args=(), kwargs={}, **kwargs) - self._init_args = (source, ignore_comments, parser) - self._init_kwargs = kwargs - - def __reduce_ex__(self, protocol): - return (_reconstruct, - (self.__class__, self._init_args, self._init_kwargs), - self.__getstate__()) - - def _read_protein_lines(self, lines): - description = [] - sequence = [] - - for string in lines: - stripped_string = string.strip() - if not stripped_string: - continue - - is_comment = self._is_comment(stripped_string) - if is_comment: - if not description or not self._ignore_comments: - description.append(stripped_string[1:]) - else: - sequence.append(stripped_string) - - description = ' '.join(description) - sequence = ''.join(sequence) - # Drop the translation stop sign. - if sequence and sequence[-1] == '*': - sequence = sequence[:-1] - if self.parser is not None: - description = self.parser(description) - return Protein(description, sequence) - - def _item_from_offsets(self, offsets): - start, end = offsets - lines = self._read_lines_from_offsets(start, end) - return self._read_protein_lines(lines) - - def _read(self, **kwargs): - for key, offsets in self._offset_index.items(): - yield self._item_from_offsets(offsets) - - def get_entry(self, key): - return self.get_by_id(key) - - -class TwoLayerIndexedFASTA(IndexedFASTA): - """Parser with two-layer index. Extracted groups are mapped to full headers (where possible), - full headers are mapped to byte offsets. - - When indexed, the key is looked up in both indexes, allowing access by meaningful IDs - (like UniProt accession) and by full header string. - """ - header_group = 1 - header_pattern = None - def __init__(self, source, header_pattern=None, header_group=None, - ignore_comments=False, parser=None, **kwargs): - """Open `source` and create a two-layer index for convenient random access - both by full header strings and extracted fields. - - Parameters - ---------- - source : str or file-like - File to read. If file object, it must be opened in *binary* mode. - header_pattern : str or RE or None, optional - Pattern to match the header string. Must capture the group used - for the second index. If :py:const:`None` (default), second-level index is not created. - header_group : int or str or None, optional - Defines which group is used as key in the second-level index. - Default is 1. - ignore_comments : bool, optional - If :py:const:`True` then ignore the second and subsequent lines of description. - Default is :py:const:`False`, which concatenates multi-line descriptions into - a single string. - parser : function or None, optional - Defines whether the FASTA descriptions should be parsed. If it is a - function, that function will be given the description string, and - the returned value will be yielded together with the sequence. - The :py:data:`std_parsers` dict has parsers for several formats. - Hint: specify :py:func:`parse` as the parser to apply automatic - format recognition. - Default is :py:const:`None`, which means return the header "as is". - - Other arguments : the same as for :py:class:`IndexedFASTA`. 
- """ - super(TwoLayerIndexedFASTA, self).__init__(source, ignore_comments, parser, **kwargs) - if header_group is not None: - self.header_group = header_group - if header_pattern is not None: - self.header_pattern = header_pattern - if not kwargs.get('_skip_index', False): - self.build_second_index() - self._init_args = (source, header_pattern, header_group, ignore_comments, parser) - self._init_kwargs = kwargs - - def build_second_index(self): - """Create the mapping from extracted field to whole header string.""" - if self.header_pattern is None: - self._id2header = None - else: - index = {} - for key in self._offset_index: - match = re.match(self.header_pattern, key) - if match: - index[match.group(self.header_group)] = key - self._id2header = index - - def __getstate__(self): - state = super(TwoLayerIndexedFASTA, self).__getstate__() - state['id2header'] = self._id2header - return state - - def __setstate__(self, state): - super(TwoLayerIndexedFASTA, self).__setstate__(state) - self._id2header = state['id2header'] - - def get_by_id(self, key): - """Get the entry by value of header string or extracted field.""" - try: - return super(TwoLayerIndexedFASTA, self).get_by_id(key) - except KeyError: - if self._id2header: - header = self._id2header.get(key) - if header is not None: - return super(TwoLayerIndexedFASTA, self).get_entry(header) - raise KeyError(key) - - def get_header(self, key): - if key in self._id2header: - return self._id2header[key] - raise KeyError(key) - - def __contains__(self, key): - return super(TwoLayerIndexedFASTA, self).__contains__(key) or key in self._id2header - - -class _FastaParserFlavorMeta(abc.ABCMeta): - def __new__(mcs, name, bases, namespace): - if "parser" in namespace: - namespace["parser"] = _add_raw_field(namespace["parser"]) - if name != 'FlavoredMixin': - reader_type = None - for t in (FASTA, IndexedFASTA, TwoLayerIndexedFASTA): - if t in bases: - reader_type = t - - if reader_type is not None: - # this is a "concrete" reader class - # add a unified __init__ method for it - for c in bases: - if issubclass(c, FlavoredMixin): - flavor = c - break - else: - raise aux.PyteomicsError('Could not detect flavor of {}, not a subclass of `FlavoredMixin`.') - - def __init__(self, source, parse=True, **kwargs): - reader_type.__init__(self, source, **kwargs) - flavor.__init__(self, parse) - self._init_args = (source, parse) - self._init_kwargs = kwargs - - flavor_name = name[:-5] - type_name = "Text-mode" if reader_type is FASTA else "Indexed" - __init__.__doc__ = """Creates a :py:class:`{}` object. - - Parameters - ---------- - source : str or file - The file to read. If a file object, it needs to be in *{}* mode. - parse : bool, optional - Defines whether the descriptions should be parsed in the produced tuples. - Default is :py:const:`True`. - kwargs : passed to the :py:class:`{}` constructor. - """.format(name, 'text' if reader_type is FASTA else 'binary', reader_type.__name__) - namespace['__init__'] = __init__ - namespace['__doc__'] = """{} parser for {} FASTA files.""".format(type_name, flavor_name) - - return super(_FastaParserFlavorMeta, mcs).__new__(mcs, name, bases, namespace) - - -@add_metaclass(_FastaParserFlavorMeta) -class FlavoredMixin(): - """Parser aimed at a specific FASTA flavor. - Subclasses should define `parser` and `header_pattern`. - The `parse` argument in :py:meth:`__init__` defines whether description is - parsed in output. 
- """ - def __init__(self, parse=True): - if not parse: - self.parser = None - - -class UniProtMixin(FlavoredMixin): - header_pattern = r'^(?P<db>\w+)\|(?P<id>[-\w]+)\|(?P<entry>\w+)\s+(?P<name>.*?)(?:(\s+OS=(?P<OS>[^=]+))|(\s+OX=(?P<OX>\d+))|(\s+GN=(?P<GN>\S+))|(\s+PE=(?P<PE>\d))|(\s+SV=(?P<SV>\d+)))*\s*$' - header_group = 'id' - - def parser(self, header): - info = re.match(self.header_pattern, header).groupdict() - for key in ['OS', 'OX', 'GN', 'PE', 'SV']: - if info[key] is None: - del info[key] - info['gene_id'], info['taxon'] = info['entry'].split('_') - _intify(info, ('PE', 'SV', 'OX')) - return info - - -class UniProt(UniProtMixin, FASTA): - pass - - -class IndexedUniProt(UniProtMixin, TwoLayerIndexedFASTA): - pass - - -class UniRefMixin(FlavoredMixin): - header_pattern = r'^(?P<id>\S+)\s+(?P<cluster>.*?)(?:(\s+n=(?P<n>\d+))|(\s+Tax=(?P<Tax>.+?))|(\s+TaxID=(?P<TaxID>\S+))|(\s+RepID=(?P<RepID>\S+)))*\s*$' - header_group = 'id' - - def parser(self, header): - assert 'Tax' in header - info = re.match(self.header_pattern, header).groupdict() - for key in ['TaxID', 'Tax', 'RepID', 'n']: - if info[key] is None: - del info[key] - _intify(info, ('n',)) - return info - - -class UniRef(UniRefMixin, FASTA): - pass - - -class IndexedUniRef(UniRefMixin, TwoLayerIndexedFASTA): - pass - - -class UniParcMixin(FlavoredMixin): - header_pattern = r'(\S+)\s+status=(\w+)\s*$' - - def parser(self, header): - ID, status = re.match(self.header_pattern, header).groups() - return {'id': ID, 'status': status} - - -class UniParc(UniParcMixin, FASTA): - pass - - -class IndexedUniParc(UniParcMixin, TwoLayerIndexedFASTA): - pass - - -class UniMesMixin(FlavoredMixin): - header_pattern = r'^(\S+)\s+([^=]*\S)((\s+\w+=[^=]+(?!\w*=))+)\s*$' - - def parser(self, header): - assert 'OS=' in header and 'SV=' in header and 'PE=' not in header - ID, name, pairs, _ = re.match(self.header_pattern, header).groups() - info = {'id': ID, 'name': name} - info.update(_split_pairs(pairs)) - _intify(info, ('SV',)) - return info - - -class UniMes(UniMesMixin, FASTA): - pass - - -class IndexedUniMes(UniMesMixin, TwoLayerIndexedFASTA): - pass - - -class SPDMixin(FlavoredMixin): - header_pattern = r'^([^|]+?)\s*\|\s*(([^|]+?)_([^|]+?))\s*\|\s*([^|]+?)\s*$' - - def parser(self, header): - assert '=' not in header - ID, gene, gid, taxon, d = re.match(self.header_pattern, header).groups() - return {'id': ID, 'gene': gene, 'description': d, - 'taxon': taxon, 'gene_id': gid} - - -class SPD(SPDMixin, FASTA): - pass - - -class IndexedSPD(SPDMixin, TwoLayerIndexedFASTA): - pass - - -class NCBIMixin(FlavoredMixin): - header_pattern = r'^(\S+)\s+(.*\S)\s+\[(.*)\]' - - def parser(self, header): - ID, description, organism = re.match(self.header_pattern, header).groups() - return {'id': ID, 'description': description, 'taxon': organism} - - -class NCBI(NCBIMixin, FASTA): - pass - - -class IndexedNCBI(NCBIMixin, TwoLayerIndexedFASTA): - pass - - -class RefSeqMixin(FlavoredMixin): - header_pattern = r'^ref\|([^|]+)\|\s*([^\[]*\S)\s*\[(.*)\]' - - def parser(self, header): - ID, description, organism = re.match(self.header_pattern, header).groups() - return {'id': ID, 'description': description, 'taxon': organism} - - -class RefSeq(RefSeqMixin, FASTA): - pass - - -class IndexedRefSeq(RefSeqMixin, TwoLayerIndexedFASTA): - pass - - -def read(source=None, use_index=None, flavor=None, **kwargs): - """Parse a FASTA file. This function serves as a dispatcher between - different parsers available in this module. 
- - Parameters - ---------- - source : str or file or None, optional - A file object (or file name) with a FASTA database. Default is - :py:const:`None`, which means read standard input. - use_index : bool, optional - If :py:const:`True`, the created parser object will be an instance of - :py:class:`IndexedFASTA`. If :py:const:`False` (default), it will be - an instance of :py:class:`FASTA`. - flavor : str or None, optional - A supported FASTA header format. If specified, a format-specific - parser instance is returned. - - .. note:: See :py:data:`std_parsers` for supported flavors. - - Returns - ------- - out : iterator of tuples - A named 2-tuple with FASTA header (str or dict) and sequence (str). - Attributes 'description' and 'sequence' are also provided. - """ - try: - parser = std_parsers[flavor and flavor.lower()] - except KeyError: - raise aux.PyteomicsError('No parser for flavor: {}. Supported flavors: {}'.format( - flavor, ', '.join(map(str, std_parsers)))) - use_index = aux._check_use_index(source, use_index, False) - return parser[use_index](source, **kwargs) - - -@aux._file_writer() -def write(entries, output=None): - """ - Create a FASTA file with `entries`. - - Parameters - ---------- - entries : iterable of (str/dict, str) tuples - An iterable of 2-tuples in the form (description, sequence). - If description is a dictionary, it must have a special key, whose value - will be written as protein description. The special key is defined by the variable - :py:const:`RAW_HEADER_KEY`. - output : file-like or str, optional - A file open for writing or a path to write to. If the file exists, - it will be opened for writing. Default is :py:const:`None`, which - means write to standard output. - - .. note:: - The default mode for output files specified by name has been changed - from `a` to `w` in *pyteomics 4.6*. See `file_mode` to override the mode. - - file_mode : str, keyword only, optional - If `output` is a file name, defines the mode the file will be opened in. - Otherwise will be ignored. Default is `'w'`. - - .. note :: - The default changed from `'a'` in *pyteomics 4.6*. - - Returns - ------- - output_file : file object - The file where the FASTA is written. - """ - for descr, seq in entries: - if isinstance(descr, str): - output.write('>' + descr.replace('\n', '\n;') + '\n') - elif isinstance(descr, dict) and RAW_HEADER_KEY in descr: - output.write('>' + descr[RAW_HEADER_KEY].replace('\n', '\n;') + '\n') - else: - raise aux.PyteomicsError('Cannot use provided description: ' + repr(descr)) - output.write(''.join([('%s\n' % seq[i:i+70]) - for i in range(0, len(seq), 70)]) + '\n') - - return output.file - - -def reverse(sequence, keep_nterm=False, keep_cterm=False): - """ - Create a decoy sequence by reversing the original one. - - Parameters - ---------- - sequence : str - The initial sequence string. - keep_nterm : bool, optional - If :py:const:`True`, then the N-terminal residue will be kept. - Default is :py:const:`False`. - keep_cterm : bool, optional - If :py:const:`True`, then the C-terminal residue will be kept. - Default is :py:const:`False`. - - Returns - ------- - decoy_sequence : str - The decoy sequence. - """ - start = 1 if keep_nterm else 0 - end = len(sequence)-1 if keep_cterm else len(sequence) - if start == end: - return sequence - return sequence[:start] + sequence[start:end][::-1] + sequence[end:] - - -def shuffle(sequence, keep_nterm=False, keep_cterm=False, keep_nterm_M=False, fix_aa=''): - """ - Create a decoy sequence by shuffling the original one. 
-
-    Parameters
-    ----------
-    sequence : str
-        The initial sequence string.
-    keep_nterm : bool, optional
-        If :py:const:`True`, then the N-terminal residue will be kept.
-        Default is :py:const:`False`.
-    keep_cterm : bool, optional
-        If :py:const:`True`, then the C-terminal residue will be kept.
-        Default is :py:const:`False`.
-    keep_nterm_M : bool, optional
-        If :py:const:`True`, then the N-terminal methionine will be kept.
-        Default is :py:const:`False`.
-    fix_aa : iterable, optional
-        Single letter codes for amino acids that should preserve their position
-        during shuffling.
-        Default is ''.
-
-    Returns
-    -------
-    decoy_sequence : str
-        The decoy sequence.
-    """
-
-    # empty sequence
-    if len(sequence) == 0:
-        return ''
-
-    # preserve the first position
-    if (keep_nterm_M and sequence[0] == 'M') or keep_nterm:
-        return sequence[0] + shuffle(sequence[1:], keep_cterm=keep_cterm,
-                                     fix_aa=fix_aa)
-
-    # preserve the last position
-    if keep_cterm:
-        return shuffle(sequence[:-1], fix_aa=fix_aa) + sequence[-1]
-
-
-    if not isinstance(fix_aa, str):
-        fix_aa = ''.join(fix_aa)
-
-    fixed = []
-    position = 0
-    if len(fix_aa) > 0:  # non-empty fixed list
-        shuffled = []
-        for match in re.finditer(r'[{}]'.format(fix_aa), sequence):
-            fixed.append((match.start(), sequence[match.start()]))
-            shuffled.extend(sequence[position:match.start()])
-            position = match.end()
-        shuffled.extend(sequence[position:])
-
-    else:  # shuffle everything
-        shuffled = list(sequence)
-
-    random.shuffle(shuffled)
-
-    for fix in fixed:
-        shuffled.insert(fix[0], fix[1])
-
-    return ''.join(shuffled)
-
-
-def fused_decoy(sequence, decoy_mode='reverse', sep='R', **kwargs):
-    """
-    Create a "fused" decoy sequence by concatenating a decoy sequence with the original one.
-    The method and its use cases are described in:
-
-        Ivanov, M. V., Levitsky, L. I., & Gorshkov, M. V. (2016).
-        `Adaptation of Decoy Fusion Strategy for Existing Multi-Stage Search Workflows.
-        <http://doi.org/10.1007/s13361-016-1436-7>`_
-        Journal of The American Society for Mass Spectrometry, 27(9), 1579-1582.
-
-    Parameters
-    ----------
-    sequence : str
-        The initial sequence string.
-    decoy_mode : str or callable, optional
-        Type of decoy sequence to use. Should be one of the standard modes or any callable.
-        Standard modes are:
-
-        - 'reverse' for :py:func:`reverse`;
-        - 'shuffle' for :py:func:`shuffle`;
-        - 'fused' for :py:func:`fused_decoy` (if you love recursion).
-
-        Default is 'reverse'.
-    sep : str, optional
-        Amino acid motif that separates the decoy sequence from the target one.
-        This setting should reflect the enzyme specificity used in the search against the
-        database being generated. Default is 'R', which is suitable for trypsin searches.
-    **kwargs : given to the decoy generation function.
-
-    Examples
-    --------
-    >>> fused_decoy('PEPT')
-    'TPEPRPEPT'
-    >>> fused_decoy('MPEPT', 'shuffle', 'K', keep_nterm=True)
-    'MPPTEKMPEPT'
-    """
-    decoy = decoy_sequence(sequence, decoy_mode, **kwargs)
-    return decoy + sep + sequence
-
-
-_decoy_functions = {'reverse': reverse, 'shuffle': shuffle, 'fused': fused_decoy}
-
-
-def decoy_sequence(sequence, mode='reverse', **kwargs):
-    """
-    Create a decoy sequence out of a given sequence string.
-
-    Parameters
-    ----------
-    sequence : str
-        The initial sequence string.
-    mode : str or callable, optional
-        Type of decoy sequence. Should be one of the standard modes or any callable.
-        Standard modes are:
-
-        - 'reverse' for :py:func:`reverse`;
-        - 'shuffle' for :py:func:`shuffle`;
-        - 'fused' for :py:func:`fused_decoy`.
-
-        Default is 'reverse'.
-    **kwargs : given to the decoy function.
-
-    Returns
-    -------
-    decoy_sequence : str
-        The decoy sequence.
-    """
-    fmode = mode
-    if isinstance(mode, str):
-        fmode = _decoy_functions.get(mode)
-        if fmode is None:
-            raise aux.PyteomicsError('Unsupported decoy mode: {}'.format(mode))
-    return fmode(sequence, **kwargs)
-
-
-def decoy_entries(entries, mode='reverse', prefix=DECOY_PREFIX, decoy_only=True, **kwargs):
-    """Iterate over protein `entries` (tuples) and produce decoy entries.
-    The `entries` are only iterated once.
-
-    Parameters
-    ----------
-    entries : iterable of tuples
-        Any iterable of (description, sequence) pairs.
-    mode : str or callable, optional
-        Algorithm of decoy sequence generation. 'reverse' by default.
-        See :py:func:`decoy_sequence` for more information.
-    prefix : str, optional
-        A prefix to the protein descriptions of decoy entries. The default
-        value is `'DECOY_'`.
-    decoy_only : bool, optional
-        If set to :py:const:`True`, only the decoy entries will be yielded.
-        If :py:const:`False`, each consumed entry is yielded unchanged,
-        followed by its decoy counterpart.
-        :py:const:`True` by default.
-    **kwargs : given to :py:func:`decoy_sequence`.
-
-    Returns
-    -------
-    out : iterator
-        An iterator over new entries.
-    """
-    for item in entries:
-        if not decoy_only:
-            yield item
-        yield Protein(prefix + item[0], decoy_sequence(item[1], mode, **kwargs))
-
-
-@aux._file_reader()
-def decoy_db(source=None, mode='reverse', prefix=DECOY_PREFIX, decoy_only=False,
-             ignore_comments=False, parser=None, **kwargs):
-    """Iterate over sequences for a decoy database out of a given ``source``.
-
-    Parameters
-    ----------
-    source : file-like object or str or None, optional
-        A path to a FASTA database or a file object itself. Default is
-        :py:const:`None`, which means read standard input.
-    mode : str or callable, optional
-        Algorithm of decoy sequence generation. 'reverse' by default.
-        See :py:func:`decoy_sequence` for more information.
-    prefix : str, optional
-        A prefix to the protein descriptions of decoy entries. The default
-        value is `'DECOY_'`.
-    decoy_only : bool, optional
-        If set to :py:const:`True`, only the decoy entries will be written to
-        `output`. If :py:const:`False`, the entries from `source` will be
-        written first.
-        :py:const:`False` by default.
-    ignore_comments : bool, optional
-        If :py:const:`True` then ignore the second and subsequent lines of description.
-        Default is :py:const:`False`.
-    parser : function or None, optional
-        Defines whether the fasta descriptions should be parsed. If it is a
-        function, that function will be given the description string, and
-        the returned value will be yielded together with the sequence.
-        The :py:data:`std_parsers` dict has parsers for several formats.
-        Hint: specify :py:func:`parse` as the parser to apply automatic
-        format guessing.
-        Default is :py:const:`None`, which means return the header "as is".
-    **kwargs : given to :py:func:`decoy_sequence`.
-
-    Returns
-    -------
-    out : iterator
-        An iterator over entries of the new database.
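-
-    A typical call (sketch; ``db.fasta`` is a placeholder path)::
-
-        with decoy_db('db.fasta', mode='shuffle') as db:
-            for description, sequence in db:
-                ...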
- """ - - # store the initial position - pos = source.tell() - if not decoy_only: - with read(source, ignore_comments, parser) as f: - for x in f: - yield x - # return to the initial position in the source file to read again - source.seek(pos) - - parser = parser or (lambda x: x) - with read(source, ignore_comments) as f: - for descr, seq in f: - yield Protein(parser(prefix + descr), decoy_sequence(seq, mode, **kwargs)) - - -@aux._file_writer() -def write_decoy_db(source=None, output=None, mode='reverse', prefix=DECOY_PREFIX, - decoy_only=False, **kwargs): - """Generate a decoy database out of a given ``source`` and write to file. - - If `output` is a path, the file will be open for appending, so no information - will be lost if the file exists. Although, the user should be careful when - providing open file streams as `source` and `output`. The reading and writing - will start from the current position in the files, which is where the last I/O - operation finished. One can use the :py:func:`file.seek` method to change it. - - Parameters - ---------- - source : file-like object or str or None, optional - A path to a FASTA database or a file object itself. Default is - :py:const:`None`, which means read standard input. - output : file-like object or str, optional - A path to the output database or a file open for writing. - Defaults to :py:const:`None`, the results go to the standard output. - mode : str or callable, optional - Algorithm of decoy sequence generation. 'reverse' by default. - See :py:func:`decoy_sequence` for more details. - prefix : str, optional - A prefix to the protein descriptions of decoy entries. The default - value is `'DECOY_'` - decoy_only : bool, optional - If set to :py:const:`True`, only the decoy entries will be written to - `output`. If :py:const:`False`, the entries from `source` will be - written as well. - :py:const:`False` by default. - file_mode : str, keyword only, optional - If `output` is a file name, defines the mode the file will be opened in. - Otherwise will be ignored. Default is 'a'. - **kwargs : given to :py:func:`decoy_sequence`. - - Returns - ------- - output : file - A (closed) file object for the created file. - """ - with decoy_db(source, mode, prefix, decoy_only, **kwargs) as entries: - write(entries, output) - return output.file - - -# auxiliary functions for parsing of FASTA headers -def _split_pairs(s): - return dict(map(lambda x: x.strip(), x.split('=')) - for x in re.split(r' (?=\w+=)', s.strip())) - - -def _intify(d, keys): - for k in keys: - if k in d: - d[k] = int(d[k]) - - -std_parsers = {'uniprot': (UniProt, IndexedUniProt), 'uniref': (UniRef, IndexedUniRef), - 'uniparc': (UniParc, IndexedUniParc), 'unimes': (UniMes, IndexedUniMes), - 'spd': (SPD, IndexedSPD), 'ncbi': (NCBI, IndexedNCBI), - 'refseq': (RefSeq, IndexedRefSeq), - None: (FASTA, IndexedFASTA)} -"""A dictionary with parsers for known FASTA header formats. For now, supported -formats are those described at -`UniProt help page <http://www.uniprot.org/help/fasta-headers>`_.""" - - -_std_mixins = {'uniprot': UniProtMixin, 'uniref': UniRefMixin, - 'uniparc': UniParcMixin, 'unimes': UniMesMixin, 'spd': SPDMixin, - 'ncbi': NCBIMixin, 'refseq': RefSeqMixin} - - -def parse(header, flavor='auto', parsers=None): - """Parse the FASTA header and return a nice dictionary. - - Parameters - ---------- - - header : str - FASTA header to parse - flavor : str, optional - Short name of the header format (case-insensitive). 
Valid values are - :py:const:`'auto'` and keys of the `parsers` dict. Default is - :py:const:`'auto'`, which means try all formats in turn and return the - first result that can be obtained without an exception. - parsers : dict, optional - A dict where keys are format names (lowercased) and values are functions - that take a header string and return the parsed header. - - Returns - ------- - - out : dict - A dictionary with the info from the header. The format depends on the - flavor. - """ - parser_function = lambda cls: cls().parser - flavor = flavor.lower() - # accept strings with and without leading '>' - if header and header[0] == '>': - header = header[1:] - - # choose the format - known = parsers or _std_mixins - - if flavor == 'auto': - for parser in known.values(): - try: - return parser_function(parser)(header) - except Exception: - pass - raise aux.PyteomicsError('Unknown FASTA header format: ' + header) - elif flavor in known: - try: - return parser_function(known[flavor])(header) - except Exception as e: - raise aux.PyteomicsError('Could not parse header as "{}". ' - 'The error message was: {}: {}. Header: "{}"'.format( - flavor, type(e).__name__, e.args[0], header)) - raise aux.PyteomicsError('Unknown flavor: {}'.format(flavor)) - - -chain = aux._make_chain(read, 'read') -decoy_chain = aux._make_chain(decoy_db, 'decoy_db') diff --git a/pyteomics/mass/__init__.py b/pyteomics/mass/__init__.py deleted file mode 100644 index a5b9981ebfaaff792ed64ea1e10581d062d76f07..0000000000000000000000000000000000000000 --- a/pyteomics/mass/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .mass import * -try: - from . import unimod -except ImportError: - # SQLAlchemy is not available - pass \ No newline at end of file diff --git a/pyteomics/mass/mass.py b/pyteomics/mass/mass.py deleted file mode 100644 index 03afee248adc371397477d41e753619e2f3db8ac..0000000000000000000000000000000000000000 --- a/pyteomics/mass/mass.py +++ /dev/null @@ -1,1231 +0,0 @@ -""" -mass - molecular masses and isotope distributions -================================================= - -Summary -------- - -This module defines general functions for mass and isotope abundance -calculations. For most of the functions, the user can define a given -substance in various formats, but all of them would be reduced to the -:py:func:`Composition <Composition.__init__>` object describing its -chemical composition. - - -Classes -------- - - :py:func:`Composition <Composition.__init__>` - a class storing chemical - composition of a substance. - - :py:class:`Unimod` - a class representing a Python interface to the - `Unimod database <http://unimod.org/>`_ - (see :py:mod:`pyteomics.mass.unimod` for a much more powerful alternative). - -Mass calculations ------------------ - - :py:func:`calculate_mass` - a general routine for mass / m/z - calculation. Can calculate mass for a polypeptide sequence, chemical - formula or elemental composition. Supplied with an ion type and - charge, the function would calculate m/z. - - :py:func:`fast_mass` - a less powerful but much faster function for - polypeptide mass calculation. - - :py:func:`fast_mass2` - a version of `fast_mass` that supports *modX* notation. - -Isotopic abundances -------------------- - - :py:func:`isotopic_composition_abundance` - calculate the relative - abundance of a given isotopic composition. 
-
-  :py:func:`most_probable_isotopic_composition` - finds the most
-  abundant isotopic composition for a molecule defined by a
-  polypeptide sequence, chemical formula or elemental composition.
-
-  :py:func:`isotopologues` - iterate over possible isotopic compositions of a molecule,
-  possibly filtered by abundance.
-
-Data
-----
-
-  :py:data:`nist_mass` - a dict with exact masses of the most abundant
-  isotopes.
-
-  :py:data:`std_aa_comp` - a dict with the elemental compositions
-  of the standard twenty amino acid residues, selenocysteine and pyrrolysine.
-
-  :py:data:`std_ion_comp` - a dict with the relative elemental
-  compositions of the standard peptide fragment ions.
-
-  :py:data:`std_aa_mass` - a dict with the monoisotopic masses
-  of the standard twenty amino acid residues, selenocysteine and pyrrolysine.
-
------------------------------------------------------------------------------
-"""
-
-# Copyright 2012 Anton Goloborodko, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import division
-import math
-from .. import parser
-from ..auxiliary import PyteomicsError, _nist_mass, BasicComposition
-from itertools import chain, product, combinations_with_replacement
-from collections import defaultdict
-try:
-    from urllib import urlopen
-except ImportError:
-    from urllib.request import urlopen
-from datetime import datetime
-import re
-import operator
-import warnings
-
-nist_mass = _nist_mass
-"""
-A dict with the exact element masses downloaded from the NIST website:
-http://www.nist.gov/pml/data/comp.cfm . There are entries for each
-element containing the masses and relative abundances of several
-abundant isotopes and a separate entry for undefined isotope with zero
-key, mass of the most abundant isotope and 1.0 abundance.
-"""
-
-PROTON = 'H+'
-
-
-def _make_isotope_string(element_name, isotope_num):
-    """Form a string label for an isotope."""
-    if isotope_num == 0:
-        return element_name
-    else:
-        return '{}[{}]'.format(element_name, isotope_num)
-
-
-def _parse_isotope_string(label):
-    """Parse a string with an isotope label and return the element name and
-    the isotope number.
-
-    >>> _parse_isotope_string('C')
-    ('C', 0)
-    >>> _parse_isotope_string('C[12]')
-    ('C', 12)
-    """
-    element_name, num = re.match(_isotope_string, label).groups()
-    isotope_num = int(num) if num else 0
-    return element_name, isotope_num
-
-
-# Initialize std_aa_comp and std_ion_comp before the Composition class
-# description, fill it later.
-std_aa_comp = {}
-"""A dictionary with elemental compositions of the twenty standard
-amino acid residues, selenocysteine, pyrrolysine,
-and standard H- and -OH terminal groups.
-"""
-
-std_ion_comp = {}
-"""A dict with relative elemental compositions of the standard peptide
-fragment ions. An elemental composition of a fragment ion is calculated as a
-difference between the total elemental composition of an ion
-and the sum of elemental compositions of its constituting amino acid residues.
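-For example, the entry for the 'b' ion below is ``Composition(formula='H-2O-1')``,
-i.e. a b ion is one water short of the neutral peptide built from the same residues.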
-""" - -_isotope_string = r'^([A-Z][a-z+]*)(?:\[(\d+)\])?$' -_atom = r'([A-Z][a-z+]*)(?:\[(\d+)\])?([+-]?\d+)?' -_formula = r'^({})*$'.format(_atom) - - -class Composition(BasicComposition): - """ - A Composition object stores a chemical composition of a - substance. Basically, it is a dict object, with the names - of chemical elements as keys and values equal to an integer number of - atoms of the corresponding element in a substance. - - The main improvement over dict is that Composition objects allow - adding and subtraction. - """ - _kw_sources = {'formula', 'sequence', 'parsed_sequence', 'split_sequence', 'composition'} - _carrier_spec = r"^(?P<formula>\S+?)(?:(?P<sign>[+-])(?P<charge>\d+)?)?$" - - def _from_parsed_sequence(self, parsed_sequence, aa_comp): - self.clear() - comp = defaultdict(int) - for label in parsed_sequence: - if label in aa_comp: - for elem, cnt in aa_comp[label].items(): - comp[elem] += cnt - else: - try: - mod, aa = parser._split_label(label) - for elem, cnt in chain( - aa_comp[mod].items(), aa_comp[aa].items()): - comp[elem] += cnt - - except (PyteomicsError, KeyError): - raise PyteomicsError('No information for %s in `aa_comp`' % label) - self._from_composition(comp) - - def _from_split_sequence(self, split_sequence, aa_comp): - self.clear() - comp = defaultdict(int) - for group in split_sequence: - i = 0 - while i < len(group): - for j in range(len(group) + 1, -1, -1): - try: - label = ''.join(group[i:j]) - for elem, cnt in aa_comp[label].items(): - comp[elem] += cnt - except KeyError: - continue - else: - i = j - break - if j == 0: - raise PyteomicsError("Invalid group starting from position %d: %s" % (i + 1, group)) - self._from_composition(comp) - - def _from_sequence(self, sequence, aa_comp): - parsed_sequence = parser.parse( - sequence, - labels=aa_comp, - show_unmodified_termini=True) - self._from_parsed_sequence(parsed_sequence, aa_comp) - - def _from_formula(self, formula): - if not re.match(_formula, formula): - raise PyteomicsError('Invalid formula: ' + formula) - for elem, isotope, number in re.findall(_atom, formula): - self[_make_isotope_string(elem, int(isotope) if isotope else 0)] += int(number) if number else 1 - - def _from_composition(self, comp): - for isotope_string, num_atoms in comp.items(): - element_name, isotope_num = _parse_isotope_string( - isotope_string) - - # Remove explicitly undefined isotopes (e.g. X[0]). - self[_make_isotope_string(element_name, isotope_num)] = num_atoms - - def __init__(self, *args, **kwargs): - """ - A Composition object stores a chemical composition of a - substance. Basically it is a dict object, in which keys are the names - of chemical elements and values contain integer numbers of - corresponding atoms in a substance. - - The main improvement over dict is that Composition objects allow - addition and subtraction. - - A Composition object can be initialized with one of the - following arguments: formula, sequence, parsed_sequence or - split_sequence. - - If none of these are specified, the constructor will look at the first - positional argument and try to build the object from it. Without - positional arguments, a Composition will be constructed directly from - keyword arguments. - - If there's an ambiguity, i.e. the argument is both a valid sequence - and a formula (such as 'HCN'), it will be treated as a sequence. You - need to provide the 'formula' keyword to override this. - - .. warning:: - - Be careful when supplying a list with a parsed sequence or a split - sequence as a keyword argument. 
-        It must be obtained with the `show_unmodified_termini` option enabled.
-        When supplying it as a positional argument, the option doesn't
-        matter, because the positional argument is always converted to
-        a sequence prior to any processing.
-
-        Parameters
-        ----------
-        formula : str, optional
-            A string with a chemical formula. All elements must be present in
-            `mass_data`.
-        sequence : str, optional
-            A polypeptide sequence string in modX notation.
-        parsed_sequence : list of str, optional
-            A polypeptide sequence parsed into a list of amino acids.
-        split_sequence : list of tuples of str, optional
-            A polypeptide sequence parsed into a list of tuples
-            (as returned by :py:func:`pyteomics.parser.parse` with
-            ``split=True``).
-        aa_comp : dict, optional
-            A dict with the elemental composition of the amino acids (the
-            default value is std_aa_comp).
-        ion_comp : dict, optional
-            A dict with the relative elemental compositions of peptide ion
-            fragments (default is :py:data:`std_ion_comp`).
-        ion_type : str, optional
-            If specified, then the polypeptide is considered to be in the form
-            of the corresponding ion.
-        """
-        defaultdict.__init__(self, int)
-
-        aa_comp = kwargs.get('aa_comp', std_aa_comp)
-
-        kw_given = self._kw_sources.intersection(kwargs)
-        if len(kw_given) > 1:
-            raise PyteomicsError('Only one of {} can be specified!\n'
-                                 'Given: {}'.format(', '.join(self._kw_sources),
-                                                    ', '.join(kw_given)))
-        elif kw_given:
-            kwa = kw_given.pop()
-            if kwa == 'formula':
-                self._from_formula(kwargs['formula'])
-            else:
-                getattr(self, '_from_' + kwa)(kwargs[kwa], aa_comp)
-
-        # can't build from kwargs
-        elif args:
-            if isinstance(args[0], dict):
-                self._from_composition(args[0])
-            elif isinstance(args[0], str):
-                try:
-                    self._from_sequence(args[0], aa_comp)
-                except PyteomicsError:
-                    try:
-                        self._from_formula(args[0])
-                    except PyteomicsError:
-                        raise PyteomicsError(
-                            'Could not create a Composition object from '
-                            'string: "{}": not a valid sequence or '
-                            'formula'.format(args[0]))
-            else:
-                try:
-                    self._from_sequence(parser.tostring(args[0], True), aa_comp)
-                except Exception:
-                    raise PyteomicsError('Could not create a Composition object'
-                                         ' from `{}`. A Composition object must be '
-                                         'specified by sequence, parsed or split sequence,'
-                                         ' formula or dict.'.format(args[0]))
-        else:
-            self._from_composition(kwargs)
-
-        ion_comp = kwargs.get('ion_comp', std_ion_comp)
-        if 'ion_type' in kwargs:
-            self += ion_comp[kwargs['ion_type']]
-
-        # Handle charge: the `charge` keyword is deprecated here; protons ('H+')
-        # should be part of the composition itself, or charge passed to mass().
-        charge = self['H+']
-        if 'charge' in kwargs:
-            if charge:
-                raise PyteomicsError('Charge is specified both by the number of protons and `charge` in kwargs')
-            else:
-                warnings.warn('charge and charge carrier should be specified when calling mass(). '
-                              'Support for charge in Composition.__init__ will be removed in a future version.',
-                              FutureWarning)
-                self['H+'] = kwargs['charge']
-
-    @classmethod
-    def _parse_carrier(cls, spec):
-        """Parse a charge carrier spec.
-        The spec syntax is: <formula>[+-][N]
-        <formula> is a chemical formula as supported by :py:meth:`_from_formula`.
-        [+-] is one of "+" or "-", N is a natural number (1 is assumed if omitted).
-        If both the sign and the charge are missing, the charge of this group can be
-        specified as the number of protons in `<formula>`. Otherwise, having protons
-        in `<formula>` is an error.
-
-        Returns
-        -------
-        out : tuple
-            Parsed :py:class:`Composition` and charge of the charge carrier.
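-
-        Examples
-        --------
-        Illustrative sketch (follows directly from the spec syntax above):
-
-        >>> comp, z = Composition._parse_carrier('Fe+2')
-        >>> z
-        2
-        >>> comp, z = Composition._parse_carrier('NH3+')
-        >>> z
-        1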
- """ - if spec is None: - return cls({PROTON: 1}), 1 - try: - formula, sign, charge = re.match(cls._carrier_spec, spec).groups() - except AttributeError: - raise PyteomicsError('Invalid charge carrier specification: ' + spec) - comp = cls(formula=formula) - if sign is not None and PROTON in comp: - raise PyteomicsError('Carrier contains protons and also has a charge specified.') - if sign is None: - # only formula is given - if PROTON not in comp: - charge = None - charge = comp[PROTON] - elif charge is None: - charge = (-1, 1)[sign == '+'] - else: - charge = int(charge) * (-1, 1)[sign == '+'] - return comp, charge - - @staticmethod - def _mass_to_mz(mass, composition=None, **kwargs): - mass_data = kwargs.get('mass_data', nist_mass) - absolute = kwargs.get('absolute', True) - average = kwargs.get('average', False) - - # Calculate m/z if required - charge = kwargs.get('charge') - if charge: - # get charge carrier mass and charge - charge_carrier = kwargs.get('charge_carrier') - ccharge = kwargs.get('carrier_charge') - if isinstance(charge_carrier, dict): - carrier_comp = Composition(charge_carrier) - if ccharge and PROTON in carrier_comp: - raise PyteomicsError('`carrier_charge` specified but the charge carrier contains protons.') - carrier_charge = ccharge or carrier_comp[PROTON] - if not carrier_charge: - raise PyteomicsError('Charge carrier charge not specified.') - else: - carrier_comp, carrier_charge = (composition or Composition)._parse_carrier(charge_carrier) - if carrier_charge and ccharge: - raise PyteomicsError('Both `carrier_charge` and charge in carrier spec are given.') - carrier_charge = ccharge or carrier_charge - if not carrier_charge: - raise PyteomicsError('Charge of the charge carrier group not specified.') - if charge % carrier_charge: - raise PyteomicsError('The `charge` must be a multiple of the carrier charge. Given: {} and {}'.format( - charge, carrier_charge)) - num = charge // carrier_charge - carrier_mass = carrier_comp.mass(mass_data=mass_data, average=average, charge=0) - - if charge and (composition is None or not composition['H+']): - mass += carrier_mass * num - if charge and composition and composition['H+']: - raise PyteomicsError('Composition contains protons and charge is explicitly specified.') - if charge is None and composition and composition['H+']: - warnings.warn('Charge is not specified, but the Composition contains protons. Assuming m/z calculation.') - charge = composition['H+'] - if charge: - mass /= charge - if charge and charge < 0 and absolute: - mass = abs(mass) - return mass - - def mass(self, **kwargs): - """Calculate the mass or *m/z* of a :py:class:`Composition`. - - Parameters - ---------- - average : bool, optional - If :py:const:`True` then the average mass is calculated. - Note that mass is not averaged for elements with specified isotopes. - Default is :py:const:`False`. - charge : int, optional - If not 0 then m/z is calculated. See also: `charge_carrier`. - charge_carrier : str or dict, optional - Chemical group carrying the charge. Defaults to a proton, "H+". - If string, must be a chemical formula, as supported by the - :class:`Composition` `formula` argument, - except it must end with a charge formatted as "[+-][N]". - If N is omitted, single charge is assumed. - Examples of `charge_carrier`: "H+", "NH3+" - (here, 3 is part of the composition, and + is a single charge), - "Fe+2" ("Fe" is the formula and "+2" is the charge). - .. note :: `charge` must be a multiple of `charge_carrier` charge. 
-
-            If dict, it is the atomic composition of the group.
-            In this case, the charge can be passed separately as `carrier_charge`
-            or it will be deduced from the number of protons in `charge_carrier`.
-        carrier_charge : int, optional
-            Charge of the charge carrier group (if `charge_carrier` is specified
-            as a composition dict).
-
-            .. note :: `charge` must be a multiple of `carrier_charge`.
-
-        mass_data : dict, optional
-            A dict with the masses of the chemical elements (the default
-            value is :py:data:`nist_mass`).
-        ion_comp : dict, optional
-            A dict with the relative elemental compositions of peptide ion
-            fragments (default is :py:data:`std_ion_comp`).
-        ion_type : str, optional
-            If specified, then the polypeptide is considered to be in the form
-            of the corresponding ion. Do not forget to specify the charge state!
-        absolute : bool, optional
-            If :py:const:`True` (default), the m/z value returned will always be positive,
-            even for negatively charged ions.
-
-            .. note ::
-                `absolute` only applies when `charge` is negative.
-                The mass can still be negative for negative compositions.
-
-        Returns
-        -------
-        mass : float
-        """
-        composition = self
-        mass_data = kwargs.get('mass_data', nist_mass)
-
-        # Calculate mass
-        mass = 0.0
-        average = kwargs.get('average', False)
-
-        for isotope_string, amount in composition.items():
-            element_name, isotope_num = _parse_isotope_string(isotope_string)
-            # Calculate average mass if required and the isotope number is
-            # not specified.
-            if (not isotope_num) and average:
-                for isotope, data in mass_data[element_name].items():
-                    if isotope:
-                        mass += (amount * data[0] * data[1])
-            else:
-                mass += (amount * mass_data[element_name][isotope_num][0])
-
-        return self._mass_to_mz(mass, self, **kwargs)
-
-
-std_aa_comp.update({
-    'A': Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1}),
-    'C': Composition({'H': 5, 'C': 3, 'S': 1, 'O': 1, 'N': 1}),
-    'D': Composition({'H': 5, 'C': 4, 'O': 3, 'N': 1}),
-    'E': Composition({'H': 7, 'C': 5, 'O': 3, 'N': 1}),
-    'F': Composition({'H': 9, 'C': 9, 'O': 1, 'N': 1}),
-    'G': Composition({'H': 3, 'C': 2, 'O': 1, 'N': 1}),
-    'H': Composition({'H': 7, 'C': 6, 'N': 3, 'O': 1}),
-    'I': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
-    'J': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
-    'K': Composition({'H': 12, 'C': 6, 'N': 2, 'O': 1}),
-    'L': Composition({'H': 11, 'C': 6, 'O': 1, 'N': 1}),
-    'M': Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1}),
-    'N': Composition({'H': 6, 'C': 4, 'O': 2, 'N': 2}),
-    'P': Composition({'H': 7, 'C': 5, 'O': 1, 'N': 1}),
-    'Q': Composition({'H': 8, 'C': 5, 'O': 2, 'N': 2}),
-    'R': Composition({'H': 12, 'C': 6, 'N': 4, 'O': 1}),
-    'S': Composition({'H': 5, 'C': 3, 'O': 2, 'N': 1}),
-    'T': Composition({'H': 7, 'C': 4, 'O': 2, 'N': 1}),
-    'V': Composition({'H': 9, 'C': 5, 'O': 1, 'N': 1}),
-    'W': Composition({'C': 11, 'H': 10, 'N': 2, 'O': 1}),
-    'Y': Composition({'H': 9, 'C': 9, 'O': 2, 'N': 1}),
-    'U': Composition({'H': 5, 'C': 3, 'O': 1, 'N': 1, 'Se': 1}),
-    'O': Composition({'H': 19, 'C': 12, 'O': 2, 'N': 3}),
-    'H-': Composition({'H': 1}),
-    '-OH': Composition({'O': 1, 'H': 1}),
-})
-
-
-std_ion_comp.update({
-    'M': Composition(formula=''),
-    'M-H2O': Composition(formula='H-2O-1'),
-    'M-NH3': Composition(formula='N-1H-3'),
-    'a': Composition(formula='H-2O-1' + 'C-1O-1'),
-    'a-H2O': Composition(formula='H-2O-1' + 'C-1O-1' + 'H-2O-1'),
-    'a-NH3': Composition(formula='H-2O-1' + 'C-1O-1' + 'N-1H-3'),
-    'b': Composition(formula='H-2O-1'),
-    'b-H2O': Composition(formula='H-2O-1' + 'H-2O-1'),
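-    # Note: the formula strings are concatenated, so e.g.
-    # 'H-2O-1' + 'N-1H-3' is parsed as H-2 O-1 N-1 H-3
-    # (the combined loss of one water and one ammonia).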
-    'b-NH3': Composition(formula='H-2O-1' + 'N-1H-3'),
-    'c': Composition(formula='H-2O-1' + 'NH3'),
-    'c-1': Composition(formula='H-2O-1' + 'NH3' + 'H-1'),
-    'c-dot': Composition(formula='H-2O-1' + 'NH3' + 'H1'),
-    'c+1': Composition(formula='H-2O-1' + 'NH3' + 'H1'),
-    'c+2': Composition(formula='H-2O-1' + 'NH3' + 'H2'),
-    'c-H2O': Composition(formula='H-2O-1' + 'NH3' + 'H-2O-1'),
-    'c-NH3': Composition(formula='H-2O-1'),
-    'x': Composition(formula='H-2O-1' + 'CO2'),
-    'x-H2O': Composition(formula='H-2O-1' + 'CO2' + 'H-2O-1'),
-    'x-NH3': Composition(formula='H-2O-1' + 'CO2' + 'N-1H-3'),
-    'y': Composition(formula=''),
-    'y-H2O': Composition(formula='H-2O-1'),
-    'y-NH3': Composition(formula='N-1H-3'),
-    'z': Composition(formula='H-2O-1' + 'ON-1H-1'),
-    'z-dot': Composition(formula='H-2O-1' + 'ON-1'),
-    'z+1': Composition(formula='H-2O-1' + 'ON-1H1'),
-    'z+2': Composition(formula='H-2O-1' + 'ON-1H2'),
-    'z+3': Composition(formula='H-2O-1' + 'ON-1H3'),
-    'z-H2O': Composition(formula='H-2O-1' + 'ON-1H-1' + 'H-2O-1'),
-    'z-NH3': Composition(formula='H-2O-1' + 'ON-1H-1' + 'N-1H-3'),
-})
-
-
-def calculate_mass(*args, **kwargs):
-    """Calculates the monoisotopic mass of a polypeptide defined by a
-    sequence string, parsed sequence, chemical formula or
-    Composition object.
-
-    One or none of the following keyword arguments is required:
-    **formula**, **sequence**, **parsed_sequence**, **split_sequence**
-    or **composition**.
-    All arguments given are used to create a :py:class:`Composition` object,
-    unless an existing one is passed as a keyword argument.
-
-    Note that if a sequence string is supplied and terminal groups are not
-    explicitly shown, then the mass is calculated for a polypeptide with
-    standard terminal groups (NH2- and -OH).
-
-    .. warning::
-
-        Be careful when supplying a list with a parsed sequence. It must be
-        obtained with the `show_unmodified_termini` option enabled.
-
-    Parameters
-    ----------
-    formula : str, optional
-        A string with a chemical formula.
-    sequence : str, optional
-        A polypeptide sequence string in modX notation.
-    proforma : str, optional
-        A polypeptide sequence string in `ProForma notation <https://www.psidev.info/proforma>`_,
-        or a :py:class:`pyteomics.proforma.ProForma` object.
-    parsed_sequence : list of str, optional
-        A polypeptide sequence parsed into a list of amino acids.
-    composition : Composition, optional
-        A Composition object with the elemental composition of a substance.
-    aa_comp : dict, optional
-        A dict with the elemental composition of the amino acids (the
-        default value is std_aa_comp).
-    average : bool, optional
-        If :py:const:`True` then the average mass is calculated. Note that mass
-        is not averaged for elements with specified isotopes. Default is
-        :py:const:`False`.
-    charge : int, optional
-        If not 0 then m/z is calculated: the mass is increased
-        by the corresponding number of proton masses and divided
-        by `charge`.
-    charge_carrier : str or dict, optional
-        Chemical group carrying the charge. Defaults to a proton, "H+".
-        If string, must be a chemical formula, as supported by the
-        :class:`Composition` `formula` argument,
-        except it must end with a charge formatted as "[+-][N]".
-        If N is omitted, single charge is assumed.
-        Examples of `charge_carrier`: "H+", "NH3+"
-        (here, 3 is part of the composition, and + is a single charge),
-        "Fe+2" ("Fe" is the formula and "+2" is the charge).
-
-        .. note ::
-            `charge` must be a multiple of `charge_carrier` charge.
-
-        If dict, it is the atomic composition of the group.
-        In this case, the charge can be passed separately as `carrier_charge`
-        or it will be deduced from the number of protons in `charge_carrier`.
-    carrier_charge : int, optional
-        Charge of the charge carrier group (if `charge_carrier` is specified
-        as a composition dict).
-
-        .. note ::
-            `charge` must be a multiple of `carrier_charge`.
-
-    mass_data : dict, optional
-        A dict with the masses of the chemical elements (the default
-        value is :py:data:`nist_mass`).
-    ion_comp : dict, optional
-        A dict with the relative elemental compositions of peptide ion
-        fragments (default is :py:data:`std_ion_comp`).
-    ion_type : str, optional
-        If specified, then the polypeptide is considered to be in the form
-        of the corresponding ion. Do not forget to specify the charge state!
-    absolute : bool, optional
-        If :py:const:`True` (default), the m/z value returned will always be positive,
-        even for negatively charged ions.
-
-        .. note ::
-            `absolute` only applies when `charge` is negative.
-            The mass can still be negative for negative compositions.
-
-    Returns
-    -------
-    mass : float
-    """
-    if 'proforma' in kwargs:
-        # do not try to create a composition
-        from .. import proforma
-        proteoform = kwargs.pop('proforma')
-        if isinstance(proteoform, str):
-            proteoform = proforma.ProForma.parse(proteoform)
-        return Composition._mass_to_mz(proteoform.mass, **kwargs)
-
-    # These parameters must be passed to mass(), not __init__
-    mass_kw = {}
-    for k in ['charge', 'charge_carrier', 'carrier_charge', 'absolute']:
-        if k in kwargs:
-            mass_kw[k] = kwargs.pop(k)
-    # Make a copy of `composition` keyword argument.
-    composition = (Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs))
-    kwargs.update(mass_kw)
-    return composition.mass(**kwargs)
-
-
-def most_probable_isotopic_composition(*args, **kwargs):
-    """Calculate the most probable isotopic composition of a peptide
-    molecule/ion defined by a sequence string, parsed sequence,
-    chemical formula or :py:class:`Composition` object.
-
-    Note that if a sequence string without terminal groups is supplied then the
-    isotopic composition is calculated for a polypeptide with standard
-    terminal groups (H- and -OH).
-
-    For each element, only the two most abundant isotopes are considered.
-
-    Parameters
-    ----------
-    formula : str, optional
-        A string with a chemical formula.
-    sequence : str, optional
-        A polypeptide sequence string in modX notation.
-    parsed_sequence : list of str, optional
-        A polypeptide sequence parsed into a list of amino acids.
-    composition : :py:class:`Composition`, optional
-        A :py:class:`Composition` object with the elemental composition of a
-        substance.
-    elements_with_isotopes : list of str
-        A list of elements to be considered in isotopic distribution
-        (by default, every element has an isotopic distribution).
-    aa_comp : dict, optional
-        A dict with the elemental composition of the amino acids (the
-        default value is :py:data:`std_aa_comp`).
-    mass_data : dict, optional
-        A dict with the masses of chemical elements (the default
-        value is :py:data:`nist_mass`).
-    ion_comp : dict, optional
-        A dict with the relative elemental compositions of peptide ion
-        fragments (default is :py:data:`std_ion_comp`).
-
-    Returns
-    -------
-    out: tuple (Composition, float)
-        A tuple with the most probable isotopic composition and its
-        relative abundance.
-    """
-
-    composition = (dict(kwargs['composition']) if 'composition' in kwargs
-                   else Composition(*args, **kwargs))
-
-    # Removing isotopes from the composition.
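-    # (Explicit isotopes such as 'C[13]' are folded back into the plain
-    # element entry, so the search below starts from an element-only composition.)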
-    # Iterate over a copy of the keys: the dict is modified inside the loop.
-    for isotope_string in list(composition):
-        element_name, isotope_num = _parse_isotope_string(isotope_string)
-        if isotope_num:
-            composition[element_name] += composition.pop(isotope_string)
-
-    mass_data = kwargs.get('mass_data', nist_mass)
-    elements_with_isotopes = kwargs.get('elements_with_isotopes')
-    isotopic_composition = Composition()
-
-    for element_name in composition:
-        if not elements_with_isotopes or (element_name in elements_with_isotopes):
-            # Take the two most abundant isotopes.
-            first_iso, second_iso = sorted([(i[0], i[1][1]) for i in mass_data[element_name].items() if i[0]],
-                                           key=lambda x: -x[1])[:2]
-
-            # Write the number of isotopes of the most abundant type:
-            # the most probable count is ceil(n * abundance).
-            first_iso_str = _make_isotope_string(element_name, first_iso[0])
-            isotopic_composition[first_iso_str] = int(math.ceil(
-                composition[element_name] * first_iso[1]))
-
-            # Write the number of the second isotopes.
-            second_iso_str = _make_isotope_string(element_name, second_iso[0])
-            isotopic_composition[second_iso_str] = composition[element_name] - isotopic_composition[first_iso_str]
-        else:
-            isotopic_composition[element_name] = composition[element_name]
-
-    return (isotopic_composition,
-            isotopic_composition_abundance(composition=isotopic_composition, mass_data=mass_data))
-
-
-def isotopic_composition_abundance(*args, **kwargs):
-    """Calculate the relative abundance of a given isotopic composition
-    of a molecule.
-
-    Parameters
-    ----------
-    formula : str, optional
-        A string with a chemical formula.
-    composition : Composition, optional
-        A Composition object with the isotopic composition of a substance.
-    mass_data : dict, optional
-        A dict with the masses of chemical elements (the default
-        value is :py:data:`nist_mass`).
-
-    Returns
-    -------
-    relative_abundance : float
-        The relative abundance of a given isotopic composition.
-    """
-
-    composition = (Composition(kwargs['composition'])
-                   if 'composition' in kwargs
-                   else Composition(*args, **kwargs))
-
-    isotopic_composition = defaultdict(dict)
-
-    # Check if there are default and non-default isotopes of the same
-    # element and rearrange the elements.
-    for element in composition:
-        element_name, isotope_num = _parse_isotope_string(element)
-
-        # If there is already an entry for this element and either it
-        # contains a default isotope or the newly added isotope is default,
-        # raise an exception.
-        if (element_name in isotopic_composition) and (isotope_num == 0 or 0 in isotopic_composition[element_name]):
-            raise PyteomicsError(
-                'Please specify the isotopic states of all atoms of %s or do not specify them at all.' % element_name)
-        else:
-            isotopic_composition[element_name][isotope_num] = composition[element]
-
-    # Calculate relative abundance.
-    mass_data = kwargs.get('mass_data', nist_mass)
-    num1, num2, denom = 1, 1, 1
-    for element_name, isotope_dict in isotopic_composition.items():
-        num1 *= math.factorial(sum(isotope_dict.values()))
-        for isotope_num, isotope_content in isotope_dict.items():
-            denom *= math.factorial(isotope_content)
-            if isotope_num:
-                num2 *= mass_data[element_name][isotope_num][1] ** isotope_content
-
-    return num2 * (num1 / denom)
-
-
-def isotopologues(*args, **kwargs):
-    """Iterate over possible isotopic states of a molecule.
-    The molecule can be defined by formula, sequence, parsed sequence, or composition.
-    The space of possible isotopic compositions is constrained by the parameters
-    ``elements_with_isotopes``, ``isotope_threshold``, ``overall_threshold``.
-
-    Parameters
-    ----------
-    formula : str, optional
-        A string with a chemical formula.
-    sequence : str, optional
-        A polypeptide sequence string in modX notation.
-    parsed_sequence : list of str, optional
-        A polypeptide sequence parsed into a list of amino acids.
-    composition : :py:class:`Composition`, optional
-        A :py:class:`Composition` object with the elemental composition of a
-        substance.
-    report_abundance : bool, optional
-        If :py:const:`True`, the output will contain 2-tuples: `(composition, abundance)`.
-        Otherwise, only compositions are yielded. Default is :py:const:`False`.
-    elements_with_isotopes : container of str, optional
-        A set of elements to be considered in isotopic distribution
-        (by default, every element has an isotopic distribution).
-    isotope_threshold : float, optional
-        The threshold abundance of a specific isotope to be considered.
-        Default is :py:const:`5e-4`.
-    overall_threshold : float, optional
-        The threshold abundance of the calculated isotopic composition.
-        Default is :py:const:`0`.
-    aa_comp : dict, optional
-        A dict with the elemental composition of the amino acids (the
-        default value is :py:data:`std_aa_comp`).
-    mass_data : dict, optional
-        A dict with the masses of chemical elements (the default
-        value is :py:data:`nist_mass`).
-
-    Returns
-    -------
-    out : iterator
-        Iterator over possible isotopic compositions.
-    """
-    iso_threshold = kwargs.pop('isotope_threshold', 5e-4)
-    overall_threshold = kwargs.pop('overall_threshold', 0.0)
-    mass_data = kwargs.get('mass_data', nist_mass)
-    elements_with_isotopes = kwargs.get('elements_with_isotopes')
-    report_abundance = kwargs.get('report_abundance', False)
-    composition = Composition(kwargs['composition']) if 'composition' in kwargs else Composition(*args, **kwargs)
-    other_kw = kwargs.copy()
-    for k in Composition._kw_sources:
-        other_kw.pop(k, None)
-
-    dict_elem_isotopes = {}
-    for element in composition:
-        if elements_with_isotopes is None or element in elements_with_isotopes:
-            element_name, isotope_num = _parse_isotope_string(element)
-            isotopes = {k: v for k, v in mass_data[element_name].items() if k != 0 and v[1] >= iso_threshold}
-            list_isotopes = [_make_isotope_string(element_name, k) for k in isotopes]
-            dict_elem_isotopes[element] = list_isotopes
-        else:
-            dict_elem_isotopes[element] = [element]
-    all_isotopologues = []
-    for element, list_isotopes in dict_elem_isotopes.items():
-        n = composition[element]
-        # All multisets of isotopes of size n for this element.
-        all_isotopologues.append(list(combinations_with_replacement(list_isotopes, n)))
-
-    for isotopologue in product(*all_isotopologues):
-        ic = Composition(formula=''.join(atom for el in isotopologue for atom in el), **other_kw)
-        if report_abundance or overall_threshold > 0.0:
-            abundance = isotopic_composition_abundance(composition=ic, **other_kw)
-            if abundance > overall_threshold:
-                if report_abundance:
-                    yield (ic, abundance)
-                else:
-                    yield ic
-        else:
-            yield ic
-
-
-std_aa_mass = {
-    'G': 57.02146372057,
-    'A': 71.03711378471,
-    'S': 87.03202840427001,
-    'P': 97.05276384885,
-    'V': 99.06841391299,
-    'T': 101.04767846841,
-    'C': 103.00918478471,
-    'L': 113.08406397713001,
-    'I': 113.08406397713001,
-    'J': 113.08406397713001,
-    'N': 114.04292744114001,
-    'D': 115.02694302383001,
-    'Q': 128.05857750527997,
-    'K': 128.09496301399997,
-    'E': 129.04259308796998,
-    'M': 131.04048491299,
-    'H': 137.05891185845002,
-    'F': 147.06841391298997,
-    'U': 150.95363508471,
-    'R':
156.10111102359997, - 'Y': 163.06332853254997, - 'W': 186.07931294985997, - 'O': 237.14772686284996} -"""A dictionary with monoisotopic masses of the twenty standard -amino acid residues, selenocysteine and pyrrolysine. -""" - - -def fast_mass(sequence, ion_type=None, charge=None, **kwargs): - """Calculate monoisotopic mass of an ion using the fast - algorithm. May be used only if amino acid residues are presented in - one-letter code. - - Parameters - ---------- - sequence : str - A polypeptide sequence string. - ion_type : str, optional - If specified, then the polypeptide is considered to be - in a form of corresponding ion. Do not forget to - specify the charge state! - charge : int, optional - If not 0 then m/z is calculated: the mass is increased - by the corresponding number of proton masses and divided - by z. - mass_data : dict, optional - A dict with the masses of chemical elements (the default - value is :py:data:`nist_mass`). - aa_mass : dict, optional - A dict with the monoisotopic mass of amino acid residues - (default is std_aa_mass); - ion_comp : dict, optional - A dict with the relative elemental compositions of peptide ion - fragments (default is :py:data:`std_ion_comp`). - - Returns - ------- - mass : float - Monoisotopic mass or m/z of a peptide molecule/ion. - """ - aa_mass = kwargs.get('aa_mass', std_aa_mass) - try: - mass = sum(aa_mass[i] for i in sequence) - except KeyError as e: - raise PyteomicsError('No mass data for residue: ' + e.args[0]) - - mass_data = kwargs.get('mass_data', nist_mass) - mass += mass_data['H'][0][0] * 2 + mass_data['O'][0][0] - - if ion_type: - try: - icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type] - except KeyError: - raise PyteomicsError('Unknown ion type: {}'.format(ion_type)) - - mass += sum(mass_data[element][0][0] * num for element, num in icomp.items()) - - if charge: - mass = (mass + mass_data['H+'][0][0] * charge) / charge - - return mass - - -def fast_mass2(sequence, ion_type=None, charge=None, **kwargs): - """Calculate monoisotopic mass of an ion using the fast - algorithm. *modX* notation is fully supported. - - Parameters - ---------- - sequence : str - A polypeptide sequence string. - ion_type : str, optional - If specified, then the polypeptide is considered to be - in a form of corresponding ion. Do not forget to - specify the charge state! - charge : int, optional - If not 0 then m/z is calculated: the mass is increased - by the corresponding number of proton masses and divided - by z. - mass_data : dict, optional - A dict with the masses of chemical elements (the default - value is :py:data:`nist_mass`). - aa_mass : dict, optional - A dict with the monoisotopic mass of amino acid residues - (default is std_aa_mass); - ion_comp : dict, optional - A dict with the relative elemental compositions of peptide ion - fragments (default is :py:data:`std_ion_comp`). - - Returns - ------- - mass : float - Monoisotopic mass or m/z of a peptide molecule/ion. 
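-
-    Examples
-    --------
-    A doctest-style sketch (the value follows from :py:data:`std_aa_mass` above):
-
-    >>> round(fast_mass2('PEPTIDE'), 3)
-    799.36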
- """ - aa_mass = kwargs.get('aa_mass', std_aa_mass) - mass_data = kwargs.get('mass_data', nist_mass) - try: - comp = parser.amino_acid_composition(sequence, - show_unmodified_termini=True, - allow_unknown_modifications=True, - labels=aa_mass) - except PyteomicsError: - raise PyteomicsError('Mass not specified for label(s): {}'.format( - ', '.join(set(parser.parse(sequence)).difference(aa_mass)))) - - try: - mass = 0 - for aa, num in comp.items(): - if aa in aa_mass: - mass += aa_mass[aa] * num - elif parser.is_term_mod(aa): - assert num == 1 - mass += calculate_mass(formula=aa.strip('-'), mass_data=mass_data) - else: - mod, X = parser._split_label(aa) - mass += (aa_mass[mod] + aa_mass[X]) * num - except KeyError as e: - raise PyteomicsError('Unspecified mass for modification: "{}"'.format(e.args[0])) - - if ion_type: - try: - icomp = kwargs.get('ion_comp', std_ion_comp)[ion_type] - except KeyError: - raise PyteomicsError('Unknown ion type: {}'.format(ion_type)) - - mass += sum(mass_data[element][0][0] * num - for element, num in icomp.items()) - - if charge: - mass = (mass + mass_data['H+'][0][0] * charge) / charge - - return mass - - -class Unimod(): - """A class for Unimod database of modifications. - The list of all modifications can be retrieved via `mods` attribute. - Methods for convenient searching are `by_title` and `by_name`. - For more elaborate filtering, iterate manually over the list. - - .. note:: - See :py:mod:`pyteomics.mass.unimod` for a new alternative class with - more features. - """ - - def __init__(self, source='http://www.unimod.org/xml/unimod.xml'): - """Create a database and fill it from XML file retrieved from `source`. - - Parameters - ---------- - - source : str or file, optional - A file-like object or a URL to read from. Don't forget the ``'file://'`` - prefix when pointing to local files. 
- """ - from lxml import etree - from ..xml import _local_name - - def process_mod(mod): - d = mod.attrib - new_d = {} - for key in ('date_time_modified', 'date_time_posted'): - new_d[key] = datetime.strptime(d.pop(key), '%Y-%m-%d %H:%M:%S') - comp = Composition() - for delta in self._xpath('delta', mod): # executed 1 time - for key in ('avge_mass', 'mono_mass'): - new_d[key] = float(delta.attrib.pop(key)) - for elem in self._xpath('element', delta): - e_d = elem.attrib - amount = int(e_d.pop('number')) - label = e_d.pop('symbol') - isotope, symbol = re.match(r'^(\d*)(\D+)$', label).groups() - if not isotope: - isotope = 0 - else: - isotope = int(isotope) - comp += Composition(formula=_make_isotope_string(symbol, isotope), mass_data=self._massdata) * amount - new_d['composition'] = comp - new_d['record_id'] = int(d.pop('record_id')) - new_d['approved'] = d.pop('approved') == '1' - new_d.update(d) - spec = [] - for sp in self._xpath('specificity', mod): - sp_d = sp.attrib - sp_new_d = {} - sp_new_d['hidden'] = (sp_d.pop('hidden') == '1') - sp_new_d['spec_group'] = int(sp_d.pop('spec_group')) - sp_new_d.update(sp_d) - notes = [] - for note in self._xpath('*', sp): - if note.text and note.text.strip(): - notes.append(note.text.strip()) - if notes: - sp_new_d['note'] = '\n'.join(notes) - spec.append(sp_new_d) - new_d['specificity'] = spec - - alt_names = [] - for alt_name in self._xpath('alt_name', mod): - alt_names.append(alt_name.text) - if alt_names: - new_d['alt_names'] = alt_names - - refs = [] - for ref in self._xpath('xref', mod): - ref_d = {} - for sub in ref.iterchildren(): - ref_d[_local_name(sub)] = sub.text - for key in ('text', 'source', 'url'): - if key not in ref_d: - ref_d[key] = None - refs.append(ref_d) - new_d['refs'] = refs - return new_d - - if isinstance(source, str): - self._tree = etree.parse(urlopen(source)) - else: - self._tree = etree.parse(source) - self._massdata = self._mass_data() - self._mods = [] - self._id = {} - for i, mod in enumerate(self._xpath('/unimod/modifications/mod')): - mod_dict = process_mod(mod) - self._mods.append(mod_dict) - self._id[mod_dict['record_id']] = i - - def _xpath(self, path, element=None): - from ..xml import xpath - if element is None: - return xpath(self._tree, path, 'umod') - return xpath(element, path, 'umod') - - def _mass_data(self): - massdata = defaultdict(dict) - elements = [x.attrib for x in self._xpath('/unimod/elements/elem')] - avg = {} - for elem in elements: - i, label = re.match(r'^(\d*)(\D+)$', elem['title']).groups() - if not i: - iso = 0 - else: - iso = int(i) - massdata[label][iso] = (float(elem['mono_mass']), float(iso == 0)) - if not iso: - avg[label] = float(elem['avge_mass']) - for elem, isotopes in massdata.items(): - isotopes[int(round(isotopes[0][0]))] = isotopes[0] - if len(isotopes) == 3: - m1, m2 = (x[1][0] for x in sorted(isotopes.items())[1:]) - m_avg = avg[elem] - a = (m2 - m_avg) / (m2 - m1) - b = (m_avg - m1) / (m2 - m1) - for state, abundance in zip(sorted(isotopes)[1:], (a, b)): - isotopes[state] = (isotopes[state][0], abundance) - return massdata - - @property - def mods(self): - """Get the list of Unimod modifications""" - return self._mods - - @property - def mass_data(self): - """Get element mass data extracted from the database""" - return self._massdata - - def by_title(self, title, strict=True): - """Search modifications by title. If a single modification is found, - it is returned. Otherwise, a list will be returned. - - Parameters - ---------- - title : str - The modification title. 
-        strict : bool, optional
-            If :py:const:`False`, the search will return all modifications
-            whose title **contains** `title`, otherwise equality is required.
-            :py:const:`True` by default.
-
-        Returns
-        -------
-        out : dict or list
-            A single modification or a list of modifications.
-        """
-        f = {True: operator.eq, False: operator.contains}
-        func = f[strict]
-        result = [m for m in self._mods if func(m['title'], title)]
-        if len(result) == 1:
-            return result[0]
-        return result
-
-    def by_name(self, name, strict=True):
-        """Search modifications by name. If a single modification is found,
-        it is returned. Otherwise, a list will be returned.
-
-        Parameters
-        ----------
-        name : str
-            The full name of the modification(s).
-        strict : bool, optional
-            If :py:const:`False`, the search will return all modifications
-            whose full name **contains** `name`, otherwise equality is
-            required. :py:const:`True` by default.
-
-        Returns
-        -------
-        out : dict or list
-            A single modification or a list of modifications.
-        """
-        f = {True: operator.eq, False: operator.contains}
-        func = f[strict]
-        result = [m for m in self._mods if func(m['full_name'], name)]
-        if len(result) == 1:
-            return result[0]
-        return result
-
-    def by_id(self, i):
-        """Search modifications by record ID. If a modification is found,
-        it is returned. Otherwise, :py:exc:`KeyError` is raised.
-
-        Parameters
-        ----------
-        i : int or str
-            The Unimod record ID.
-
-        Returns
-        -------
-        out : dict
-            A single modification dict.
-        """
-        if isinstance(i, str):
-            i = int(i)
-        return self._mods[self._id[i]]
-
-    __getitem__ = by_id
-
-
-def neutral_mass(mz, z, charge_carrier=_nist_mass[PROTON][0][0]):
-    """Calculate the neutral mass of an ion from its m/z and charge `z`,
-    given the mass of the charge carrier (a proton by default)."""
-    return (mz * abs(z)) - (z * charge_carrier)
-
-
-def mass_charge_ratio(neutral_mass, z, charge_carrier=_nist_mass[PROTON][0][0]):
-    """Calculate m/z from a neutral mass and charge `z`,
-    given the mass of the charge carrier (a proton by default)."""
-    return (neutral_mass + (z * charge_carrier)) / abs(z)
diff --git a/pyteomics/mass/unimod.py b/pyteomics/mass/unimod.py
deleted file mode 100644
index 471d00ee9c349d6c194ae2db95aa9752d2293dd6..0000000000000000000000000000000000000000
--- a/pyteomics/mass/unimod.py
+++ /dev/null
@@ -1,798 +0,0 @@
-"""
-unimod - interface to the Unimod database
-=========================================
-
-This module provides an interface to the relational Unimod database.
-The main class is :py:class:`Unimod`.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml` and :py:mod:`sqlalchemy`.
-"""
-
-# Copyright 2015 Joshua Klein, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-
-from lxml import etree
-from sqlalchemy.ext.declarative import declarative_base, DeclarativeMeta
-from sqlalchemy.orm import relationship, backref, object_session
-from sqlalchemy.ext.associationproxy import association_proxy
-from sqlalchemy import (Numeric, Unicode,
-                        Column, Integer, ForeignKey,
-                        UnicodeText, Boolean, event)
-from sqlalchemy import create_engine
-from sqlalchemy.orm import sessionmaker
-
-from . import mass
-
-model_registry = set()
-
-
-class SubclassRegisteringDeclarativeMeta(DeclarativeMeta):
-    def __new__(cls, name, parents, attrs):
-        new_type = super(SubclassRegisteringDeclarativeMeta,
-                         cls).__new__(cls, name, parents, attrs)
-        model_registry.add(new_type)
-        return new_type
-
-
-Base = declarative_base(metaclass=SubclassRegisteringDeclarativeMeta)
-
-_unimod_xml_download_url = 'http://www.unimod.org/xml/unimod_tables.xml'
-
-try:
-    basestring
-except NameError:
-    # Python 3: emulate the Python 2 `basestring` type
-    basestring = (str, bytes)
-
-
-CompositionType = mass.Composition
-
-
-def simple_repr(self):  # pragma: no cover
-    template = '{self.__class__.__name__}({d})'
-    d = ['%s=%r' % (k, v) for k, v in self.__dict__.items() if not k.startswith('_')]
-    return template.format(self=self, d=', '.join(d))
-
-
-Base.__repr__ = simple_repr
-
-
-def remove_namespace(doc, namespace):
-    """Remove namespace in the passed document in place."""
-    ns = u'{%s}' % namespace
-    nsl = len(ns)
-    for elem in doc.iter():
-        if elem.tag.startswith(ns):
-            elem.tag = elem.tag[nsl:]
-
-
-def preprocess_xml(doc_path):
-    """
-    Parse and drop namespaces from an XML document.
-
-    Parameters
-    ----------
-    doc_path : str
-
-    Returns
-    -------
-    out : etree.ElementTree
-    """
-    tree = etree.parse(doc_path)
-    root = tree.getroot()
-    for ns in root.nsmap.values():
-        remove_namespace(tree, ns)
-    return tree
-
-
-def _formula_parser(formula, session):
-    """
-    Parse a Unimod formula composed of elements,
-    isotopes, and other bricks.
-
-    In order to look up a Brick's composition, this
-    function must have access to a session.
-
-    Parameters
-    ----------
-    formula : str
-        A Unimod formula of the form `A(n) B(m)...`
-        where A, B, ... are element names or bricks and
-        (n), (m)... are parenthesized, possibly signed integers,
-        or omitted, in which case they are interpreted as 1
-    session : Session
-        An active SQLAlchemy session for looking up bricks in the database
-
-    Returns
-    -------
-    out : CompositionType
-    """
-    composition = CompositionType()
-    for token in formula.split(' '):
-        match = re.search(r'(?P<isotope>\d+)?(?P<element>[^\(]+)(?:\((?P<count>-?\d+)\))?', token)
-        if match:
-            isotope, element, count = match.groups()
-            if count is not None:
-                count = int(count)
-            else:
-                count = 1
-            if isotope is not None:
-                name = mass._make_isotope_string(element, isotope)
-            else:
-                name = element
-            is_brick = session.query(Brick).filter(Brick.brick == name).first()
-            if is_brick is None:
-                composition[name] += count
-            else:
-                composition += is_brick.composition * count
-    return composition
-
-
-def _composition_listener(attr):
-    """
-    Attach event listeners to an InstrumentedAttribute
-    to trigger formula parsing on load and on change.
-    """
-    @event.listens_for(attr, 'set')
-    def _update_composition_from_formula(target, value, oldvalue, initiator):
-        session = object_session(target)
-        if value == '' or value is None:
-            return
-        # If the object hasn't been associated with a session,
-        # we can't look up bricks.
-        if session is None:
-            return
-        target.composition = _formula_parser(value, session)
-
-    @event.listens_for(attr.class_, 'load')
-    def _update_composition_on_load(target, context):
-        value = getattr(target, attr.prop.key)
-        if value == '' or value is None:
-            return
-        session = object_session(target)
-        target.composition = _formula_parser(value, session)
-
-
-def has_composition(attr_name):
-    """
-    A decorator to simplify flagging a Model with a column
-    to be treated as a formula for parsing. Calls :func:`_composition_listener`
-    internally.
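-
-    Usage sketch (``SomeModel`` is hypothetical; it mirrors the decorated
-    ``Modification`` model defined below)::
-
-        @has_composition('_composition')
-        class SomeModel(Base):
-            __tablename__ = 'SomeModel'
-            id = Column(Integer, primary_key=True)
-            _composition = Column(Unicode(128))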
- """ - def decorator(model): - _composition_listener(getattr(model, attr_name)) - return model - return decorator - - -class HasFullNameMixin(object): - """ - A simple mixin to standardize equality operators - for models with a :attr:`full_name` attribute. - """ - def __eq__(self, other): - try: - return self.full_name == other.full_name - except AttributeError: - return False - - def __ne__(self, other): - return not self == other - - -class AlternativeName(Base): - __tablename__ = 'AlternativeName' - - _tag_name = 'alt_names_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - alt_name=attrib['alt_name'], - modification_id=int(attrib['mod_key']) - ) - return inst - - id = Column(Integer, primary_key=True) - alt_name = Column(Unicode(256), index=True) - modification_id = Column(Integer, ForeignKey('Modification.id'), index=True) - - -class AminoAcid(Base, HasFullNameMixin): - __tablename__ = 'AminoAcid' - - _tag_name = 'amino_acids_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - full_name=attrib['full_name'], - one_letter=attrib['one_letter'], - three_letter=attrib['three_letter'], - num_H=int(attrib['num_H']), - num_O=int(attrib['num_O']), - num_C=int(attrib['num_C']), - num_N=int(attrib['num_N']), - num_S=int(attrib['num_S']), - ) - return inst - - id = Column(Integer, primary_key=True) - num_H = Column(Integer) - num_O = Column(Integer) - num_C = Column(Integer) - num_N = Column(Integer) - num_S = Column(Integer) - full_name = Column(Unicode(25), index=True) - one_letter = Column(Unicode(10), index=True) - three_letter = Column(Unicode(10), index=True) - - -class Classification(Base): - __tablename__ = 'Classification' - - _tag_name = 'classifications_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - classification=attrib['classification'] - ) - return inst - - id = Column(Integer, primary_key=True) - classification = Column(Unicode(30), index=True) - - -class Position(Base): - __tablename__ = 'Position' - - _tag_name = 'positions_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - position=attrib['position'] - ) - return inst - - id = Column(Integer, primary_key=True) - position = Column(Unicode(20), index=True) - - -class Brick(Base, HasFullNameMixin): - __tablename__ = 'Brick' - - _tag_name = 'bricks_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - brick=attrib['brick'], - full_name=attrib['full_name'] - ) - return inst - - id = Column(Integer, primary_key=True) - brick = Column(Unicode(64), index=True) - full_name = Column(Unicode(128), index=True) - - elements = relationship('BrickToElement') - - @property - def composition(self): - composition = CompositionType() - for element_relation in self.elements: - symbol = element_relation.element - isotope, element = re.search(r'(?P<isotope>\d+)?(?P<element>\S+)', symbol).groups() - if isotope: - isotope = int(isotope) - iso_str = mass._make_isotope_string(element, isotope) - else: - iso_str = element - count = element_relation.count - composition[iso_str] = count - return composition - - -class Fragment(Base): - __tablename__ = 'Fragment' - - _tag_name = 'fragments_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - 
modification_id=int(attrib['mod_key']) - ) - return inst - - id = Column(Integer, primary_key=True) - modification_id = Column(Integer, ForeignKey('Modification.id'), index=True) - - _fragment_composition = relationship('FragmentComposition') - - @property - def composition(self): - composition = CompositionType() - session = object_session(self) - for fragment_composition_relation in self._fragment_composition: - symbol = fragment_composition_relation.brick_string - isotope, element = re.search(r'(?P<isotope>\d+)?(?P<element>\S+)', symbol).groups() - count = fragment_composition_relation.count - if count is not None: - count = int(count) - else: - count = 1 - if isotope: - name = mass._make_isotope_string(element, isotope) - else: - name = element - is_brick = session.query(Brick).filter(Brick.brick == name).first() - if is_brick is None: - composition[name] += count - else: - composition += is_brick.composition * count - return composition - - -class FragmentComposition(Base): - __tablename__ = 'FragmentComposition' - - _tag_name = 'fragment_comp_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - brick_string=attrib['brick'], - fragment_id=int(attrib['fragments_key']), - count=int(attrib['num_brick']) - ) - return inst - - id = Column(Integer, primary_key=True) - brick_string = Column(Unicode(64), ForeignKey(Brick.brick), index=True) - fragment_id = Column(Integer, ForeignKey('Fragment.id'), index=True) - count = Column(Integer) - - -class ModificationToBrick(Base): - __tablename__ = 'ModificationToBrick' - - _tag_name = 'mod2brick_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - brick_string=(attrib['brick']), - modification_id=int(attrib['mod_key']), - count=int(attrib['num_brick']) - ) - return inst - - id = Column(Integer, primary_key=True) - brick_string = Column(Unicode(64), ForeignKey(Brick.brick), index=True) - modification_id = Column(Integer, ForeignKey('Modification.id'), index=True) - count = Column(Integer) - - -class BrickToElement(Base): - __tablename__ = 'BrickToElement' - - _tag_name = 'brick2element_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - brick_id=int(attrib['brick_key']), - count=int(attrib['num_element']), - element=attrib['element'] - ) - return inst - - id = Column(Integer, primary_key=True) - brick_id = Column(Integer, ForeignKey(Brick.id), index=True) - element = Column(Unicode(16), ForeignKey('Element.element'), index=True) - element_obj = relationship('Element', uselist=False) - count = Column(Integer) - - -class Element(Base, HasFullNameMixin): - __tablename__ = 'Element' - - _tag_name = 'elements_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - average_mass=float(attrib['avge_mass']), - monoisotopic_mass=float(attrib['mono_mass']), - full_name=attrib['full_name'], - element=attrib['element'] - - ) - return inst - - id = Column(Integer, primary_key=True) - average_mass = Column(Numeric(12, 6, asdecimal=False)) - monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False)) - full_name = Column(Unicode(64), index=True) - element = Column(Unicode(16), index=True) - - -@has_composition('_composition') -class Modification(Base, HasFullNameMixin): - __tablename__ = 'Modification' - - _tag_name = 'modifications_row' - - id = Column(Integer, primary_key=True) - username_of_poster = 
Column(Unicode(128)) - average_mass = Column(Numeric(12, 6, asdecimal=False), index=True) - ex_code_name = Column(Unicode(64), index=True) - monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False), index=True) - full_name = Column(Unicode(128), index=True) - code_name = Column(Unicode(128), index=True) - _composition = Column(Unicode(128), index=True) - approved = Column(Boolean, index=True) - - notes = relationship('MiscNotesModifications') - specificities = relationship('Specificity') - bricks = relationship(ModificationToBrick) - _fragments = relationship(Fragment) - - _alt_names = relationship(AlternativeName, backref=backref('modification')) - # Maps the list of AlternativeName instances loaded dynamically from _alt_names - # into a list of plain strings, since the AlternativeName type contains no - # additional information. - alternative_names = association_proxy('_alt_names', 'alt_name') - fragments = association_proxy('_fragments', 'composition') - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - username_of_poster=attrib['username_of_poster'], - average_mass=float(attrib['avge_mass']), - monoisotopic_mass=float(attrib['mono_mass']), - ex_code_name=attrib['ex_code_name'], - code_name=attrib['code_name'], - full_name=attrib['full_name'], - approved=bool(int(attrib['approved'])), - _composition=attrib['composition'] - ) - for note in tag: - if note.tag == MiscNotesModifications._tag_name: - model_note = MiscNotesModifications._from_tag(note, inst.id) - if model_note is not None: - inst.notes.append(model_note) - return inst - - -class MiscNotesModifications(Base): - __tablename__ = 'MiscNotesModifications' - _tag_name = 'misc_notes' - - id = Column(Integer, primary_key=True) - modification_id = Column(Integer, ForeignKey(Modification.id), index=True) - text = Column(UnicodeText) - - @classmethod - def _from_tag(cls, tag, modification_id): - if tag.text is None: - return - return cls(text=tag.text, modification_id=modification_id) - - -class Specificity(Base): - __tablename__ = 'Specificity' - - _tag_name = 'specificity_row' - - id = Column(Integer, primary_key=True) - position_id = Column(Integer, ForeignKey(Position.id), index=True) - classification_id = Column(Integer, ForeignKey(Classification.id), index=True) - classification = relationship('Classification', uselist=False) - # Map through one_letter - amino_acid = Column(Unicode(10), ForeignKey(AminoAcid.one_letter), index=True) - modification_id = Column(Integer, ForeignKey(Modification.id), index=True) - hidden = Column(Boolean, index=True) - group = Column(Integer, index=True) - neutral_losses = relationship('SpecificityToNeutralLoss') - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - position_id=int(attrib['position_key']), - classification_id=int(attrib['classifications_key']), - hidden=bool(int(attrib['hidden'])), - amino_acid=attrib['one_letter'], - modification_id=int(attrib['mod_key']), - ) - return inst - - -class NeutralLoss(Base): - __tablename__ = 'NeutralLoss' - - _tag_name = 'neutral_losses_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - brick_string=(attrib['brick']), - count=int(attrib['num_brick']), - specificity_id=int(attrib['spec_key']) - ) - return inst - - id = Column(Integer, primary_key=True) - brick_string = Column(Unicode(64), index=True) - specificity_id = Column(Integer, ForeignKey(Specificity.id), index=True) 
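- # number of copies of the referenced brick making up this neutral loss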
- count = Column(Integer) - - -@has_composition('_composition') -class SpecificityToNeutralLoss(Base): - __tablename__ = 'SpecificityToNeutralLoss' - - _tag_name = 'spec2nl_row' - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls( - id=int(attrib['record_id']), - specificity_id=int(attrib['spec_key']), - monoisotopic_mass=float(attrib['nl_mono_mass']), - average_mass=float(attrib['nl_avge_mass']), - is_required_peptide_neutral_loss=bool(int(attrib['is_req_pep_nl'])), - is_peptide_neutral_loss=bool(int(attrib['is_pep_nl'])), - is_slave=bool(int(attrib['is_slave_nl'])), - _composition=attrib['nl_composition'] - ) - return inst - - id = Column(Integer, primary_key=True) - specificity_id = Column(Integer, ForeignKey(Specificity.id), index=True) - specificity = relationship(Specificity, uselist=False) - monoisotopic_mass = Column(Numeric(12, 6, asdecimal=False), index=True) - average_mass = Column(Numeric(12, 6, asdecimal=False), index=True) - _composition = Column(Unicode(128)) - is_slave = Column(Boolean, index=True) - is_peptide_neutral_loss = Column(Boolean, index=True) - is_required_peptide_neutral_loss = Column(Boolean, index=True) - - -class CrossreferenceSource(Base): - __tablename__ = 'CrossreferenceSource' - _tag_name = 'xref_sources_row' - - id = Column(Integer, primary_key=True) - source = Column(Unicode(64), index=True) - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls() - inst.id = int(attrib['record_id']) - inst.source = attrib['xref_source'] - return inst - - -class Crossreference(Base): - __tablename__ = 'Crossreference' - - _tag_name = 'xrefs_row' - - id = Column(Integer, primary_key=True) - source_id = Column(Integer, ForeignKey(CrossreferenceSource.id), index=True) - source = relationship(CrossreferenceSource, uselist=False) - url = Column(Unicode(128)) - modification_id = Column(Integer, ForeignKey(Modification.id), index=True) - text = Column(UnicodeText) - - @classmethod - def from_tag(cls, tag): - attrib = tag.attrib - inst = cls() - inst.id = int(attrib['record_id']) - inst.url = attrib['xref_url'] - inst.source_id = int(attrib['xref_source_key']) - inst.modification_id = int(attrib['mod_key']) - text = [] - for node in tag.getchildren(): - if node.tag == 'xref_text': - if node.text is not None: - text.append(node.text) - inst.text = '\n'.join(text) - return inst - - -def load(doc_path, output_path='sqlite://'): - """ - Parse the relational table-like XML file provided by http://www.unimod.org/downloads.html - and convert each <tag>_row into an equivalent database entry. - - By default the table will be held in memory. - """ - tree = preprocess_xml(doc_path) - engine = create_engine(output_path) - Base.metadata.create_all(engine) - session = sessionmaker(bind=engine, autoflush=False)() - for model in model_registry: - if hasattr(model, '_tag_name') and hasattr(model, 'from_tag'): - for tag in tree.iterfind('.//' + model._tag_name): - session.add(model.from_tag(tag)) - session.commit() - return session - - -def session(path='sqlite:///unimod.db'): - engine = create_engine(path) - Base.metadata.create_all(engine) - session = sessionmaker(bind=engine, autoflush=False)() - return session - - -class Unimod(object): - """ - Main class representing the relational Unimod database. - - Examples - -------- - - If you just wish to get a new copy of the data and store it in a temporary - in-memory database, invoking the type without parameters works without issue. 
- - >>> new_db = Unimod() - - If you want to persist a snapshot of the Unimod database to disk and query it - from there, or to re-use a previously downloaded database copy, pass a database - driver prefixed path: - - >>> reused_db = Unimod("sqlite:///path/to/unimod.db") - - If the path did not previously exist, a new copy of Unimod will be downloaded - and stored there on the first use, but be immediately available on subsequent - uses. - """ - def __init__(self, path=None): - """ - Initialize the object from a database file. - - Parameters - ---------- - path : str or None, optional - If :py:class:`str`, should point to a database. - Use a dialect-specific prefix, like ``'sqlite://'``. - If :py:const:`None` (default), a relational - XML file will be downloaded from default location. - """ - if path is None: - self.path = None - self.session = load(_unimod_xml_download_url) - else: - self.path = path - try: - self.session = session(path) - if self.session.query(Modification).first() is None: - raise Exception() - except: - # Database may not yet exist at that location - self.session = load(_unimod_xml_download_url, path) - self.session.query(Modification).first() - - def get(self, identifier, strict=True): - """ - Get a modification matching `identifier`. - Replaces both :py:mod:`by_name` and :py:mod:`by_title` methods - in the old class. - - Parameters - ---------- - identifier : str - - strict : bool, optional - Defaults to :py:const:`True`. - - Returns - ------- - out : Modification - """ - if isinstance(identifier, int): - mod = self.session.query(Modification).get(identifier) - if mod is None: - raise KeyError(identifier) - return mod - elif isinstance(identifier, basestring): - if strict: - mod = self.session.query(Modification).filter( - (Modification.full_name == identifier) | - (Modification.code_name == identifier) | - (Modification.ex_code_name == identifier)).first() - if mod is None: - alt_name = self.session.query(AlternativeName).filter( - AlternativeName.alt_name == identifier).first() - if alt_name is None: - raise KeyError(identifier) - mod = alt_name.modification - return mod - else: - qname = '%%%s%%' % identifier - mod = self.session.query(Modification).filter( - (Modification.full_name.like(qname)) | - (Modification.code_name.like(qname)) | - (Modification.ex_code_name.like(qname))).first() - if mod is None: - alt_name = self.session.query(AlternativeName).filter( - AlternativeName.alt_name.like(qname)).first() - if alt_name is None: - raise KeyError(identifier) - mod = alt_name.modification - return mod - - by_title = by_name = get - - __getitem__ = get - - @property - def mods(self): - return self.session.query(Modification).all() - - def __iter__(self): - return iter(self.session.query(Modification).yield_per(1000)) - - def query(self, *args): - '''Compose an SQL query using SQLAlchemy's ORM interface. - - See :mod:`sqlalchemy`'s Session documentation for more details. - ''' - return self.session.query(*args) - - def execute(self, *args, **kwargs): - '''Execute an SQLAlchemy statement or a SQL string against the database, - returning the resulting database cursor. - - See :mod:`sqlalchemy`'s Session documentation for more details. 
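- 
- A minimal sketch of raw SQL use (the statement is illustrative and assumes
- the default tables created by :py:func:`load`):
- 
- >>> db = Unimod()  # doctest: +SKIP
- >>> cursor = db.execute('SELECT count(*) FROM Modification')  # doctest: +SKIP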
- ''' - return self.session.execute(*args, **kwargs) diff --git a/pyteomics/mgf.py b/pyteomics/mgf.py deleted file mode 100644 index 811e43ebd65f90465707cb221048e6ab611092fb..0000000000000000000000000000000000000000 --- a/pyteomics/mgf.py +++ /dev/null @@ -1,830 +0,0 @@ -""" -mgf - read and write MS/MS data in Mascot Generic Format -======================================================== - -Summary -------- - -`MGF <http://www.matrixscience.com/help/data_file_help.html>`_ is a simple -human-readable format for MS/MS data. It allows storing MS/MS peak lists and -exprimental parameters. - -This module provides classes and functions for access to data stored in -MGF files. -Parsing is done using :py:class:`MGF` and :py:class:`IndexedMGF` classes. -The :py:func:`read` function can be used as an entry point. -MGF spectra are converted to dictionaries. MS/MS data points are -(optionally) represented as :py:mod:`numpy` arrays. -Also, common parameters can be read from MGF file header with -:py:func:`read_header` function. -:py:func:`write` allows creation of MGF files. - -Classes -------- - - :py:class:`MGF` - a text-mode MGF parser. Suitable to read spectra from a file consecutively. - Needs a file opened in text mode (or will open it if given a file name). - - :py:class:`IndexedMGF` - a binary-mode MGF parser. When created, builds a byte offset index - for fast random access by spectrum titles. Sequential iteration is also supported. - Needs a seekable file opened in binary mode (if created from existing file object). - - :py:class:`MGFBase` - abstract class, the common ancestor of the two classes above. - Can be used for type checking. - -Functions ---------- - - :py:func:`read` - an alias for :py:class:`MGF` or :py:class:`IndexedMGF`. - - :py:func:`get_spectrum` - read a single spectrum with given title from a file. - - :py:func:`chain` - read multiple files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - - :py:func:`read_header` - get a dict with common parameters for all spectra - from the beginning of MGF file. - - :py:func:`write` - write an MGF file. - -------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - import numpy as np -except ImportError: - np = None -import itertools as it -import sys -import warnings -from . import auxiliary as aux - - -class MGFBase(aux.MaskedArrayConversionMixin): - """Abstract mixin class representing an MGF file. Subclasses implement different approaches to parsing.""" - _comments = set('#;!/') - _array_keys = ['m/z array', 'intensity array', 'charge array', 'ion array'] - _array_keys_unicode = [u'm/z array', u'intensity array', u'charge array', u'ion array'] - encoding = None - - def __init__(self, source=None, **kwargs): - """Create an MGF file object, set MGF-specific parameters. 
- - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MGF format. Default is - :py:const:`None`, which means read standard input. - - use_header : bool, optional, keyword only - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`True`. - - convert_arrays : one of {0, 1, 2}, optional, keyword only - If `0`, m/z, intensities and (possibly) charges or (possibly) ions will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - read_charges : bool, optional, keyword only - If `True` (default), fragment charges are reported. Disabling it improves performance. - - read_ions : bool, optional - If `True` (default: False), fragment ions are reported. Disabling it improves performance. - Note that right now, only one of (read_charges, read_ions) may be True. - - dtype : type or str or dict, optional, keyword only - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. - - encoding : str, optional, keyword only - File encoding. - """ - - super(MGFBase, self).__init__(source, **kwargs) - self._use_header = kwargs.pop('use_header', True) - self._read_charges = kwargs.pop('read_charges', True) - self._read_ions = kwargs.pop('read_ions', False) - # Make sure no charges are read if ions are read - if self._read_ions: - self._read_charges = False - if self._use_header: - self._read_header() - else: - self._header = None - - def __reduce_ex__(self, protocol): - return (self.__class__, (self._source_init,), self.__getstate__()) - - def __getstate__(self): - state = super(MGFBase, self).__getstate__() - state['use_header'] = self._use_header - state['header'] = self._header - return state - - def __setstate__(self, state): - super(MGFBase, self).__setstate__(state) - self._header = state['header'] - self._use_header = state['use_header'] - - @staticmethod - def parse_precursor_charge(charge_text, list_only=False): - return aux._parse_charge(charge_text, list_only=list_only) - - @staticmethod - def parse_peak_charge(charge_text, list_only=False): - return aux._parse_charge(charge_text, list_only=False) - - @staticmethod - def parse_peak_ion(ion_text): - return aux._parse_ion(ion_text) - - @property - def header(self): - if self._header is None: - self._read_header() - return self._header - - def _read_header_lines(self, header_lines): - header = {} - for line in header_lines: - if line.strip() == 'BEGIN IONS': - break - l = line.split('=') - if len(l) == 2: - key = l[0].lower() - val = l[1].strip() - header[key] = val - if 'charge' in header: - header['charge'] = self.parse_precursor_charge(header['charge'], True) - self._header = header - - def _read_spectrum_lines(self, lines): - """Read a single spectrum from ``self._source``. 
- - Returns - ------- - out : dict - """ - - masses = [] - intensities = [] - charges = [] - ions = [] - - params = self.header.copy() if self._use_header else {} - - for i, line in enumerate(lines): - sline = line.strip() - if sline == 'BEGIN IONS': - if i == 0: - continue - else: - raise aux.PyteomicsError('Error when parsing MGF: unexpected start of spectrum.') - if not sline or sline[0] in self._comments: - pass - elif sline == 'END IONS': - if 'pepmass' in params: - try: - pepmass = tuple(map(float, params['pepmass'].split())) - except ValueError: - raise aux.PyteomicsError('MGF format error: cannot parse ' - 'PEPMASS = {}'.format(params['pepmass'])) - else: - params['pepmass'] = pepmass + (None,) * (2-len(pepmass)) - if isinstance(params.get('charge'), aux.basestring): - params['charge'] = self.parse_precursor_charge(params['charge'], True) - if 'rtinseconds' in params: - params['rtinseconds'] = aux.unitfloat(params['rtinseconds'], 'second') - out = {'params': params, 'm/z array': masses, 'intensity array': intensities} - if self._read_charges: - out['charge array'] = charges - if self._read_ions: - out['ion array'] = ions - self._build_all_arrays(out) - if self.encoding and sys.version_info.major == 2: - for key, ukey in zip(self._array_keys + ['params'], self._array_keys_unicode + [u'params']): - if key in out: - out[ukey] = out.pop(key) - return out - - else: - if '=' in sline: # spectrum-specific parameters! - l = sline.split('=', 1) - params[l[0].lower()] = l[1].strip() - else: # this must be a peak list - l = sline.split() - try: - masses.append(float(l[0])) - intensities.append(float(l[1])) - if self._read_charges: - charges.append(self.parse_peak_charge(l[2]) if len(l) > 2 else 0) - if self._read_ions: - ions.append(self.parse_peak_ion(l[2]) if len(l) > 2 else "") - except ValueError: - raise aux.PyteomicsError( - 'Error when parsing %s. Line:\n%s' % (getattr(self._source, 'name', 'MGF file'), line)) - except IndexError: - pass - - def get_spectrum(self, title): - raise NotImplementedError() - - @staticmethod - def _get_time(spectrum): - try: - return spectrum['params']['rtinseconds'] - except KeyError: - raise aux.PyteomicsError('RT information not found.') - - -class IndexedMGF(MGFBase, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixin, aux.IndexSavingTextReader): - """ - A class representing an MGF file. Supports the `with` syntax and direct iteration for sequential - parsing. Specific spectra can be accessed by title using the indexing syntax in constant time. - If created using a file object, it needs to be opened in binary mode. - - When iterated, :py:class:`IndexedMGF` object yields spectra one by one. - Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array', - 'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and - 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - 'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints, - 'ion_array' is an array of Ions (str) - and 'params' stores a :py:class:`dict` of parameters (keys and values are - :py:class:`str`, keys corresponding to MGF, lowercased). - - Attributes - ---------- - - header : dict - The file header. - time : RTLocator - A property used for accessing spectra by retention time. 
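- 
- Examples
- --------
- 
- A minimal usage sketch (the file name and spectrum title are illustrative):
- 
- >>> with IndexedMGF('spectra.mgf') as reader:  # doctest: +SKIP
- ...     spectrum = reader['Spectrum 1']  # constant-time access by TITLE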
- """ - delimiter = 'BEGIN IONS' - - def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True, - dtype=None, encoding='utf-8', index_by_scans=False, read_ions=False, _skip_index=False, **kwargs): - """ - Create an :py:class:`IndexedMGF` (binary-mode) reader for a given MGF file. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MGF format. Default is - :py:const:`None`, which means read standard input. - - .. note :: If a file object is given, it must be opened in binary mode. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`True`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - read_charges : bool, optional - If `True` (default), fragment charges are reported. Disabling it improves performance. - - read_ions : bool, optional - If `True` (default: False), fragment ion types are reported. Disabling it improves performance. - Note that right now, only one of (read_charges, read_ions) may be True. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. - - encoding : str, optional - File encoding. - - block_size : int, optinal - Size of the chunk (in bytes) used to parse the file when creating the byte offset index. - - Returns - ------- - - out : IndexedMGF - The reader object. - """ - self._index_by_scans = index_by_scans - self._read_ions = read_ions - self.label = r'SCANS=(\d+)\s*' if index_by_scans else r'TITLE=([^\n]*\S)\s*' - super(IndexedMGF, self).__init__(source, parser_func=self._read, pass_file=False, args=(), kwargs={}, - use_header=use_header, convert_arrays=convert_arrays, - read_charges=read_charges, - dtype=dtype, encoding=encoding, read_ions=read_ions, _skip_index=_skip_index, - **kwargs) - - def __reduce_ex__(self, protocol): - return (self.__class__, - (self._source_init, False, self._convert_arrays, self._read_charges, - None, self.encoding, self._index_by_scans, self._read_ions, True), - self.__getstate__()) - - @aux._keepstate_method - def _read_header(self): - try: - first = next(v for v in self._offset_index.values())[0] - except StopIteration: # the index is empty, no spectra in file - first = -1 - header_lines = self.read(first).decode(self.encoding).split('\n') - return self._read_header_lines(header_lines) - - def _item_from_offsets(self, offsets): - start, end = offsets - lines = self._read_lines_from_offsets(start, end) - return self._read_spectrum_lines(lines) - - def _read(self, **kwargs): - for _, offsets in self._offset_index.items(): - spectrum = self._item_from_offsets(offsets) - yield spectrum - - def get_spectrum(self, key): - return self.get_by_id(key) - - -class MGF(MGFBase, aux.FileReader): - """ - A class representing an MGF file. Supports the `with` syntax and direct iteration for sequential - parsing. Specific spectra can be accessed by title using the indexing syntax (if the file is seekable), - but it takes linear time to search through the file. 
Consider using :py:class:`IndexedMGF` for - constant-time access to spectra. - - :py:class:`MGF` object behaves as an iterator, **yielding** spectra one by one. - Each 'spectrum' is a :py:class:`dict` with five keys: 'm/z array', - 'intensity array', 'charge array', 'ion array' and 'params'. 'm/z array' and - 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - 'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints, - 'ion_array' is a masked array of Ions (str) - and 'params' stores a :py:class:`dict` of parameters (keys and values are - :py:class:`str`, keys corresponding to MGF, lowercased). - - Attributes - ---------- - - header : dict - The file header. - - """ - - def __init__(self, source=None, use_header=True, convert_arrays=2, read_charges=True, - read_ions=False, dtype=None, encoding=None): - """ - Create an :py:class:`MGF` (text-mode) reader for a given MGF file. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MGF format. Default is - :py:const:`None`, which means read standard input. - - ..note :: If a file object is given, it must be opened in text mode. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`True`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - read_charges : bool, optional - If `True` (default), fragment charges are reported. Disabling it improves performance. - - read_ions : bool, optional - If `True` (default: False), fragment ion types are reported. Disabling it improves performance. - Note that right now, only one of (read_charges, read_ions) may be True. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. - - encoding : str, optional - File encoding. - - Returns - ------- - - out : MGF - The reader object. - """ - super(MGF, self).__init__(source, mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={}, - encoding=encoding, use_header=use_header, convert_arrays=convert_arrays, read_charges=read_charges, - read_ions=read_ions, dtype=dtype) - - @aux._keepstate_method - def _read_header(self): - return self._read_header_lines(self._source) - - def _read_spectrum(self): - return self._read_spectrum_lines(self._source) - - def _read(self): - for line in self._source: - if line.strip() == 'BEGIN IONS': - yield self._read_spectrum() - - @aux._keepstate_method - def get_spectrum(self, title): - for line in self._source: - sline = line.strip() - if sline[:5] == 'TITLE' and sline.split('=', 1)[1].strip() == title: - spectrum = self._read_spectrum() - spectrum['params']['title'] = title - return spectrum - - def __getitem__(self, key): - return self.get_spectrum(key) - - -def read(*args, **kwargs): - """Returns a reader for a given MGF file. Most of the parameters repeat the - instantiation signature of :py:class:`MGF` and :py:class:`IndexedMGF`. - Additional parameter `use_index` helps decide which class to instantiate - for given `source`. 
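- 
- A minimal iteration sketch (the file name is illustrative):
- 
- >>> with read('spectra.mgf') as reader:  # doctest: +SKIP
- ...     for spectrum in reader:
- ...         print(spectrum['params'].get('title'))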
- - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MGF format. Default is - :py:const:`None`, which means read standard input. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`True`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - read_charges : bool, optional - If `True` (default), fragment charges are reported. Disabling it improves performance. - - read_ions : bool, optional - If `True` (default: False), fragment ion types are reported. Disabling it improves performance. - Note that right now, only one of (read_charges, read_ions) may be True. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array' and/or 'ion array'. - - encoding : str, optional - File encoding. - - use_index : bool, optional - Determines which parsing method to use. If :py:const:`True` (default), an instance of - :py:class:`IndexedMGF` is created. This facilitates random access by spectrum titles. - If an open file is passed as `source`, it needs to be open in binary mode. - - If :py:const:`False`, an instance of :py:class:`MGF` is created. It reads - `source` in text mode and is suitable for iterative parsing. Access by spectrum title - requires linear search and thus takes linear time. - - block_size : int, optinal - Size of the chunk (in bytes) used to parse the file when creating the byte offset index. - (Accepted only for :py:class:`IndexedMGF`.) - - Returns - ------- - - out : MGFBase - Instance of :py:class:`MGF` or :py:class:`IndexedMGF`. - """ - if args: - source = args[0] - else: - source = kwargs.get('source') - use_index = kwargs.pop('use_index', None) - use_index = aux._check_use_index(source, use_index, True) - tp = IndexedMGF if use_index else MGF - return tp(*args, **kwargs) - - -def get_spectrum(source, title, *args, **kwargs): - """Read one spectrum (with given `title`) from `source`. - - See :py:func:`read` for explanation of parameters affecting the output. - - .. note :: Only the key-value pairs after the "TITLE =" line will be included in the output. - - Parameters - ---------- - - source : str or file or None - File to read from. - title : str - Spectrum title. - *args - Given to :py:func:`read`. - **kwargs - Given to :py:func:`read`. - - Returns - ------- - out : dict or None - A dict with the spectrum, if it is found, and None otherwise. - - """ - with read(source, *args, **kwargs) as f: - return f[title] - - -@aux._keepstate -def read_header(source): - """ - Read the specified MGF file, get search parameters specified in the header - as a :py:class:`dict`, the keys corresponding to MGF format (lowercased). - - Parameters - ---------- - - source : str or file - File name or file object representing an file in MGF format. 
- - Returns - ------- - - header : dict - """ - with aux._file_obj(source, 'r') as source: - header = {} - for line in source: - if line.strip() == 'BEGIN IONS': - break - l = line.split('=') - if len(l) == 2: - key = l[0].lower() - val = l[1].strip() - header[key] = val - if 'charge' in header: - header['charge'] = aux._parse_charge(header['charge'], True) - return header - - -_default_key_order = ['title', 'pepmass', 'rtinseconds', 'charge'] - - -def _pepmass_repr(k, pepmass): - outstr = k.upper() + '=' - if not isinstance(pepmass, (str, int, float)): # assume iterable - try: - outstr += ' '.join(str(x) for x in pepmass if x is not None) - except TypeError: - raise aux.PyteomicsError('Cannot handle parameter: PEPMASS = {}'.format(pepmass)) - else: - outstr += str(pepmass) - return outstr - - -def _charge_repr(k, charge): - try: - val = aux.Charge(charge) - except (TypeError, aux.PyteomicsError): - val = aux.ChargeList(charge) - return '{}={}'.format(k.upper(), val) - - -def _default_repr(key, val): - return '{}={}'.format(key.upper(), val) - - -_default_value_formatters = {'pepmass': _pepmass_repr, 'charge': _charge_repr} - - -@aux._file_writer() -def write(spectra, output=None, header='', key_order=_default_key_order, fragment_format=None, - write_charges=True, write_ions=False, use_numpy=None, param_formatters=_default_value_formatters): - """ - Create a file in MGF format. - - Parameters - ---------- - - spectra : iterable - A **sequence** of dictionaries with keys 'm/z array', 'intensity array', - and 'params'. 'm/z array' and 'intensity array' should be sequences of - :py:class:`int`, :py:class:`float`, or :py:class:`str`. Strings will - be written 'as is'. The sequences should be of equal length, otherwise - excessive values will be ignored. - - 'params' should be a :py:class:`dict` with keys corresponding to MGF - format. Keys must be strings, they will be uppercased and used as is, - without any format consistency tests. Values can be of any type allowing - string representation. - - 'charge array' or 'ion array' can also be specified. - - .. note :: - Passing a single spectrum will work, but will trigger a warning. This usage pattern is discouraged. - To ensure correct output when writing multiple spectra, - it is recommended to construct a sequence of spectra first and then call :py:func:`write` once. - - .. seealso :: - This discussion of usage patterns of :py:func:`write`: https://github.com/levitsky/pyteomics/discussions/109 - - output : str or file or None, optional - Path or a file-like object open for writing. If an existing file is - specified by file name, it will be opened for writing. - Default value is :py:const:`None`, which means using standard output. - - .. note:: - The default mode for output files specified by name has been changed - from `a` to `w` in *pyteomics 4.6*. See `file_mode` to override the mode. - - header : dict or (multiline) str or list of str, optional - In case of a single string or a list of strings, the header will be - written 'as is'. In case of dict, the keys (must be strings) will be - uppercased. - - write_charges : bool, optional - If :py:const:`False`, fragment charges from 'charge array' will not be written. - Default is :py:const:`True`. - - write_ions : bool, optional - If :py:const:`False`, fragment ions from 'ion array' will not be written. - If :py:const:`True`, then `write_charges` is set to :py:const:`False`. - Default is :py:const:`False`. 
- - fragment_format : str, optional - Format string for m/z, intensity and charge (or ion annotation) of a fragment. Useful to set - the number of decimal places, e.g.: - ``fragment_format='%.4f %.0f'``. Default is ``'{} {} {}'``. - - .. note:: - The supported format syntax differs depending on other parameters. - If `use_numpy` is :py:const:`True` and :py:mod:`numpy` is available, - fragment peaks will be written using :py:func:`numpy.savetxt`. Then, - `fragment_format` must be recognized by that function. - - Otherwise, plain Python string formatting is done. - See `the docs - <https://docs.python.org/library/string.html#format-specification-mini-language>`_ - for details on writing the format string. - If some or all charges are missing, an empty string is substituted - instead, so formatting as :py:class:`!float` or :py:class:`!int` will raise an exception. - Hence it is safer to just use ``{}`` for charges. - - key_order : list, optional - A list of strings specifying the order in which params will be written in - the spectrum header. Unlisted keys will be in arbitrary order. - Default is :py:data:`_default_key_order`. - - .. note:: This does not affect the order of lines in the global header. - - param_formatters : dict, optional - A dict mapping parameter names to functions. Each function must accept - two arguments (key and value) and return a string. - Default is :py:data:`_default_value_formatters`. - - use_numpy : bool, optional - Controls whether fragment peak arrays are written using :py:func:`numpy.savetxt`. - Using :py:func:`numpy.savetxt` is faster, but cannot handle sparse arrays of fragment charges. - You may want to disable this if you need to save spectra with 'charge arrays' with missing values. - - If not specified, will be set to the opposite of `write_chrages`. - If :py:mod:`numpy` is not available, this parameter has no effect. - - file_mode : str, keyword only, optional - If `output` is a file name, defines the mode the file will be opened in. - Otherwise will be ignored. Default is `'w'`. - - .. note :: - The default changed from `'a'` in *pyteomics 4.6*. - - encoding : str, keyword only, optional - Output file encoding (if `output` is specified by name). - - Returns - ------- - - output : file - """ - def key_value_line(key, val): - return param_formatters.get(key, _default_repr)(key, val) + '\n' - - nones = (None, np.nan, np.ma.masked) if np is not None else (None,) - - if fragment_format is None: - fragment_format = '{} {} {}' - np_format_2 = '%.5f %.1f' - np_format_3 = '%.5f %.1f %d' - np_format_i = '%.5f %.1f %s' - else: - np_format_2 = np_format_3 = np_format_i = fragment_format - format_str = fragment_format + '\n' - - if write_ions: - write_charges = False - if use_numpy is None: - use_numpy = not write_charges - - if isinstance(header, dict): - head_dict = header.copy() - head_lines = [key_value_line(k, v) for k, v in header.items()] - head_str = '\n'.join(head_lines) - else: - if isinstance(header, str): - head_str = header - head_lines = header.split('\n') - else: - head_lines = list(header) - head_str = '\n'.join(header) - head_dict = {} - for line in head_lines: - if not line.strip() or any(line.startswith(c) for c in MGF._comments): - continue - l = line.split('=') - if len(l) == 2: - head_dict[l[0].lower()] = l[1].strip() - if head_str: - output.write(head_str + '\n\n') - - if isinstance(spectra, dict) and 'm/z array' in spectra: - spectra = (spectra, ) - warnings.warn("Passing a single spectrum to `write()` is discouraged. 
" - "To write a set of spectra, pass them to `write()` all at once. " - "For more info, see: https://github.com/levitsky/pyteomics/discussions/109.") - - for spectrum in spectra: - output.write('BEGIN IONS\n') - found = set() - for key in it.chain(key_order, spectrum['params']): - if key not in found and key in spectrum['params']: - found.add(key) - val = spectrum['params'][key] - if val != head_dict.get(key): - output.write(key_value_line(key, val)) - - try: - success = True - if np is not None and use_numpy: - if (not write_charges or 'charge array' not in spectrum) and (not write_ions or 'ion array' not in spectrum): - X = np.empty((len(spectrum['m/z array']), 2)) - X[:, 0] = spectrum['m/z array'] - X[:, 1] = spectrum['intensity array'] - np.savetxt(output, X, fmt=np_format_2) - elif isinstance(spectrum.get('charge array'), np.ndarray): - X = np.empty((len(spectrum['m/z array']), 3)) - X[:, 0] = spectrum['m/z array'] - X[:, 1] = spectrum['intensity array'] - X[:, 2] = spectrum['charge array'] - np.savetxt(output, X, fmt=np_format_3) - elif isinstance(spectrum.get('ion array'), np.ndarray): - X = np.empty((len(spectrum['m/z array']), 3), dtype=object) - X[:, 0] = spectrum['m/z array'] - X[:, 1] = spectrum['intensity array'] - X[:, 2] = spectrum['ion array'] - np.savetxt(output, X, fmt=np_format_i) - else: - success = False - else: - success = False - - if not success: - for m, i, c in zip(spectrum['m/z array'], - spectrum['intensity array'], - spectrum.get('charge array', it.cycle((None,))) if write_charges else - spectrum.get('ion array', it.cycle((None,))) if write_ions else - it.cycle((None,))): - output.write(format_str.format( - m, i, - (c if c not in nones else ''))) - except KeyError: - raise aux.PyteomicsError("'m/z array' and 'intensity array' must be present in all spectra.") - output.write('END IONS\n\n') - return output - - -chain = aux._make_chain(read, 'read') diff --git a/pyteomics/ms1.py b/pyteomics/ms1.py deleted file mode 100644 index ba7cc3be10da07f388c58db652f94b54b9ed2e36..0000000000000000000000000000000000000000 --- a/pyteomics/ms1.py +++ /dev/null @@ -1,492 +0,0 @@ -""" -ms1 - read and write MS/MS data in MS1 format -============================================= - -Summary -------- - -`MS1 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple -human-readable format for MS1 data. It allows storing MS1 peak lists and exprimental parameters. - -This module provides minimalistic infrastructure for access to data stored in MS1 files. -Two main classes are :py:class:`MS1`, which provides an iterative, text-mode parser, -and :py:class:`IndexedMS1`, which is a binary-mode parser that supports random access using scan IDs -and retention times. -The function :py:func:`read` helps dispatch between the two classes. -Also, common parameters can be read from MS1 file header with :py:func:`read_header` function. - -Classes -------- - - :py:class:`MS1` - a text-mode MS1 parser. Suitable to read spectra from a file consecutively. - Needs a file opened in text mode (or will open it if given a file name). - - :py:class:`IndexedMS1` - a binary-mode MS1 parser. When created, builds a byte offset index - for fast random access by spectrum ID. Sequential iteration is also supported. - Needs a seekable file opened in binary mode (if created from existing file object). - - :py:class:`MS1Base` - abstract class, the common ancestor of the two classes above. - Can be used for type checking. - -Functions ---------- - - :py:func:`read` - an alias for :py:class:`MS1` or :py:class:`IndexedMS1`. 
- - :py:func:`chain` - read multiple files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - - :py:func:`read_header` - get a dict with common parameters for all spectra - from the beginning of MS1 file. - -------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import auxiliary as aux -try: - import numpy as np -except ImportError: - np = None - - -class MS1Base(aux.ArrayConversionMixin): - """Abstract class representing an MS1 file. Subclasses implement different approaches to parsing.""" - _array_keys = ['m/z array', 'intensity array'] - _float_keys = ['RTime', 'RetTime'] - - def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding=None, **kwargs): - """ - Create an instance of a :py:class:`MS1Base` parser. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MS1 format. Default is - :py:const:`None`, which means read standard input. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`False`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array'. - - encoding : str, optional - File encoding. 
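- 
- .. note :: This class is abstract; instantiate :py:class:`MS1` or
- :py:class:`IndexedMS1` (or use :py:func:`read`) instead.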
- """ - super(MS1Base, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding, **kwargs) - if convert_arrays and np is None: - raise aux.PyteomicsError('numpy is required for array conversion') - self._use_header = use_header - if use_header: - self._header = self._read_header() - else: - self._header = None - self._source_name = getattr(source, 'name', str(source)) - - def reset(self): - super(MS1Base, self).reset() - self._pending_line = None - - @property - def header(self): - return self._header - - def _read_header_lines(self, lines): - header = {} - for line in lines: - if line[0] != 'H': - break - tokens = line.split('\t', 2) - if len(tokens) < 3: - tokens = line.split(None, 2) - key = tokens[1] - val = tokens[2].strip() - header[key] = val - return header - - def _make_scan(self, info): - for key in self._float_keys: - if key in info['params']: - info['params'][key] = float(info['params'][key]) - self._build_all_arrays(info) - return info - - def _handle_S(self, line, sline, params): - sline = line.strip().split(None, 3) - params['scan'] = tuple(sline[1:3]) - if len(sline) == 4: # in MS2 the S line contains the precursor m/z as a 4th column - params['precursor m/z'] = float(sline[3]) - - def _handle_I(self, line, sline, params): - params[sline[1]] = sline[2] - - def _handle_Z(self, line, sline, params): - params.setdefault('charge', []).append(float(sline[1])) - params.setdefault('neutral mass', []).append(float(sline[2])) - - def _handle_D(self, line, sline, params): - params.setdefault('analyzer', []).append(sline[1:]) - - def _handle_peak(self, line, sline, info): - try: - info['m/z array'].append(float(sline[0])) # this may cause - info['intensity array'].append(float(sline[1])) # exceptions... - except ValueError: - raise aux.PyteomicsError( - 'Error when parsing %s. Line: %s' % (self._source_name, line)) - except IndexError: - pass - - def _read_spectrum_lines(self, lines): - params = {} - info = {'params': params} - for k in self._array_keys: - info[k] = [] - if self._use_header: - params.update(self.header) - if self._pending_line: - reading_spectrum = True - self._handle_S(self._pending_line, None, params) - else: - reading_spectrum = False - line_count = 0 - for i, line in enumerate(lines): - line_count = i - sline = line.strip().split(None, 2) - if not sline: - continue - if not reading_spectrum: - if sline[0] == 'S': - reading_spectrum = True - self._handle_S(line, sline, params) - # otherwise we are not interested; do nothing, just move along - else: - if not sline: - pass - elif sline[0] == 'S': - self._pending_line = line - return self._make_scan(info) - - else: - if sline[0] == 'I': # spectrum-specific parameters! 
- self._handle_I(line, sline, params) - elif sline[0] == 'Z': # MS2-specific charge state guess - self._handle_Z(line, sline, params) - elif sline[0] == 'D': # MS2-specific analyzer annotation - self._handle_D(line, sline, params) - else: # this must be a peak list - self._handle_peak(line, sline, info) - self._pending_line = None - if line_count == 0: - return - return self._make_scan(info) - - def __getstate__(self): - state = super(MS1Base, self).__getstate__() - state['use_header'] = self._use_header - state['header'] = self._header - return state - - def __setstate__(self, state): - super(MS1Base, self).__setstate__(state) - self._use_header = state['use_header'] - self._header = state['header'] - - def __reduce_ex__(self, protocol): - return (self.__class__, - (self._source_init, False, self._convert_arrays, None, self.encoding), - self.__getstate__()) - - -class MS1(MS1Base, aux.FileReader): - """ - A class representing an MS1 file. Supports the `with` syntax and direct iteration for sequential - parsing. - - :py:class:`MS1` object behaves as an iterator, **yielding** spectra one by one. - Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array', - 'intensity array', and 'params'. 'm/z array' and - 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - and 'params' stores a :py:class:`dict` of parameters. - - Attributes - ---------- - - header : dict - The file header. - - """ - def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding=None, **kwargs): - """ - Create an :py:class:`MS1` (text-mode) reader for a given MS1 file. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MS1 format. Default is - :py:const:`None`, which means read standard input. - - .. note :: If a file object is given, it must be opened in text mode. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`False`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array'. - - encoding : str, optional - File encoding. - - Returns - ------- - - out : MS1 - The reader object. - """ - super(MS1, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding, - mode='r', parser_func=self._read, pass_file=False, args=(), kwargs={}) - - @aux._keepstate_method - def _read_header(self): - return self._read_header_lines(self._source) - - def _read(self): - def get_next_spectrum(): - return self._read_spectrum_lines(self._source) - - for spectrum in iter(get_next_spectrum, None): - yield spectrum - - -class IndexedMS1(MS1Base, aux.TaskMappingMixin, aux.TimeOrderedIndexedReaderMixin, aux.IndexedTextReader): - """ - A class representing an MS1 file. Supports the `with` syntax and direct iteration for sequential - parsing. Specific spectra can be accessed by title using the indexing syntax in constant time. 
- If created using a file object, it needs to be opened in binary mode. - - When iterated, :py:class:`IndexedMS1` object yields spectra one by one. - Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array', 'intensity array' and 'params'. - 'm/z array' and 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - and 'params' stores a :py:class:`dict` of parameters (keys and values are - :py:class:`str`, keys corresponding to MS1). - - .. warning :: - Labels for scan objects are constructed as the first number in the S line, as follows: - for a line ``S 0 1`` the label is `'0'`. If these labels are not unique - for the scans in the file, the indexed parser will not work correctly. Consider using - :py:class:`MS1` instead. - - Attributes - ---------- - - header : dict - The file header. - time : RTLocator - A property used for accessing spectra by retention time. - """ - - delimiter = '\nS' - label = r'^[\n]?S\s+(\S+)' - - def __init__(self, source=None, use_header=False, convert_arrays=True, dtype=None, encoding='utf-8', _skip_index=False, **kwargs): - """ - Create an :py:class:`IndexedMS1` (binary-mode) reader for a given MS1 file. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MS1 format. Default is - :py:const:`None`, which means read standard input. - - .. note :: If a file object is given, it must be opened in binary mode. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`True`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array'. - - encoding : str, optional - File encoding. - - block_size : int, optinal - Size of the chunk (in bytes) used to parse the file when creating the byte offset index. - - Returns - ------- - - out : IndexedMS1 - The reader object. 
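- 
- A minimal access sketch (the file name, scan label and retention time
- are illustrative):
- 
- >>> reader = IndexedMS1('data.ms1')  # doctest: +SKIP
- >>> by_scan = reader['0']  # constant-time lookup by S-line label
- >>> nearest = reader.time[25.0]  # nearest spectrum by retention time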
- """ - super(IndexedMS1, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype, encoding=encoding, - parser_func=self._read, pass_file=False, args=(), kwargs={}, _skip_index=_skip_index, **kwargs) - - def __reduce_ex__(self, protocol): - return (self.__class__, - (self._source_init, False, self._convert_arrays, None, self.encoding, True), - self.__getstate__()) - - @aux._keepstate_method - def _read_header(self): - try: - first = next(v for v in self._offset_index.values())[0] - except StopIteration: # the index is empty, no spectra in file - first = -1 - header_lines = self.read(first).decode(self.encoding).split('\n') - return self._read_header_lines(header_lines) - - def _item_from_offsets(self, offsets): - start, end = offsets - lines = self._read_lines_from_offsets(start, end) - return self._read_spectrum_lines(lines) - - def _read(self, **kwargs): - for _, offsets in self._offset_index.items(): - spectrum = self._item_from_offsets(offsets) - yield spectrum - - def get_spectrum(self, key): - return self.get_by_id(key) - - def _get_time(self, spectrum): - try: - return spectrum['params']['RTime'] - except KeyError: - raise aux.PyteomicsError('RT information not found.') - - -def read_header(source, *args, **kwargs): - """ - Read the specified MS1 file, get the parameters specified in the header - as a :py:class:`dict`. - - Parameters - ---------- - - source : str or file - File name or file object representing an file in MS1 format. - - Returns - ------- - - header : dict - """ - kwargs['use_header'] = True - return read(source, *args, **kwargs).header - - -def read(*args, **kwargs): - """Read an MS1 file and return entries iteratively. - - Read the specified MS1 file, **yield** spectra one by one. - Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array', - 'intensity array', and 'params'. 'm/z array' and - 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - and 'params' stores a :py:class:`dict` of parameters. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MS1 format. Default is - :py:const:`None`, which means read standard input. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`False`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array' and/or 'intensity array'. - - encoding : str, optional - File encoding. - - use_index : bool, optional - Determines which parsing method to use. If :py:const:`True`, an instance of - :py:class:`IndexedMS1` is created. This facilitates random access by scan titles. - If an open file is passed as `source`, it needs to be open in binary mode. - - If :py:const:`False` (default), an instance of :py:class:`MS1` is created. It reads - `source` in text mode and is suitable for iterative parsing. - - .. warning :: - Labels for scan objects are constructed as the first number in the S line, as follows: - for a line ``S 0 1`` the label is `'0'`. 
If these labels are not unique
- for the scans in the file, the indexed parser will not work correctly.
-
- block_size : int, optional
- Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
- (Accepted only for :py:class:`IndexedMS1`.)
-
- Returns
- -------
-
- out : :py:class:`MS1Base`
- An instance of :py:class:`MS1` or :py:class:`IndexedMS1`, depending on `use_index` and `source`.
- """
- if args:
- source = args[0]
- else:
- source = kwargs.get('source')
- use_index = kwargs.pop('use_index', None)
- use_index = aux._check_use_index(source, use_index, False)
- tp = IndexedMS1 if use_index else MS1
-
- return tp(*args, **kwargs)
-
-
-chain = aux._make_chain(read, 'read') diff --git a/pyteomics/ms2.py b/pyteomics/ms2.py deleted file mode 100644 index 16afbbf17eb624da3dd8b4f33a4be6a4fc7593e9..0000000000000000000000000000000000000000 --- a/pyteomics/ms2.py +++ /dev/null @@ -1,396 +0,0 @@ -"""
-ms2 - read and write MS/MS data in MS2 format
-=============================================
-
-Summary
--------
-
-`MS2 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple
-human-readable format for MS2 data. It allows storing MS2 peak lists and
-experimental parameters.
-
-This module provides minimalistic infrastructure for access to data stored in
-MS2 files.
-Two main classes are :py:class:`MS2`, which provides an iterative, text-mode parser,
-and :py:class:`IndexedMS2`, which is a binary-mode parser that supports random access using scan IDs
-and retention times.
-The function :py:func:`read` helps dispatch between the two classes.
-Also, common parameters can be read from MS2 file header with
-:py:func:`read_header` function.
-
-Classes
--------
-
-  :py:class:`MS2` - a text-mode MS2 parser. Suitable to read spectra from a file consecutively.
-  Needs a file opened in text mode (or will open it if given a file name).
-
-  :py:class:`IndexedMS2` - a binary-mode MS2 parser. When created, builds a byte offset index
-  for fast random access by spectrum ID. Sequential iteration is also supported.
-  Needs a seekable file opened in binary mode (if created from existing file object).
-
-  :py:class:`MS2Base` - abstract class, the common ancestor of the two classes above.
-  Can be used for type checking.
-
-Functions
----------
-
-  :py:func:`read` - an alias for :py:class:`MS2` or :py:class:`IndexedMS2`.
-
-  :py:func:`chain` - read multiple files at once.
-
-  :py:func:`chain.from_iterable` - read multiple files at once, using an
-  iterable of files.
-
-  :py:func:`read_header` - get a dict with common parameters for all spectra
-  from the beginning of MS2 file.
-
--------------------------------------------------------------------------------
-"""
-
-# Copyright 2012 Anton Goloborodko, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pyteomics import auxiliary as aux
-from pyteomics.ms1 import MS1, IndexedMS1, MS1Base
-
-
-class MS2Base(aux.MaskedArrayConversionMixin, MS1Base):
- """Abstract class representing an MS2 file.
diff --git a/pyteomics/ms2.py b/pyteomics/ms2.py
deleted file mode 100644
index 16afbbf17eb624da3dd8b4f33a4be6a4fc7593e9..0000000000000000000000000000000000000000
--- a/pyteomics/ms2.py
+++ /dev/null
@@ -1,396 +0,0 @@
-"""
-ms2 - read and write MS/MS data in MS2 format
-=============================================
-
-Summary
--------
-
-`MS2 <http://dx.doi.org/10.1002/rcm.1603>`_ is a simple
-human-readable format for MS2 data. It allows storing MS2 peak lists and
-experimental parameters.
-
-This module provides minimalistic infrastructure for access to data stored in
-MS2 files.
-Two main classes are :py:class:`MS2`, which provides an iterative, text-mode parser,
-and :py:class:`IndexedMS2`, which is a binary-mode parser that supports random access using scan IDs
-and retention times.
-The function :py:func:`read` helps dispatch between the two classes.
-Also, common parameters can be read from MS2 file header with
-:py:func:`read_header` function.
-
-Classes
--------
-
-  :py:class:`MS2` - a text-mode MS2 parser. Suitable to read spectra from a file consecutively.
-  Needs a file opened in text mode (or will open it if given a file name).
-
-  :py:class:`IndexedMS2` - a binary-mode MS2 parser. When created, builds a byte offset index
-  for fast random access by spectrum ID. Sequential iteration is also supported.
-  Needs a seekable file opened in binary mode (if created from existing file object).
-
-  :py:class:`MS2Base` - abstract class, the common ancestor of the two classes above.
-  Can be used for type checking.
-
-Functions
----------
-
-  :py:func:`read` - an alias for :py:class:`MS2` or :py:class:`IndexedMS2`.
-
-  :py:func:`chain` - read multiple files at once.
-
-  :py:func:`chain.from_iterable` - read multiple files at once, using an
-  iterable of files.
-
-  :py:func:`read_header` - get a dict with common parameters for all spectra
-  from the beginning of MS2 file.
-
--------------------------------------------------------------------------------
-"""
-
-# Copyright 2012 Anton Goloborodko, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pyteomics import auxiliary as aux
-from pyteomics.ms1 import MS1, IndexedMS1, MS1Base
-
-
-class MS2Base(aux.MaskedArrayConversionMixin, MS1Base):
-    """Abstract class representing an MS2 file. Subclasses implement different approaches to parsing."""
-    _array_keys = ['m/z array', 'intensity array', 'charge array', 'resolution array']
-    _float_keys = ['RTime', 'RetTime', 'IonInjectionTime', 'PrecursorInt']
-
-    def __init__(self, source=None, use_header=False, convert_arrays=2, dtype=None, read_charges=True, read_resolutions=True, encoding=None, **kwargs):
-        """
-        Create an instance of a :py:class:`MS2Base` parser.
-
-        Parameters
-        ----------
-
-        source : str or file or None, optional
-            A file object (or file name) with data in MS2 format. Default is
-            :py:const:`None`, which means read standard input.
-
-        use_header : bool, optional
-            Add the info from file header to each dict. Spectrum-specific parameters
-            override those from the header in case of conflict.
-            Default is :py:const:`False`.
-
-        convert_arrays : one of {0, 1, 2}, optional
-            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
-            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
-            If `2`, charges will be reported as a masked array (default).
-            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
-
-        read_charges : bool, optional
-            If `True` (default), fragment charges are reported. Disabling it improves performance.
-            Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
-
-        read_resolutions : bool, optional
-            If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
-            Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
-
-        dtype : type or str or dict, optional
-            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
-            Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'.
-
-        encoding : str, optional
-            File encoding.
-        """
-        super(MS2Base, self).__init__(source=source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype,
-                                      encoding=encoding, **kwargs)
-        self._read_charges = read_charges
-        self._read_resolutions = read_resolutions
-
-    def _handle_peak(self, line, sline, info):
-        super(MS2Base, self)._handle_peak(line, sline, info)
-        if self._read_charges:
-            if len(sline) > 2:
-                sline = line.strip().split()
-                try:
-                    info['charge array'].append(int(sline[2]))
-                except ValueError:
-                    raise aux.PyteomicsError("Error parsing fragment charge on line: " + line)
-            else:
-                info['charge array'].append(0)
-        if self._read_resolutions:
-            if len(sline) > 3:
-                sline = line.strip().split()
-                try:
-                    info['resolution array'].append(int(sline[3]))
-                except ValueError:
-                    raise aux.PyteomicsError("Error parsing fragment peak resolution on line: " + line)
-            else:
-                info['resolution array'].append(0)
-
-    def _make_scan(self, info):
-        if not self._read_charges:
-            del info['charge array']
-        if not self._read_resolutions:
-            del info['resolution array']
-        return super(MS2Base, self)._make_scan(info)
-
-    def __reduce_ex__(self, protocol):
-        return (self.__class__,
-                (self._source_init, False, self._convert_arrays, None, self._read_charges, self._read_resolutions, self.encoding),
-                self.__getstate__())
-
-
-class MS2(MS2Base, MS1):
-    """
-    A class representing an MS2 file. Supports the `with` syntax and direct iteration for sequential
-    parsing.
-
-    :py:class:`MS2` object behaves as an iterator, **yielding** spectra one by one.
-    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array',
-    'intensity array', and 'params'.
'm/z array' and - 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - and 'params' stores a :py:class:`dict` of parameters. - - Attributes - ---------- - - header : dict - The file header. - - """ - def __init__(self, *args, **kwargs): - """ - Create an :py:class:`MS2` (text-mode) reader for a given MS2 file. - - Parameters - ---------- - - source : str or file or None, optional - A file object (or file name) with data in MS2 format. Default is - :py:const:`None`, which means read standard input. - - .. note :: If a file object is given, it must be opened in text mode. - - use_header : bool, optional - Add the info from file header to each dict. Spectrum-specific parameters - override those from the header in case of conflict. - Default is :py:const:`False`. - - convert_arrays : one of {0, 1, 2}, optional - If `0`, m/z, intensities and (possibly) charges will be returned as regular lists. - If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s. - If `2`, charges will be reported as a masked array (default). - The default option is the slowest. `1` and `2` require :py:mod:`numpy`. - - read_charges : bool, optional - If `True` (default), fragment charges are reported. Disabling it improves performance. - Charge is expected to be the **third** number on the line, after peak *m/z* and intensity. - - read_resolutions : bool, optional - If `True` (default), fragment peak resolutions are reported. Disabling it improves performance. - Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge. - - dtype : type or str or dict, optional - dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key. - Keys should be 'm/z array', 'intensity array', 'charge array'. - - encoding : str, optional - File encoding. - - Returns - ------- - - out : MS2 - The reader object. - """ - super(MS2, self).__init__(*args, **kwargs) - - -class IndexedMS2(IndexedMS1, MS2Base): - """ - A class representing an MS2 file. Supports the `with` syntax and direct iteration for sequential - parsing. Specific spectra can be accessed by title using the indexing syntax in constant time. - If created using a file object, it needs to be opened in binary mode. - - When iterated, :py:class:`IndexedMS2` object yields spectra one by one. - Each 'spectrum' is a :py:class:`dict` with four keys: 'm/z array', - 'intensity array', 'charge array' and 'params'. 'm/z array' and - 'intensity array' store :py:class:`numpy.ndarray`'s of floats, - 'charge array' is a masked array (:py:class:`numpy.ma.MaskedArray`) of ints, - and 'params' stores a :py:class:`dict` of parameters (keys and values are - :py:class:`str`, keys corresponding to MS2). - - .. warning :: - Labels for scan objects are constructed as the first number in the S line, as follows: - for a line ``S 0 1 123.4`` the label is `'0'`. If these labels are not unique - for the scans in the file, the indexed parser will not work correctly. Consider using - :py:class:`MS2` instead. - - Attributes - ---------- - - header : dict - The file header. - time : RTLocator - A property used for accessing spectra by retention time. - """ - def __init__(self, source=None, use_header=False, convert_arrays=2, dtype=None, read_charges=True, read_resolutions=True, - encoding='utf-8', _skip_index=False, **kwargs): - """ - Create an :py:class:`IndexedMS2` (binary-mode) reader for a given MS2 file. 
-
-        Parameters
-        ----------
-
-        source : str or file or None, optional
-            A file object (or file name) with data in MS2 format. Default is
-            :py:const:`None`, which means read standard input.
-
-            .. note :: If a file object is given, it must be opened in binary mode.
-
-        use_header : bool, optional
-            Add the info from file header to each dict. Spectrum-specific parameters
-            override those from the header in case of conflict.
-            Default is :py:const:`False`.
-
-        convert_arrays : one of {0, 1, 2}, optional
-            If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
-            If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
-            If `2`, charges will be reported as a masked array (default).
-            The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
-
-        read_charges : bool, optional
-            If `True` (default), fragment charges are reported. Disabling it improves performance.
-            Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
-
-        read_resolutions : bool, optional
-            If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
-            Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
-
-        dtype : type or str or dict, optional
-            dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
-            Keys should be 'm/z array', 'intensity array', 'charge array', 'resolution array'.
-
-        encoding : str, optional
-            File encoding.
-
-        block_size : int, optional
-            Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
-
-        Returns
-        -------
-
-        out : IndexedMS2
-            The reader object.
-        """
-        super(IndexedMS2, self).__init__(source, use_header=use_header, convert_arrays=convert_arrays, dtype=dtype,
-                                         read_charges=read_charges, read_resolutions=read_resolutions, encoding=encoding, _skip_index=_skip_index, **kwargs)
-
-    def __reduce_ex__(self, protocol):
-        return (self.__class__,
-                (self._source_init, False, self._convert_arrays, None, self._read_charges, self._read_resolutions, self.encoding, True),
-                self.__getstate__())
-
-
-def read_header(source, *args, **kwargs):
-    """
-    Read the specified MS2 file, get the parameters specified in the header
-    as a :py:class:`dict`.
-
-    Parameters
-    ----------
-
-    source : str or file
-        File name or file object representing a file in MS2 format.
-
-    Returns
-    -------
-
-    header : dict
-    """
-    kwargs['use_header'] = True
-    return read(source, *args, **kwargs).header
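# --- Editor's note: an illustrative sketch of how read_header() above and
# --- read() below combine; 'spectra.ms2' and the scan label '0' are hypothetical.
from pyteomics import ms2

print(ms2.read_header('spectra.ms2'))            # dict of header parameters

# use_index=True dispatches to IndexedMS2, which supports access by scan label
with ms2.read('spectra.ms2', use_index=True) as reader:
    spectrum = reader.get_spectrum('0')          # first number of the S line
    print(spectrum['m/z array'], spectrum['charge array'])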
-
-
-def read(*args, **kwargs):
-    """Read an MS2 file and return entries iteratively.
-
-    Read the specified MS2 file, **yield** spectra one by one.
-    Each 'spectrum' is a :py:class:`dict` with three keys: 'm/z array',
-    'intensity array', and 'params'. 'm/z array' and
-    'intensity array' store :py:class:`numpy.ndarray`'s of floats,
-    and 'params' stores a :py:class:`dict` of parameters.
-
-    Parameters
-    ----------
-
-    source : str or file or None, optional
-        A file object (or file name) with data in MS2 format. Default is
-        :py:const:`None`, which means read standard input.
-
-    use_header : bool, optional
-        Add the info from file header to each dict. Spectrum-specific parameters
-        override those from the header in case of conflict.
-        Default is :py:const:`False`.
-
-    convert_arrays : one of {0, 1, 2}, optional
-        If `0`, m/z, intensities and (possibly) charges will be returned as regular lists.
-        If `1`, they will be converted to regular :py:class:`numpy.ndarray`'s.
-        If `2`, charges will be reported as a masked array (default).
-        The default option is the slowest. `1` and `2` require :py:mod:`numpy`.
-
-    read_charges : bool, optional
-        If `True` (default), fragment charges are reported. Disabling it improves performance.
-        Charge is expected to be the **third** number on the line, after peak *m/z* and intensity.
-
-    read_resolutions : bool, optional
-        If `True` (default), fragment peak resolutions are reported. Disabling it improves performance.
-        Resolution is expected to be the **fourth** number on the line, after peak *m/z*, intensity, and charge.
-
-    dtype : type or str or dict, optional
-        dtype argument to :py:mod:`numpy` array constructor, one for all arrays or one for each key.
-        Keys should be 'm/z array' and/or 'intensity array'.
-
-    encoding : str, optional
-        File encoding.
-
-    use_index : bool, optional
-        Determines which parsing method to use. If :py:const:`True`, an instance of
-        :py:class:`IndexedMS2` is created. This facilitates random access by scan titles.
-        If an open file is passed as `source`, it needs to be open in binary mode.
-
-        .. warning ::
-            Labels for scan objects are constructed as the first number in the S line, as follows:
-            for a line ``S 0 1 123.4`` the label is `'0'`. If these labels are not unique
-            for the scans in the file, the indexed parser will not work correctly.
-
-        If :py:const:`False` (default), an instance of :py:class:`MS2` is created. It reads
-        `source` in text mode and is suitable for iterative parsing.
-
-    block_size : int, optional
-        Size of the chunk (in bytes) used to parse the file when creating the byte offset index.
-        (Accepted only for :py:class:`IndexedMS2`.)
-
-    Returns
-    -------
-
-    out : :py:class:`MS2Base`
-        An instance of :py:class:`MS2` or :py:class:`IndexedMS2`, depending on `use_index` and `source`.
-    """
-    if args:
-        source = args[0]
-    else:
-        source = kwargs.get('source')
-    use_index = kwargs.pop('use_index', None)
-    use_index = aux._check_use_index(source, use_index, False)
-    tp = IndexedMS2 if use_index else MS2
-
-    return tp(*args, **kwargs)
-
-
-chain = aux._make_chain(read, 'read')
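# --- Editor's note: the indexed parser also exposes the retention-time locator
# --- documented on IndexedMS2; a small sketch with a hypothetical file name.
from pyteomics import ms2

with ms2.read('spectra.ms2', use_index=True) as reader:
    nearest = reader.time[12.5]    # spectrum closest to retention time 12.5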
diff --git a/pyteomics/mzid.py b/pyteomics/mzid.py
deleted file mode 100644
index 2df70bff0fb5dfcbffc15edf3a97bfa374c3c0f5..0000000000000000000000000000000000000000
--- a/pyteomics/mzid.py
+++ /dev/null
@@ -1,453 +0,0 @@
-"""
-mzid - mzIdentML file reader
-============================
-
-Summary
--------
-
-`mzIdentML <http://www.psidev.info/mzidentml>`_ is one of the standards
-developed by the Proteomics Informatics working group of the HUPO Proteomics
-Standard Initiative.
-
-This module provides a minimalistic way to extract information from mzIdentML
-files. You can use the old functional interface (:py:func:`read`) or the new
-object-oriented interface (:py:class:`MzIdentML`) to iterate over entries in
-``<SpectrumIdentificationResult>`` elements, i.e. groups of identifications
-for a certain spectrum. Note that each entry can contain more than one PSM
-(peptide-spectrum match). They are accessible with "SpectrumIdentificationItem"
-key.
-:py:class:`MzIdentML` objects also support direct indexing by element ID.
-
-Data access
------------
-
-  :py:class:`MzIdentML` - a class representing a single MzIdentML file.
-  Other data access functions use this class internally.
-
-  :py:func:`read` - iterate through peptide-spectrum matches in an mzIdentML
-  file. Data from a single PSM group are converted to a human-readable dict.
-  Basically creates an :py:class:`MzIdentML` object and reads it.
-
-  :py:func:`chain` - read multiple files at once.
-
-  :py:func:`chain.from_iterable` - read multiple files at once, using an
-  iterable of files.
-
-  :py:func:`DataFrame` - read MzIdentML files into a :py:class:`pandas.DataFrame`.
-
-Target-decoy approach
----------------------
-
-  :py:func:`filter` - read a chain of mzIdentML files and filter to a certain
-  FDR using TDA.
-
-  :py:func:`filter.chain` - chain a series of filters applied independently to
-  several files.
-
-  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
-  independently to an iterable of files.
-
-  :py:func:`filter_df` - filter MzIdentML files and return a :py:class:`pandas.DataFrame`.
-
-  :py:func:`is_decoy` - determine if a "SpectrumIdentificationResult" should be
-  considered decoy.
-
-  :py:func:`fdr` - estimate the false discovery rate of a set of identifications
-  using the target-decoy approach.
-
-  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
-  set using the target-decoy approach.
-
-Controlled Vocabularies
-~~~~~~~~~~~~~~~~~~~~~~~
-mzIdentML relies on controlled vocabularies to describe its contents extensibly. See
-`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_
-for more details on how they are used.
-
-Handling Time Units and Other Qualified Quantities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-mzIdentML contains information which may be described as using a variety of different time units.
-See `Unit Handling <../data.html#unit-handling>`_ for more information.
-
-
-Deprecated functions
---------------------
-
-  :py:func:`version_info` - get information about mzIdentML version and schema.
-  You can just read the corresponding attribute of the :py:class:`MzIdentML`
-  object.
-
-  :py:func:`get_by_id` - get an element by its ID and extract the data from it.
-  You can just call the corresponding method of the :py:class:`MzIdentML`
-  object.
-
-  :py:func:`iterfind` - iterate over elements in an mzIdentML file.
-  You can just call the corresponding method of the :py:class:`MzIdentML`
-  object.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml`.
-
--------------------------------------------------------------------------------
-"""
-
-# Copyright 2012 Anton Goloborodko, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-from . import auxiliary as aux
-from .
import xml, _schema_defaults - - -class MzIdentML(xml.MultiProcessingXML, xml.IndexSavingXML): - """Parser class for MzIdentML files.""" - file_format = 'mzIdentML' - _root_element = 'MzIdentML' - _default_schema = _schema_defaults._mzid_schema_defaults - _default_version = '1.1.0' - _default_iter_tag = 'SpectrumIdentificationResult' - _structures_to_flatten = {'Fragmentation'} - _indexed_tags = {'SpectrumIdentificationResult', 'SpectrumIdentificationItem', - 'SearchDatabase', 'SourceFile', 'SpectraData', 'Sample', - 'DBSequence', 'Peptide', 'PeptideEvidence', - 'Measure', 'TranslationTable', 'MassTable', 'Enzyme', - 'Organization', 'AnalysisSoftware', 'BibliographicReference', 'Person', 'Provider', - 'SpectrumIdentificationList', 'SpectrumIdentificationProtocol', 'SpectrumIdentification', - 'ProteinDetectionList', 'ProteinDetectionProtocol', 'ProteinDetection', - 'ProteinDetectionHypothesis', 'ProteinAmbiguityGroup', - } - - _element_handlers = xml.XML._element_handlers.copy() - _element_handlers.update({ - "Modification": xml.XML._promote_empty_parameter_to_name, - "SpectrumIDFormat": xml.XML._promote_empty_parameter_to_name, - "FileFormat": xml.XML._promote_empty_parameter_to_name, - "Role": xml.XML._promote_empty_parameter_to_name - }) - - def __init__(self, *args, **kwargs): - kwargs.setdefault('retrieve_refs', True) - super(MzIdentML, self).__init__(*args, **kwargs) - - def _get_info_smart(self, element, **kwargs): - """Extract the info in a smart way depending on the element type""" - name = xml._local_name(element) - kwargs = dict(kwargs) - rec = kwargs.pop("recursive", None) - - # Try not to recursively unpack the root element - # unless the user really wants to. - if name == self._root_element: - return self._get_info(element, - recursive=(rec if rec is not None else False), - **kwargs) - else: - return self._get_info(element, - recursive=(rec if rec is not None else True), - **kwargs) - - def _retrieve_refs(self, info, **kwargs): - """Retrieves and embeds the data for each attribute in `info` that - ends in _ref. Removes the id attribute from `info`""" - for k, v in dict(info).items(): - if k.endswith('_ref'): - try: - by_id = self.get_by_id(v, retrieve_refs=True) - except KeyError: - warnings.warn('Ignoring unresolved reference: ' + v) - else: - info.update(by_id) - del info[k] - info.pop('id', None) - -def read(source, **kwargs): - """Parse `source` and iterate through peptide-spectrum matches. - - .. note:: This function is provided for backward compatibility only. - It simply creates an :py:class:`MzIdentML` instance using - provided arguments and returns it. - - Parameters - ---------- - source : str or file - A path to a target mzIdentML file or the file object itself. - - recursive : bool, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - retrieve_refs : bool, optional - If :py:const:`True`, additional information from references will be - automatically added to the results. The file processing time will - increase. Default is :py:const:`True`. - - iterative : bool, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. 
- - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the mzIdentML header (default). Otherwise, use default - parameters. Disable this to avoid waiting on slow network connections or - if you don't like to get the related warnings. - - build_id_cache : bool, optional - Defines whether a cache of element IDs should be built and stored on the - created :py:class:`MzIdentML` instance. Default value is the value of - `retrieve_refs`. - - .. note:: This parameter is ignored when ``use_index`` is ``True`` (default). - - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored. - - indexed_tags : container of bytes, optional - Defines which elements need to be indexed. Empty set by default. - - Returns - ------- - out : MzIdentML - An iterator over the dicts with PSM properties. - """ - kwargs = kwargs.copy() - kwargs.setdefault('retrieve_refs', True) - kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs')) - return MzIdentML(source, **kwargs) - -def iterfind(source, path, **kwargs): - """Parse `source` and yield info on elements with specified local - name or by specified "XPath". - - .. note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`iterfind` calls on one file, you should - create an :py:class:`MzIdentML` object and use its - :py:meth:`!iterfind` method. - - Parameters - ---------- - source : str or file - File name or file-like object. - - path : str - Element name or XPath-like expression. Only local names separated - with slashes are accepted. An asterisk (`*`) means any element. - You can specify a single condition in the end, such as: - ``"/path/to/element[some_value>1.5]"`` - Note: you can do much more powerful filtering using plain Python. - The path can be absolute or "free". Please don't specify - namespaces. - - recursive : bool, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - retrieve_refs : bool, optional - If :py:const:`True`, additional information from references will be - automatically added to the results. The file processing time will - increase. Default is :py:const:`False`. - - iterative : bool, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the mzIdentML header (default). Otherwise, use default - parameters. Disable this to avoid waiting on slow network connections or - if you don't like to get the related warnings. - - build_id_cache : bool, optional - Defines whether a cache of element IDs should be built and stored on the - created :py:class:`MzIdentML` instance. Default value is the value of - `retrieve_refs`. 
- - Returns - ------- - out : iterator - """ - kwargs = kwargs.copy() - kwargs['build_id_cache'] = kwargs.get('build_id_cache', - kwargs.get('retrieve_refs')) - return MzIdentML(source, **kwargs).iterfind(path, **kwargs) - -version_info = xml._make_version_info(MzIdentML) - -def get_by_id(source, elem_id, **kwargs): - """Parse `source` and return the element with `id` attribute equal - to `elem_id`. Returns :py:const:`None` if no such element is found. - - .. note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`get_by_id` calls on one file, you should - create an :py:class:`MzIdentML` object and use its - :py:meth:`!get_by_id` method. - - Parameters - ---------- - source : str or file - A path to a target mzIdentML file of the file object itself. - - elem_id : str - The value of the `id` attribute to match. - - Returns - ------- - out : :py:class:`dict` or :py:const:`None` - """ - return MzIdentML(source, **kwargs).get_by_id(elem_id, **kwargs) - - -# chain = aux._make_chain(read, 'read') -chain = aux.ChainBase._make_chain(MzIdentML) - - -def is_decoy(psm, prefix=None): - """Given a PSM dict, return :py:const:`True` if all proteins in the dict - are marked as decoy, and :py:const:`False` otherwise. - - Parameters - ---------- - psm : dict - A dict, as yielded by :py:func:`read`. - prefix : ignored - - Returns - ------- - out : bool - """ - return all(pe['isDecoy'] for sii in psm['SpectrumIdentificationItem'] - for pe in sii['PeptideEvidenceRef']) - - -def DataFrame(*args, **kwargs): - """Read MzIdentML files into a :py:class:`pandas.DataFrame`. - - Requires :py:mod:`pandas`. - - .. warning :: Only the first 'SpectrumIdentificationItem' element is considered in every - 'SpectrumIdentificationResult'. - - Parameters - ---------- - *args - Passed to :py:func:`chain`. - **kwargs - Passed to :py:func:`chain`. - - sep : str or None, keyword only, optional - Some values related to PSMs (such as protein information) are variable-length - lists. If `sep` is a :py:class:`str`, they will be packed into single string using - this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is - :py:const:`None`. 
- - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - data = [] - - sep = kwargs.pop('sep', None) - with chain(*args, **kwargs) as f: - for item in f: - info = {} - for k, v in item.items(): - if isinstance(v, (str, int, float)): - info[k] = v - sii = item.get('SpectrumIdentificationItem', [None])[0] - if sii is not None: - info.update((k, v) for k, v in sii.items() if isinstance(v, (str, int, float))) - evref = sii.get('PeptideEvidenceRef') - if evref: - prot_descr, accessions, isd, starts, ends, lengths = [], [], [], [], [], [] - for d in evref: - prot_descr.append(d.get('protein description')) - accessions.append(d.get('accession')) - isd.append(d.get('isDecoy')) - starts.append(d.get('start')) - ends.append(d.get('end')) - lengths.append(d.get('length')) - isd = all(isd) - if sep is not None: - if all(isinstance(prd, str) for prd in prot_descr): - prot_descr = sep.join(prot_descr) - - if all(isinstance(acc, str) for acc in accessions): - accessions = sep.join(accessions) - - if all(prd is None for prd in prot_descr): - prot_descr = None - if all(acc is None for acc in accessions): - accessions = None - - info.update((k, v) for k, v in evref[0].items() if isinstance(v, (str, int, float, list))) - info['protein description'] = prot_descr - info['accession'] = accessions - info['isDecoy'] = isd - info['start'] = starts - info['end'] = ends - info['length'] = lengths - data.append(info) - df = pd.DataFrame(data) - return df - - -def filter_df(*args, **kwargs): - """Read MzIdentML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. - Positional arguments can be MzIdentML files or DataFrames. - - Requires :py:mod:`pandas`. - - .. warning :: Only the first 'SpectrumIdentificationItem' element is considered in every - 'SpectrumIdentificationResult'. - - Parameters - ---------- - key : str / iterable / callable, keyword only, optional - Default is 'mascot:expectation value'. - is_decoy : str / iterable / callable, keyword only, optional - Default is 'isDecoy'. - *args - Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. - **kwargs - Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. - - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - kwargs.setdefault('key', 'mascot:expectation value') - kwargs.setdefault('is_decoy', 'isDecoy') - if all(isinstance(arg, pd.DataFrame) for arg in args): - df = pd.concat(args) - else: - df = DataFrame(*args, **kwargs) - return aux.filter(df, **kwargs) - - -fdr = aux._make_fdr(is_decoy, None) -_key = lambda x: min( - sii['mascot:expectation value'] for sii in x['SpectrumIdentificationItem']) -qvalues = aux._make_qvalues(chain, is_decoy, None, _key) -filter = aux._make_filter(chain, is_decoy, None, _key, qvalues) -filter.chain = aux._make_chain(filter, 'filter', True) diff --git a/pyteomics/mzml.py b/pyteomics/mzml.py deleted file mode 100644 index 11a961329293b01d6318abe52cac3ac13128eb95..0000000000000000000000000000000000000000 --- a/pyteomics/mzml.py +++ /dev/null @@ -1,546 +0,0 @@ -""" -mzml - reader for mass spectrometry data in mzML format -======================================================= - -Summary -------- - -mzML is a standard rich XML-format for raw mass spectrometry data storage. -Please refer to `psidev.info <http://www.psidev.info/index.php?q=node/257>`_ -for the detailed specification of the format and structure of mzML files. - -This module provides a minimalistic way to extract information from mzML -files. 
You can use the old functional interface (:py:func:`read`) or the new -object-oriented interface (:py:class:`MzML` or :py:class:`PreIndexedMzML`) -to iterate over entries in ``<spectrum>`` elements. -:py:class:`MzML` and :py:class:`PreIndexedMzML` also support direct indexing -with spectrum IDs. - -Data access ------------ - - :py:class:`MzML` - a class representing a single mzML file. - Other data access functions use this class internally. - - :py:class:`PreIndexedMzML` - a class representing a single mzML file. - Uses byte offsets listed at the end of the file for quick access to spectrum elements. - - :py:func:`read` - iterate through spectra in mzML file. Data from a - single spectrum are converted to a human-readable dict. Spectra themselves are - stored under 'm/z array' and 'intensity array' keys. - - :py:func:`chain` - read multiple mzML files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - -Controlled Vocabularies -~~~~~~~~~~~~~~~~~~~~~~~ -mzML relies on controlled vocabularies to describe its contents extensibly. See -`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_ -for more details on how they are used. - -Handling Time Units and Other Qualified Quantities -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -mzML contains information which may be described as using a variety of different time units. -See `Unit Handling <../data.html#unit-handling>`_ for more information. - -Deprecated functions --------------------- - - :py:func:`version_info` - get version information about the mzML file. - You can just read the corresponding attribute of the :py:class:`MzML` object. - - :py:func:`iterfind` - iterate over elements in an mzML file. - You can just call the corresponding method of the :py:class:`MzML` object. - -Dependencies ------------- - -This module requires :py:mod:`lxml` and :py:mod:`numpy`. - -------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -import warnings -import numpy as np -from . 
import xml, auxiliary as aux, _schema_defaults -from .xml import etree - -NON_STANDARD_DATA_ARRAY = 'non-standard data array' - -STANDARD_ARRAYS = set([ - 'm/z array', - 'intensity array', - 'charge array', - 'signal to noise array', - 'time array', - 'wavelength array', - 'flow rate array', - 'pressure array', - 'temperature array', - 'mean charge array', - 'resolution array', - 'baseline array', - 'noise array', - 'sampled noise m/z array', - 'sampled noise intensity array', - 'sampled noise baseline array', - 'ion mobility array', - 'deconvoluted ion mobility drift time array', - 'deconvoluted inverse reduced ion mobility array', - 'deconvoluted ion mobility array', - 'raw ion mobility drift time array', - 'raw inverse reduced ion mobility array', - 'raw ion mobility array', - 'mean inverse reduced ion mobility array', - 'mean ion mobility array', - 'mean ion mobility drift time array', - 'mass array', - 'scanning quadrupole position lower bound m/z array', - 'scanning quadrupole position upper bound m/z array', -]) - - -class MzML(aux.BinaryArrayConversionMixin, aux.TimeOrderedIndexedReaderMixin, xml.MultiProcessingXML, xml.IndexSavingXML): - """Parser class for mzML files.""" - file_format = 'mzML' - _root_element = 'mzML' - _default_schema = _schema_defaults._mzml_schema_defaults - _default_version = '1.1.0' - _default_iter_tag = 'spectrum' - _structures_to_flatten = {'binaryDataArrayList', 'referenceableParamGroupRef'} - _indexed_tags = {'spectrum', 'chromatogram'} - - def __init__(self, *args, **kwargs): - self.decode_binary = kwargs.pop('decode_binary', True) - self._referenceable_param_groups = {} - super(MzML, self).__init__(*args, **kwargs) - - def __getstate__(self): - state = super(MzML, self).__getstate__() - state['decode_binary'] = self.decode_binary - return state - - def __setstate__(self, state): - super(MzML, self).__setstate__(state) - self.decode_binary = state['decode_binary'] - - def _handle_referenceable_param_group(self, param_group_ref, **kwargs): - ref_name = param_group_ref.attrib['ref'] - if ref_name not in self._referenceable_param_groups: - params = self._referenceable_param_groups[ref_name] = self._retrieve_param_group(ref_name) - return params - return self._referenceable_param_groups[ref_name] - - @xml._keepstate - def _retrieve_param_group(self, ref_name): - group = self.get_by_id(ref_name) - group.pop("id", None) - return [xml._XMLParam(k, v, None) for k, v in group.items()] - - def _detect_array_name(self, info): - """Determine what the appropriate name for this - array is by inspecting the available param-based - keys. - - Parameters - ---------- - info : dict - The collapsed binary tag plus - associated *Param data - - Returns - ------- - out : str - The name for this array entry - """ - # If this is a non-standard array, we hope the userParams - # will conform to the same array suffix pattern. - is_non_standard = False - - # Accumulate possible name candidates - candidates = [] - for k in info: - if k.endswith(' array') and not info[k]: - if NON_STANDARD_DATA_ARRAY == k: - is_non_standard = True - else: - candidates.append(k) - # A non-standard data array term key might have the name for the data array - # as the value. 
- nonstandard_name = info.get(NON_STANDARD_DATA_ARRAY) - if nonstandard_name: - return nonstandard_name - if isinstance(info.get('name'), list): - for val in info['name']: - if val.endswith(' array'): - if NON_STANDARD_DATA_ARRAY == val: - is_non_standard = True - else: - candidates.append(val) - # Name candidate resolution - n_candidates = len(candidates) - # Easy case, exactly one name given - if n_candidates == 1: - return candidates[0] - # We are missing information, but at least - # if we know the array is non-standard we - # can report it as such. Otherwise fall back - # to "binary". This fallback signals special - # behavior elsewhere. - if n_candidates == 0: - invalid = {"encodedLength", "dataProcessingRef", "arrayLength", - "binary"} - for k in info: - if k in invalid: - continue - candidates.append(k) - if len(candidates) == 0: - if is_non_standard: - return NON_STANDARD_DATA_ARRAY - warnings.warn("No options for non-standard data array") - return "binary" - else: - warnings.warn( - "Multiple options for naming binary array after no valid name found: %r" % candidates) - return max(candidates, key=len) - # Multiple choices means we need to make a decision which could - # mask data from the user. This should never happen but stay safe. - # There are multiple options to choose from. There is no way to - # make a good choice here. We first prefer the standardized - # arrays before falling back to just guessing. - else: - candidates = set(candidates) - # Maybe we just have a repeated term? - if len(candidates) == 1: - return next(iter(candidates)) - warnings.warn( - "Multiple options for naming binary array: %r" % candidates) - standard_options = candidates & STANDARD_ARRAYS - if standard_options: - return max(standard_options, key=len) - return max(candidates, key=len) - - def _determine_array_dtype(self, info): - dtype = None - types = {'32-bit float': np.float32, '64-bit float': np.float64, - '32-bit integer': np.int32, '64-bit integer': np.int64, - 'null-terminated ASCII string': np.uint8} - for t, code in types.items(): - if t in info: - dtype = code - del info[t] - break - # sometimes it's under 'name' - else: - if 'name' in info: - for t, code in types.items(): - if t in info['name']: - dtype = code - info['name'].remove(t) - break - return dtype - - def _determine_compression(self, info): - known_compression_types = set(self.compression_type_map) - found_compression_types = known_compression_types & set(info) - if found_compression_types: - found_compression_types = tuple(found_compression_types) - if len(found_compression_types) == 1: - del info[found_compression_types[0]] - return found_compression_types[0] - warnings.warn("Multiple options for binary array compression: %r" % ( - found_compression_types,)) - return found_compression_types[0] - elif "name" in info: - found_compression_types = known_compression_types & set(info['name']) - if found_compression_types: - found_compression_types = tuple(found_compression_types) - if len(found_compression_types) == 1: - del info['name'][found_compression_types[0]] - return found_compression_types[0] - else: - warnings.warn("Multiple options for binary array compression: %r" % ( - found_compression_types,)) - return found_compression_types[0] - else: - return 'no compression' - - def _handle_binary(self, info, **kwargs): - """Special handling when processing and flattening - a <binary> tag and its sibling *Param tags. 
- - Parameters - ---------- - info : dict - Unprocessed binary array data and metadata - - Returns - ------- - out : dict - The processed and flattened data array and metadata - """ - dtype = self._determine_array_dtype(info) - compressed = self._determine_compression(info) - name = self._detect_array_name(info) - binary = info.pop('binary') - if not self.decode_binary: - info[name] = self._make_record(binary, compressed, dtype, name) - return info - - if binary: - array = self.decode_data_array(binary, compressed, dtype) - else: - array = np.array([], dtype=dtype) - - if name == 'binary': - info[name] = self._convert_array(None, array) - else: - info = {name: self._convert_array(name, array)} - return info - - def _get_info_smart(self, element, **kw): - name = xml._local_name(element) - kwargs = dict(kw) - rec = kwargs.pop('recursive', None) - if name in {'indexedmzML', 'mzML'}: - info = self._get_info(element, - recursive=(rec if rec is not None else False), - **kwargs) - else: - info = self._get_info(element, - recursive=(rec if rec is not None else True), - **kwargs) - if 'binary' in info and isinstance(info, dict): - info = self._handle_binary(info, **kwargs) - - if 'binaryDataArray' in info and isinstance(info, dict): - for array in info.pop('binaryDataArray'): - info.update(array) - intkeys = {'ms level'} - for k in intkeys: - if k in info: - try: - info[k] = int(info[k]) - except (ValueError, TypeError): - pass - return info - - def _retrieve_refs(self, info, **kwargs): - """Retrieves and embeds the data for each attribute in `info` that - ends in _ref. Removes the id attribute from `info`""" - for k, v in dict(info).items(): - if k == 'ref': - by_id = self.get_by_id(v, retrieve_refs=True) - if by_id is None: - warnings.warn('Ignoring unresolved reference: ' + v) - else: - info.update(by_id) - del info[k] - info.pop('id', None) - - @staticmethod - def _get_time(scan): - return scan['scanList']['scan'][0]['scan start time'] - - -def read(source, read_schema=False, iterative=True, use_index=False, dtype=None, huge_tree=False, decode_binary=True): - """Parse `source` and iterate through spectra. - - Parameters - ---------- - source : str or file - A path to a target mzML file or the file object itself. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the mzML header. Otherwise, use default parameters. - Not recommended without Internet connection or - if you don't like to get the related warnings. - - iterative : bool, optional - Defines whether iterative parsing should be used. It helps reduce - memory usage at almost the same parsing speed. Default is - :py:const:`True`. - - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - spectrum elements. Default is :py:const:`False`. - - dtype : type or dict, optional - dtype to convert arrays to, one for both m/z and intensity arrays or one for each key. - If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'. - - decode_binary : bool, optional - Defines whether binary data should be decoded and included in the output - (under "m/z array", "intensity array", etc.). - Default is :py:const:`True`. - - huge_tree : bool, optional - This option is passed to the `lxml` parser and defines whether - security checks for XML tree depth and node size should be disabled. - Default is :py:const:`False`. - Enable this option for trusted files to avoid XMLSyntaxError exceptions - (e.g. 
`XMLSyntaxError: xmlSAX2Characters: huge text node`).
-
-    Returns
-    -------
-    out : iterator
-       An iterator over the dicts with spectrum properties.
-    """
-
-    return MzML(source, read_schema=read_schema, iterative=iterative,
-                use_index=use_index, dtype=dtype, huge_tree=huge_tree,
-                decode_binary=decode_binary)
-
-def iterfind(source, path, **kwargs):
-    """Parse `source` and yield info on elements with specified local
-    name or by specified "XPath".
-
-    .. note:: This function is provided for backward compatibility only.
-        If you do multiple :py:func:`iterfind` calls on one file, you should
-        create an :py:class:`MzML` object and use its
-        :py:meth:`!iterfind` method.
-
-    Parameters
-    ----------
-    source : str or file
-        File name or file-like object.
-
-    path : str
-        Element name or XPath-like expression. Only local names separated
-        with slashes are accepted. An asterisk (`*`) means any element.
-        You can specify a single condition in the end, such as:
-        ``"/path/to/element[some_value>1.5]"``
-        Note: you can do much more powerful filtering using plain Python.
-        The path can be absolute or "free". Please don't specify
-        namespaces.
-
-    recursive : bool, optional
-        If :py:const:`False`, subelements will not be processed when
-        extracting info from elements. Default is :py:const:`True`.
-
-    iterative : bool, optional
-        Specifies whether iterative XML parsing should be used. Iterative
-        parsing significantly reduces memory usage and may be just a little
-        slower. When `retrieve_refs` is :py:const:`True`, however, it is
-        highly recommended to disable iterative parsing if possible.
-        Default value is :py:const:`True`.
-
-    read_schema : bool, optional
-        If :py:const:`True`, attempt to extract information from the XML schema
-        mentioned in the mzML header. Otherwise, use default
-        parameters. Not recommended without Internet connection or
-        if you don't like to get the related warnings.
-
-    decode_binary : bool, optional
-        Defines whether binary data should be decoded and included in the output
-        (under "m/z array", "intensity array", etc.).
-        Default is :py:const:`True`.
-
-    Returns
-    -------
-    out : iterator
-    """
-    return MzML(source, **kwargs).iterfind(path, **kwargs)
-
-version_info = xml._make_version_info(MzML)
-
-# chain = aux._make_chain(read, 'read')
-
-chain = aux.ChainBase._make_chain(MzML)
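# --- Editor's note: a brief usage sketch for the MzML reader above; the file
# --- name and spectrum ID are hypothetical.
from pyteomics import mzml

with mzml.read('run.mzML', use_index=True) as reader:
    for spectrum in reader:                      # sequential parsing
        print(spectrum.get('ms level'), spectrum['m/z array'].size)
        break
    # random access by spectrum ID, enabled by use_index=True
    spec = reader.get_by_id('controllerType=0 controllerNumber=1 scan=1')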
-
-
-class PreIndexedMzML(MzML):
-    """Parser class for mzML files, subclass of :py:class:`MzML`.
-    Uses byte offsets listed at the end of the file for quick access to spectrum elements.
-    """
-    def _build_index(self):
-        """
-        Build up a `dict` of `dict` of offsets for elements. Calls :meth:`_find_index_list`
-        and assigns the return value to :attr:`_offset_index`
-        """
-        index = self._find_index_list()
-        if index:
-            self._offset_index = index
-        else:
-            warnings.warn('Could not extract the embedded offset index. Falling back to default indexing procedure.')
-            super(PreIndexedMzML, self)._build_index()
-
-    @xml._keepstate
-    def _iterparse_index_list(self, offset):
-        index_map = xml.HierarchicalOffsetIndex()
-        index = index_map._inner_type()
-        self._source.seek(offset)
-        try:
-            for event, elem in etree.iterparse(self._source, events=('start', 'end'), remove_comments=True):
-                if event == 'start':
-                    if elem.tag == 'index':
-                        index = {}
-                        index_map[elem.attrib['name']] = index
-                else:
-                    if elem.tag == 'offset':
-                        index[elem.attrib['idRef']] = int(elem.text)
-                    elem.clear()
-        except etree.XMLSyntaxError:
-            # The iteration has reached the end of the indexList tag and the parser
-            # encounters the later elements in the document.
-            pass
-        return index_map
-
-    @xml._keepstate
-    def _find_index_list_offset(self):
-        """
-        Search relative to the bottom of the file upwards to find the offsets
-        of the index lists.
-
-        Returns
-        -------
-        list of int
-            A list of byte offsets for `<indexList>` elements
-        """
-        self._source.seek(-1024, 2)
-        text = self._source.read(1024)
-        index_offsets = list(map(int, re.findall(br'<indexListOffset>(\d+)</indexListOffset>', text)))
-        return index_offsets
-
-    @xml._keepstate
-    def _find_index_list(self):
-        """
-        Extract lists of index offsets from the end of the file.
-
-        Returns
-        -------
-        dict of str -> dict of str -> int
-        """
-        offsets = self._find_index_list_offset()
-        index_list = xml.HierarchicalOffsetIndex()
-        for offset in offsets:
-            # Sometimes the offset is at the very beginning of the file,
-            # due to a bug in an older version of ProteoWizard. If this crude
-            # check fails, don't bother searching the entire file, and fall back
-            # on the base class's mechanisms.
-            #
-            # Alternative behavior here would be to start searching for the start
-            # of the index from the bottom of the file, but this version of Proteowizard
-            # also emits invalid offsets which do not improve retrieval time.
-            if offset < 1024:
-                continue
-            index_list = self._iterparse_index_list(offset)
-        return index_list
diff --git a/pyteomics/mzmlb.py b/pyteomics/mzmlb.py
deleted file mode 100644
index 5cda3cacf5aaa1bfee7029981db91fd4849c923d..0000000000000000000000000000000000000000
--- a/pyteomics/mzmlb.py
+++ /dev/null
@@ -1,618 +0,0 @@
-# -*- coding: utf8 -*-
-"""
-mzmlb - reader for mass spectrometry data in mzMLb format
-=========================================================
-
-.. warning::
-    This is a **Provisional Implementation**. The mzMLb format has been published
-    but is not yet broadly available.
-
-Summary
--------
-mzMLb is an HDF5 container format wrapping around the standard rich XML-format
-for raw mass spectrometry data storage. Please refer to [1]_ for more information
-about mzMLb and its features. Please refer to
-`psidev.info <https://www.psidev.info/mzML>`_ for the detailed
-specification of the format and structure of mzML files.
-
-This module provides a minimalistic way to extract information from mzMLb
-files. You can use the old functional interface (:py:func:`read`) or the new
-object-oriented interface (:py:class:`MzMLb`) to iterate over entries in ``<spectrum>`` elements.
-:py:class:`MzMLb` also supports direct indexing with spectrum IDs or indices.
-
-Data access
------------
-
-  :py:class:`MzMLb` - a class representing a single mzMLb file.
-  Other data access functions use this class internally.
-
-  :py:func:`read` - iterate through spectra in mzMLb file. Data from a
-  single spectrum are converted to a human-readable dict.
Spectra themselves are - stored under 'm/z array' and 'intensity array' keys. - - :py:func:`chain` - read multiple mzMLb files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - -Controlled Vocabularies -~~~~~~~~~~~~~~~~~~~~~~~ -mzMLb relies on controlled vocabularies to describe its contents extensibly. See -`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_ -for more details on how they are used. - -Handling Time Units and Other Qualified Quantities -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -mzMLb contains information which may be described as using a variety of different time units. -See `Unit Handling <../data.html#unit-handling>`_ for more information. - -References ----------- -.. [1] Bhamber, R. S., Jankevics, A., Deutsch, E. W., Jones, A. R., & Dowsey, A. W. (2021). - MzMLb: A Future-Proof Raw Mass Spectrometry Data Format Based on Standards-Compliant - mzML and Optimized for Speed and Storage Requirements. Journal of Proteome Research, - 20(1), 172–183. https://doi.org/10.1021/acs.jproteome.0c00192 -""" - -import io -import warnings -import logging -from collections import namedtuple - -import h5py -try: - logging.getLogger("hdf5plugin").addHandler(logging.NullHandler()) - import hdf5plugin -except ImportError: - hdf5plugin = None - -import numpy as np - -from pyteomics.mzml import MzML as _MzML -from pyteomics.auxiliary.file_helpers import HierarchicalOffsetIndex, TaskMappingMixin, TimeOrderedIndexedReaderMixin, FileReader -from pyteomics import auxiliary as aux, xml - - -def delta_predict(data, copy=True): - '''Reverse the lossy transformation of the delta compression - helper. - - Parameters - ---------- - data : :class:`numpy.ndarray` - The data to transform - copy : bool - Whether to make a copy of the data array or transform it in-place. - - Returns - ------- - :class:`numpy.ndarray` - The transformed data array - ''' - if copy: - out = data.copy() - else: - out = data - for i in range(2, len(data)): - out[i] = out[i] + out[i - 1] - out[0] - return out - - -def linear_predict(data, copy=True): - '''Reverse the lossy transformation of the linear interpolation compression - helper. - - Parameters - ---------- - data : :class:`numpy.ndarray` - The data to transform - copy : bool - Whether to make a copy of the data array or transform it in-place. - - Returns - ------- - :class:`numpy.ndarray` - The transformed data array - ''' - if copy: - out = data.copy() - else: - out = data - for i in range(2, len(data)): - out[i] = out[i] + 2 * out[i - 1] - out[i - 2] - out[1] - return out - - -class HDF5ByteBuffer(io.RawIOBase): - '''Helper class that looks file-like so that we can pass a HDF5 byte dataset to - an arbitrary XML parser. - - Implements :class:`~io.RawIOBase` for reading. 
- ''' - def __init__(self, buffer, offset=None): - if offset is None: - offset = 0 - self.buffer = buffer - self.offset = offset - self.size = self.buffer.size - self.mode = 'rb' - - def readable(self): - return True - - def seekable(self): - return True - - def isatty(self): - return False - - def seek(self, offset, whence=0): - if whence == io.SEEK_SET: - self.offset = offset - elif whence == io.SEEK_CUR: - self.offset += offset - elif whence == io.SEEK_END: - self.offset = self.size - offset - else: - raise ValueError("Bad whence %r" % whence) - return self.offset - - def tell(self): - return self.offset - - def close(self): - return - - @property - def closed(self): - return False - - def readinto(self, b): - n = len(b) - temp = self._read(n) - m = len(temp) - b[:m] = temp[:] - return m - - def readall(self): - return bytes(self._read(-1)) - - def read(self, n=-1): - return bytes(self._read(n)) - - def write(self, b): - raise ValueError("Read-only stream") - - def _read(self, n=-1): - if n == -1: - n = self.size + 1 - dat = bytearray(np.array(self.buffer[self.offset:self.offset + n])) - self.offset += n - return dat - - -class external_array_slice(namedtuple('external_array_slice', - ['array_name', 'offset', 'length', 'source', 'transform', 'key', 'dtype'])): - def decode(self): - """Decode :attr:`data` into a numerical array - - Returns - ------- - np.ndarray - """ - return self.source._decode_record(self) - - -class ExternalDataMzML(_MzML): - '''An MzML parser that reads data arrays from an external provider. - - This is an implementation detail of :class:`MzMLb`. - ''' - def __init__(self, *args, **kwargs): - self._external_data_registry = kwargs.pop("external_data_registry", None) - super(ExternalDataMzML, self).__init__(*args, **kwargs) - - def _make_record(self, array_name, offset, length, transform, name, dtype): - return external_array_slice(array_name, offset, length, self, transform, name, dtype) - - def _transform_array(self, array, transform): - if transform is None: - return array - elif "linear prediction" == transform: - return linear_predict(array, copy=False) - elif "delta prediction" == transform: - return delta_predict(array, copy=False) - else: - raise ValueError("Transformation not recognized") - - def _retrieve_external_array(self, array_name, length, offset): - array = self._external_data_registry.get(array_name, length, offset) - return array - - def decode_data_array(self, array_name, offset, length, transform=None, dtype=np.float64): - array = self._retrieve_external_array(array_name, length, offset) - array = self._transform_array(array, transform) - return array - - def _decode_record(self, record): - array = self.decode_data_array( - record.array_name, record.offset, record.length, record.transform, record.dtype) - return self._finalize_record_conversion(array, record) - - def _handle_binary(self, info, **kwargs): - if not self.decode_binary: - self.decode_binary = True - # Binary decoding works totally differently here, not supporting the previous signatures - # that the parent method will use. Pretend we are decoding because it is a no-op in the - # parent method. 
- result = super(ExternalDataMzML, self)._handle_binary(info, **kwargs) - self.decode_binary = False - else: - result = super(ExternalDataMzML, self)._handle_binary(info, **kwargs) - try: - array_name = info['external HDF5 dataset'] - except KeyError: - array_name = info['external dataset'] - offset = int(info['external offset']) - length = int(info['external array length']) - - transform = None - # The zlib compression in these two terms happens automatically during HDF5 encoding and - # the reader needn't even know about it. Need an example of how Numpress will be signaled. - if "linear prediction" in info or "truncation, linear prediction and zlib compression" in info: - transform = 'linear prediction' - elif "delta prediction" in info or "truncation, delta prediction and zlib compression" in info: - transform = 'delta prediction' - - if not self.decode_binary: - name = self._detect_array_name(info) - result[name] = self._make_record( - array_name, offset, length, transform, name, - self._external_data_registry.dtype_of(array_name)) - return result - - array = self._retrieve_external_array(array_name, length, offset) - - if len(result) == 1: - name = next(iter(result)) - else: - name = self._detect_array_name(info) - result[name] = self._convert_array(name, array) - return result - - def reset(self): - super(ExternalDataMzML, self).reset() - self._external_data_registry.clear() - - -class chunk_interval_cache_record(namedtuple("chunk_interval_cache_record", ("start", "end", "array"))): - def contains(self, start, end): - if self.start <= start: - if end < self.end: - return True - return False - - def get(self, start, end): - return self.array[start - self.start:end - self.start] - - def __eq__(self, other): - return self.start == other.start and self.end == other.end - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash(self.start) - - -class ExternalArrayRegistry(object): - '''Read chunks out of a single long array - - This is an implementation detail of :class:`MzMLb` - - Attributes - ---------- - registry : Mapping - A mapping from array name to the out-of-core array object. - chunk_size : int - The number of entries to chunk together and keep in memory. - chunk_cache : dict - A mapping from array name to cached array blocks. 
- ''' - def __init__(self, registry, chunk_size=None): - if chunk_size is None: - chunk_size = 2 ** 20 - else: - chunk_size = int(chunk_size) - self.registry = registry - self.chunk_cache = {} - self.chunk_size = chunk_size - - def clear(self): - self.chunk_cache.clear() - - def _get_raw(self, array_name, start, end): - return self.registry[array_name][start:end] - - def _make_cache_record(self, array_name, start, end): - return chunk_interval_cache_record(start, end, self._get_raw(array_name, start, end)) - - def get(self, array_name, length, offset=0): - start = offset - end = start + length - try: - cache_record = self.chunk_cache[array_name] - if cache_record.contains(start, end): - return cache_record.get(start, end) - else: - cache_record = self._make_cache_record( - array_name, start, start + max(length, self.chunk_size)) - self.chunk_cache[array_name] = cache_record - return cache_record.get(start, end) - except KeyError: - cache_record = self._make_cache_record( - array_name, start, start + max(length, self.chunk_size)) - self.chunk_cache[array_name] = cache_record - return cache_record.get(start, end) - return self.registry[array_name][offset:offset + length] - - def dtype_of(self, array_name): - return self.registry[array_name].dtype - - def __call__(self, array_name, length, offset=0): - return self.get(array_name, length, offset) - - -class MzMLb(TimeOrderedIndexedReaderMixin, TaskMappingMixin): - '''A parser for mzMLb [1]_. - - Provides an identical interface to :class:`~pyteomics.mzml.MzML`. - - Attributes - ---------- - path : str, Path-like, or file-like object - The mzMLb file path or a file-like object providing it. - handle : :class:`h5py.File` - The raw HDF5 file container. - mzml_parser : :class:`~.ExternalDataMzML` - The mzML parser for the XML stream inside the HDF5 file with - special behavior for retrieving the out-of-band data arrays - from their respective storage locations. - schema_version : str - The mzMLb HDF5 schema version, distinct from the mzML schema inside it. - - - References - ---------- - [1] Bhamber, R. S., Jankevics, A., Deutsch, E. W., Jones, A. R., & Dowsey, A. W. (2021). - MzMLb: A Future-Proof Raw Mass Spectrometry Data Format Based on Standards-Compliant - mzML and Optimized for Speed and Storage Requirements. Journal of Proteome Research, - 20(1), 172–183. 
        https://doi.org/10.1021/acs.jproteome.0c00192
-    '''
-    _default_iter_tag = ExternalDataMzML._default_iter_tag
-
-    file_format = "mzMLb"
-
-    def __init__(self, path, hdfargs=None, mzmlargs=None, allow_updates=False,
-                 use_index=True, **kwargs):
-        if hdfargs is None:
-            hdfargs = {}
-        if mzmlargs is None:
-            mzmlargs = {}
-        mzmlargs.update(kwargs)
-
-        self.path = path
-        self._hdfargs = hdfargs
-        self._mzmlargs = mzmlargs
-        self._allow_updates = allow_updates
-        self.handle = h5py.File(self.path, 'r+' if self._allow_updates else 'r', **hdfargs)
-        self.schema_version = self.handle['mzML'].attrs.get('version')
-        self._check_compressor()
-
-        self._xml_buffer = io.BufferedReader(HDF5ByteBuffer(self.handle['mzML']))
-        self._array_registry = ExternalArrayRegistry(self.handle)
-        self._make_mzml_parser(mzmlargs)
-
-        super(MzMLb, self).__init__(**kwargs)
-
-    def _check_compressor(self):
-        for key in self.handle.keys():
-            if "spectrum_MS_" in key or "chromatogram_MS_" in key:
-                data = self.handle[key]
-                try:
-                    filts = data._filters
-                except AttributeError:
-                    continue
-                if '32001' in filts:
-                    if hdf5plugin is None:
-                        warnings.warn(
-                            ("Blosc meta-compressor detected, but hdf5plugin is "
-                             "not installed, may not be able to access %r") % (key, ))
-
-    def _make_mzml_parser(self, kwargs):
-        self._mzml_parser = ExternalDataMzML(
-            self._xml_buffer, external_data_registry=self._array_registry,
-            use_index=False, **kwargs)
-        self._mzml_parser._offset_index = self._build_index()
-        self._mzml_parser._use_index = True
-
-    @property
-    def name(self):
-        if hasattr(self.path, 'name'):
-            return self.path.name
-        return self.path
-
-    def _build_index(self):
-        index = HierarchicalOffsetIndex()
-        for label in [u'spectrum', u'chromatogram']:
-            sub = index[label]
-            ids = bytearray(np.array(self.handle['mzML_{}Index_idRef'.format(label)])).split(b"\x00")
-            offsets = self.handle["mzML_{}Index".format(label)][:-1]
-            for i, o in enumerate(offsets):
-                sub[ids[i].decode('utf8')] = o
-        return index
-
-    def get_by_id(self, id):
-        """Parse the file and return the element with `id` attribute equal
-        to `id`. Returns :py:const:`None` if no such element is found.
-
-        Parameters
-        ----------
-        id : str
-            The value of the `id` attribute to match.
- - Returns - ------- - out : :py:class:`dict` or :py:const:`None` - """ - return self._mzml_parser.get_by_id(id) - - def get_by_ids(self, ids): - return self._mzml_parser.get_by_ids(ids) - - def get_by_index(self, i): - return self._mzml_parser.get_by_index(i) - - def get_by_indexes(self, indexes): - return self._mzml_parser.get_by_indexes(indexes) - - def get_by_index_slice(self, s): - return self._mzml_parser.get_by_index_slice(s) - - def get_by_key_slice(self, s): - return self._mzml_parser.get_by_key_slice(s) - - def __contains__(self, key): - return key in self.index - - def __getitem__(self, i): - return self._mzml_parser[i] - - def __len__(self): - return len(self._mzml_parser) - - def __iter__(self): - return iter(self._mzml_parser) - - def __next__(self): - return next(self._mzml_parser) - - def next(self): - return self.__next__() - - def __reduce__(self): - return self.__class__, (self.path, self._hdfargs, self._mzmlargs, self._allow_updates) - - def close(self): - self.handle.close() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def iterfind(self, *args, **kwargs): - iterf = self._mzml_parser.iterfind(*args, **kwargs) - iterf.parser = self - return iterf - - def _iterfind_impl(self, path, *args, **kwargs): - return self._mzml_parser._iterfind_impl(path, *args, **kwargs) - - @property - def index(self): - return self._mzml_parser.index - - @property - def _offset_index(self): - return self._mzml_parser._offset_index - - @property - def default_index(self): - return self._mzml_parser.default_index - - def _get_time(self, scan): - return self._mzml_parser._get_time(scan) - - @property - def mzml_parser(self): - return self._mzml_parser - - def _task_map_iterator(self): - """Returns the :class:`Iteratable` to use when dealing work items onto the input IPC - queue used by :meth:`map` - - Returns - ------- - :class:`Iteratable` - """ - return iter(self.index[self._default_iter_tag]) - - def read(self, n=-1): - return self._mzml_parser.read(n) - - def reset(self): - self._mzml_parser.reset() - - def seek(self, offset, whence=0): - self._mzml_parser.seek(offset, whence) - - def tell(self): - return self._mzml_parser.tell() - - def get_dataset(self, name): - '''Get an HDF5 dataset by its name or path relative to - the root node. - - .. warning:: - Because this accesses HDF5 data directly, it may be possible to mutate - the underlying file if :attr:`allow_updates` is :const:`True`. - - Parameters - ---------- - name : :class:`str` - The dataset name or path. - - Returns - ------- - :class:`h5py.Dataset` or :class:`h5py.Group` - - Raises - ------ - KeyError : - The name is not found. - ''' - return self.handle[name] - - -def read(source, dtype=None): - """Parse `source` and iterate through spectra. - - Parameters - ---------- - source : str or file - A path to a target mzMLb file or the file object itself. - dtype : type or dict, optional - dtype to convert arrays to, one for both m/z and intensity arrays or one for each key. - If :py:class:`dict`, keys should be 'm/z array' and 'intensity array'. - - Returns - ------- - out : iterator - An iterator over the dicts with spectrum properties. - """ - reader = MzMLb(source, dtype=dtype) - return reader - - -# The MzMLb class is detatched from the normal :class:`FileReader`-based inheritance tree, -# this grafts it back on for :func:`isinstance` and :func:`issubclass` tests at least. 
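For orientation, a minimal usage sketch of the reader defined above; the file
name is a placeholder, and the spectrum keys follow the usual mzML conventions:

.. code-block:: python

    from pyteomics import mzmlb

    # Iterate spectra; binary arrays are resolved through the
    # ExternalArrayRegistry chunk cache instead of inline base64 text.
    with mzmlb.MzMLb('example.mzMLb') as reader:
        for spectrum in reader:
            print(spectrum['id'], len(spectrum['m/z array']))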
-FileReader.register(MzMLb) - - -version_info = xml._make_version_info(MzMLb) - -# chain = aux._make_chain(read, 'read') - -chain = aux.ChainBase._make_chain(MzMLb) diff --git a/pyteomics/mztab.py b/pyteomics/mztab.py deleted file mode 100644 index 148d4cfc8be1cd49b988fd6343863ee557debf26..0000000000000000000000000000000000000000 --- a/pyteomics/mztab.py +++ /dev/null @@ -1,783 +0,0 @@ -""" -mztab - mzTab file reader -========================= - -Summary -------- - -`mzTab <https://github.com/HUPO-PSI/mzTab>`_ is one of the standards -developed by the Proteomics Informatics working group of the HUPO Proteomics -Standard Initiative. - -This module provides a way to read mzTab files into a collection of -:py:class:`pandas.DataFrame` instances in memory, along with a mapping -of the file-level metadata. MzTab specifications 1.0 and 2.0 are supported. - -Data access ------------ - - :py:class:`MzTab` - a class representing a single mzTab file. - -Helpers -------- - - :py:class:`Group` - a collection of metadata relating to one entity. - - -Internals ---------- - - :py:class:`_MzTabTable` - a single table in an mzTab file. - - -Property Management -~~~~~~~~~~~~~~~~~~~ - -:mod:`mztab` uses metaprogramming to generate its metadata accessors, generated by -these classes working in concert. - - :py:class:`MetadataBackedProperty` - - :py:class:`MetadataBackedCollection` - - :py:class:`MetadataPropertyAnnotator` - -------------------------------------------------------------------------------- -""" - -import re -import warnings - -try: - import pandas as pd -except ImportError: - pd = None - - -from collections import OrderedDict - -from pyteomics.auxiliary import _file_obj -from pyteomics.auxiliary import cvstr -from pyteomics.auxiliary.utils import add_metaclass - - -def _require_pandas(): - if pd is None: - raise ImportError( - "To load an mzTab file into pandas.DataFrame objects, you must install pandas!") - - -class MetadataBackedProperty(object): - '''Our descriptor type which uses the instance's metadata attribute to carry its values''' - - def __init__(self, name, variant_required=None): - if variant_required is None: - variant_required = () - self.name = name - self.variant_required = variant_required - self.__doc__ = self.build_docstring() - - def __repr__(self): - return "{self.__class__.__name__}(name={self.name!r}, variant_required={self.variant_required})".format(self=self) - - def __get__(self, obj, objtype=None): - if obj is None and objtype is not None: - # So the property can be seen for what it is - return self - value = obj.metadata.get(self.name) - if value is None and self.variant_required and obj.variant in self.variant_required: - raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format( - self.name, obj.variant)) - return value - - def __set__(self, obj, value): - obj.metadata[self.name] = value - - def __delete__(self, obj): - del obj.metadata[self.name] - - def build_docstring(self): - doc = '''Accesses the {self.name!r} key in the :attr:`metadata` mapping attached -to this object. -''' - if self.variant_required: - if len(self.variant_required) > 1: - plural = 's' - else: - plural = '' - requires = ' or '.join(['-%s' % v for v in self.variant_required]) - doc += ''' -This key must be present when the file is of {requires} variant{plural}. 
- '''.format(requires=requires, plural=plural) - doc += ''' -Returns -------- -object - ''' - doc = doc.format(self=self) - return doc - - -class MetadataBackedCollection(object): - def __init__(self, name, variant_required=None): - if variant_required is None: - variant_required = () - self.name = name - self.variant_required = variant_required - self.__doc__ = self.build_docstring() - - def __get__(self, obj, objtype=None): - if obj is None and objtype is not None: - # So the property can be seen for what it is - return self - groups = obj.gather(obj.metadata) - value = groups.get(self.name) - if value is None and self.variant_required and obj.variant in self.variant_required: - raise AttributeError("{0} is missing from a mzTab-\"{1}\" document where it is required!".format( - self.name, obj.variant)) - return value - - def build_docstring(self): - doc = '''Accesses the {self.name!r} key group gathered in the :attr:`metadata` mapping attached -to this object. - -This group is dynamically generated on each access and may be expensive for repeated use. -''' - if self.variant_required: - if len(self.variant_required) > 1: - plural = 's' - else: - plural = '' - requires = ' or '.join(['-%s' % v for v in self.variant_required]) - doc += ''' -This key must be present when the file is of {requires} variant{plural}. - '''.format(requires=requires, plural=plural) - doc += ''' -Returns -------- -:class:`~.Group` - ''' - doc = doc.format(self=self) - return doc - - -class MetadataPropertyAnnotator(type): - '''A simple metaclass to do some class-creation time introspection - and descriptor binding. - - Uses a list of strings or 3-tuples from :attr:`__metadata_properties__` to - bind :class:`MetadataBackedProperty` or :class:`MetadataBackedCollection` - onto the class during its creation. - - The specification for a property is a tuple of three values: - 1. The metadata key to fetch - 2. The property name to expose on the object - 3. The variant(s) which require this metadata key be present - - :obj:`("mzTab-version", "version", ("M", "P"))` would be interpreted as - Expose a property "version" on instances which serves the key "mzTab-version" - from the instance's :attr:`metadata`, and raise an error if it is absent in - the "M" or "P" variants. - - Alternatively a specification may be a single string which will be interpreted - as the metadata key, and used to generate the property name replacing all '-' - with '_' and assumed to be optional in all variants. - - If a metadata key ends with "[]" the property is assumed to be a collection. mzTab - makes heavy use of "<collection_name>[<index>]..." keys to define groups of homogenous - object types, often with per-element attributes. - - .. code-block:: - - variable_mod[1] CHEMMOD:15.9949146221 - variable_mod[1]-site M - variable_mod[1]-position Anywhere - variable_mod[2] CHEMMOD:42.0105646863 - variable_mod[2]-site N-term - variable_mod[2]-position Protein N-term - - A specification :obj:`("variable_mod[]", "variable_mods", ())` would create a property - that returns: - - .. code-block:: python - - >>>instance.variable_mods - Group([(1, - {'name': 'CHEMMOD:15.9949146221', - 'position': 'Anywhere', - 'site': 'M'}), - (2, - {'name': 'CHEMMOD:42.0105646863', - 'position': 'Protein N-term', - 'site': 'N-term'})]) - - For precise description of the property collection algorithm, see - :meth:`~_MzTabParserBase.collapse_properties` and - :meth:`~_MzTabParserBase.gather`. 
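To make the two specification forms concrete, here is a small hedged sketch;
the class, its variant, and the metadata values are invented for illustration,
while `add_metaclass`, the descriptors, and `_MzTabParserBase` (defined just
below) are the module's own:

.. code-block:: python

    @add_metaclass(MetadataPropertyAnnotator)
    class DemoFile(_MzTabParserBase):
        variant = 'P'  # consulted when a required key is missing
        __metadata_properties__ = [
            'title',                        # bare string: optional, exposed as .title
            ('mzTab-mode', 'mode', 'P'),    # triple: required in the -P variant
            ('ms_run[]', 'ms_runs', 'MP'),  # "[]" suffix: a gathered collection
        ]

    demo = DemoFile()
    demo.metadata = {'title': 'demo run', 'ms_run[1]-format': 'mzML file'}
    demo.title     # -> 'demo run'
    demo.ms_runs   # -> Group with {1: {'format': 'mzML file'}}
    demo.mode      # raises AttributeError: required in -P but absent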
-
-    If any base classes have a :attr:`__metadata_properties__` attribute, it will
-    also be included unless :attr:`__inherit_metadata_properties__` is set to
-    :const:`False`. Any names explicitly set by the current class override this
-    automatic property generation.
-    '''
-    def __new__(mcls, name, bases, attrs):
-        props = attrs.get('__metadata_properties__', [])
-        inherit_props = attrs.get("__inherit_metadata_properties__", True)
-        # Gather from parent classes so we can use inheritance for overriding this
-        # behavior too.
-        if inherit_props:
-            for base in bases:
-                props.extend(getattr(base, '__metadata_properties__', []))
-
-        keys = set(attrs)
-
-        # Iterate in reverse to ensure that classes nearer to the new class override
-        # more basal classes, ending with the new class to make sure overrides are
-        # applied.
-        for prop in reversed(props):
-            # If the property definition is a single string, interpret the specification
-            # as the property name, and apply some simple normalization to make it a valid
-            # Python attribute name and assume the property is always optional.
-            if isinstance(prop, str):
-                prop_name = prop
-                attr_name = prop_name.replace("mzTab-", '').replace('-', '_')
-                variant_required = None
-            else:
-                # Otherwise unpack the triple
-                prop_name, attr_name, variant_required = prop
-            # Attach the new descriptor to the class definition to be created. These descriptors
-            # will then be used when instances of that class try to get/set those attribute names.
-            if attr_name in keys:
-                continue
-            if prop_name.endswith('[]'):
-                # If the property name ends with "[]", then we're dealing with a collection so
-                # use the :class:`MetadataBackedCollection` descriptor
-                attrs[attr_name] = MetadataBackedCollection(
-                    prop_name[:-2], variant_required=variant_required)
-            else:
-                # Otherwise it is a scalar-valued property, using the :class:`MetadataBackedProperty`
-                # descriptor
-                attrs[attr_name] = MetadataBackedProperty(
-                    prop_name, variant_required=variant_required)
-
-        return super(MetadataPropertyAnnotator, mcls).__new__(mcls, name, bases, attrs)
-
-
-class _MzTabParserBase(object):
-    def _parse_param(self, tuplet):
-        """Parse a controlled vocabulary or user specified parameter tuplet
-        into a Python object.
-
-        Parameters
-        ----------
-        tuplet : str
-            A square brace enclosed tuplet of values describing the parameter
-
-        Returns
-        -------
-        tuple or :class:`~pyteomics.auxiliary.cvstr`
-            A ``(name, value)`` pair when a value is present, otherwise just
-            the name.
-        """
-        cv, acc, name, value = re.split(r"\s*,\s*", tuplet[1:-1])
-        param_name = cvstr(name, acc)
-        if value:
-            return (param_name, value)
-        return param_name
-
-    def collapse_properties(self, proplist):
-        '''Collapse a flat property list into a hierarchical structure.
-
-        This is intended to operate on :py:class:`Mapping` objects, including
-        :class:`dict`, :class:`pandas.Series` and :class:`pandas.DataFrame`.
-
-        .. code-block:: python
-
-            {
-                "ms_run[1]-format": "Andromeda:apl file format",
-                "ms_run[1]-location": "file://...",
-                "ms_run[1]-id_format": "scan number only nativeID format"
-            }
-
-        to
-
-        .. code-block:: python
-
-            {
-                "ms_run": [
-                    {
-                        "format": "Andromeda:apl file format",
-                        "location": "file://...",
-                        "id_format": "scan number only nativeID format"
-                    }
-                ]
-            }
-
-        Parameters
-        ----------
-        proplist : :class:`Mapping`
-            Key-value pairs to collapse
-
-        Returns
-        -------
-        :class:`OrderedDict`
-            The collapsed property list
-        '''
-        entities = OrderedDict()
-        rest = {}
-        for key, value in proplist.items():
-            try:
-                entity, prop_name = key.rsplit("-", 1)
-            except ValueError:
-                rest[key] = value
-                continue
-            try:
-                entity_dict = entities[entity]
-            except KeyError:
-                entity_dict = entities[entity] = {}
-            entity_dict[prop_name] = value
-        for key, value in proplist.items():
-            if key in entities:
-                entity = entities[key]
-                if 'name' not in entity:
-                    entity['name'] = value
-        for key, value in rest.items():
-            if key in entities:
-                entities[key]['name'] = value
-            else:
-                entities[key] = value
-        return entities
-
-    def _collapse_collections(self, entities):
-        gathered = Group()
-        for key, props in entities.items():
-            if '[' in key:
-                k, ix = key.split('[', 1)
-                if '[' in ix:
-                    # If we have multiple [ in a key, we are dealing with a path
-                    path = extract_path(key)
-                    for k, ix in path[:-1]:
-                        store = gathered[k]
-                        store = store[int(ix)]
-                    k, ix = path[-1]
-                    store[k][int(ix)] = props
-                else:
-                    ix = int(ix[:-1])
-                    gathered[k][ix] = props
-            else:
-                gathered[key] = props
-        return gathered
-
-    def _cast_value(self, value):
-        """Convert a cell value to the appropriate Python type.
-
-        Parameters
-        ----------
-        value : str
-            The cell value as text
-
-        Returns
-        -------
-        object
-            The most specialized type recognized
-        """
-        if value == 'null':
-            return None
-        # is it a parameter?
-        if value.startswith("["):
-            try:
-                if "|" in value:
-                    return [self._cast_value(v) for v in value.split("|")]
-                else:
-                    return self._parse_param(value)
-            except ValueError:
-                return value
-        else:
-            # begin guessing dtype
-            try:
-                value = int(value)
-            except ValueError:
-                try:
-                    value = float(value)
-                except ValueError:
-                    pass
-            return value
-
-    def gather(self, mapping):
-        '''Collapse property lists using :meth:`collapse_properties`
-        and then gather collections of entities into lists.
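As a quick illustration of the two steps `gather` composes, here is a hedged
sketch using the same metadata keys as the docstring example above:

.. code-block:: python

    parser = _MzTabParserBase()
    flat = {
        'ms_run[1]-format': 'Andromeda:apl file format',
        'ms_run[1]-location': 'file://...',
    }
    grouped = parser.gather(flat)
    # collapse_properties() folds the '-' suffixes into one dict per entity,
    # then _collapse_collections() indexes 'ms_run[1]' into a Group:
    grouped['ms_run'][1]['location']  # -> 'file://...'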
- - Parameters - ---------- - mapping : dict - The flattened hierarchy of properties to re-construct - - Returns - ------- - Group : - A :class:`Group` of all entities and collections of entities - ''' - return self._collapse_collections(self.collapse_properties(mapping)) - - -class _MzTabTable(_MzTabParserBase): - - """An internal class for accumulating information about an single table - represented in an mzTab file - - Attributes - ---------- - header : list - The column names for the table - name : str - The table's name, human readable - rows : list - An accumulator of table rows - """ - - def __init__(self, name, header=None, rows=None): - if rows is None: - rows = [] - self.name = name - self.header = header - self.rows = rows - - def __repr__(self): - n_cols = len(self.header) if self.header is not None else 0 - n_rows = len(self.rows) - template = "<_MzTabTable {name} with {n_cols} columns and {n_rows} rows>" - return template.format(n_cols=n_cols, n_rows=n_rows, name=self.name) - - def add(self, row): - self.rows.append([self._cast_value(v) for v in row]) - - def __len__(self): - return len(self.rows) - - def __getitem__(self, i): - if isinstance(i, int): - return self.gather({h: r for h, r in zip(self.header, self.rows[i])}) - elif isinstance(i, slice): - out = [] - for i in range(i.start or 0, i.stop or len(self), i.step or 1): - out.append(self[i]) - return out - raise TypeError("Cannot access table with object of type %r" % type(i)) - - def as_dict(self): - return {"rows": [dict(zip(self.header, row)) for row in self.rows], - "name": self.name} - - def as_df(self, index=None): - """Convert the table to a DataFrame in memory. - - Returns - ------- - pd.DataFrame - """ - _require_pandas() - table = pd.DataFrame(data=self.rows, columns=self.header) - if index is not None and len(table.index) > 0: - table = table.set_index(index, drop=False) - table.name = self.name - return table - - def clear(self): - self.header = None - self.rows = [] - - -DATA_FRAME_FORMAT = 'df' -DICT_FORMAT = 'dict' -RAW_FORMAT = 'raw' - -PATH_PARSER = re.compile(r"([^\[]+)\[(\d+)\]_?") - - -def extract_path(path): - '''Parse `key[index]_next_key[next_index]...` sequences into - lists of (key, index) pairs. - - Parameters - ---------- - path : str - The path key to parse - - Returns - ------- - list - ''' - return [(t, int(i)) for t, i in PATH_PARSER.findall(path)] - - -class Group(OrderedDict): - '''A type for holding collections of arbitrarily nested keys from rows - and metadata mappings. - - Implemented as an autovivifying :class:`OrderedDict` variant. As such implements - the :class:`~collections.abc.Mapping` interface. - ''' - - def get_path(self, path, default=None): - '''As :meth:`get` but over a path key parsed with :func:`extract_path`. - - Parameters - ---------- - path : str - The path to search down - default : object, optional - The return value when the path is missing - - Returns - ------- - object - ''' - tokens = extract_path(path) - if not tokens: - return self.get(path, default) - layer = self - for k, i in tokens[:-1]: - i = int(i) - layer = layer.get(k) - if layer is None: - return None - layer = layer.get(i) - if layer is None: - return None - k, i = tokens[-1] - i = int(i) - layer = layer.get(k) - if layer is None: - return default - value = layer.get(i, default) - return value - - def __missing__(self, key): - value = self.__class__() - self[key] = value - return value - - -@add_metaclass(MetadataPropertyAnnotator) -class MzTab(_MzTabParserBase): - """Parser for mzTab format files. 
- - Attributes - ---------- - comments : list - A list of comments across the file - file : _file_obj - A file stream wrapper for the file to be read - metadata : OrderedDict - A mapping of metadata that was entities. - peptide_table : _MzTabTable or pd.DataFrame - The table of peptides. Not commonly used. - protein_table : _MzTabTable or pd.DataFrame - The table of protein identifications. - small_molecule_table : _MzTabTable or pd.DataFrame - The table of small molecule identifications. - spectrum_match_table : _MzTabTable or pd.DataFrame - The table of spectrum-to-peptide match identifications. - table_format: 'df', 'dict', or callable - The structure type to replace each table with. The string - 'df' will use pd.DataFrame instances. 'dict' will create - a dictionary of dictionaries for each table. A callable - will be called on each raw _MzTabTable object - - Additional components of :attr:`metadata` are exposed as properties, returning - single values or aggregated collections of objects. - """ - - __metadata_properties__ = [ - ('mzTab-version', 'version', ()), - ('mzTab-mode', 'mode', 'P'), - ('mzTab-type', 'type', 'P'), - ('mzTab-ID', 'id', 'M'), - 'title', - 'description', - ('ms_run[]', 'ms_runs', 'MP'), - ('instrument[]', 'instruments', ()), - ('software[]', 'software', ()), - ('publication[]', 'publications', ()), - ('contact[]', 'contacts', ()), - ('uri[]', 'uris', ()), - ('external_study_uri[]', 'external_study_uris', ()), - ('quantification_method', 'quantification_method', 'M'), - ('sample[]', 'samples', ()), - ('assay[]', 'assays', ()), - ('study_variable[]', 'study_variables', 'M'), - ('custom[]', 'custom', ()), - ('cv[]', 'cvs', 'M'), - ('database[]', 'databases', 'M'), - - ('psm_search_engine_score[]', 'psm_search_engine_scores', ()), - ('protein_search_engine_score[]', 'protein_search_engine_scores', ()), - ('fixed_mod[]', 'fixed_mods', 'P'), - ('variable_mod[]', 'variable_mods', 'P'), - 'colunit_protein', - 'colunit_peptide', - 'colunit_psm', - 'colunit_small_molecule', - 'false_discovery_rate', - - ('derivatization_agent[]', 'derivatization_agents', ()), - ('small_molecule-quantification_unit', - 'small_molecule_quantification_unit', 'M'), - ('small_molecule_feature-quantification_unit', 'small_molecule_feature_quantification_unit', 'M'), - ('small_molecule-identification_reliability', - 'small_molecule_identification_reliability', ()), - ('id_confidence_measure[]', 'id_confidence_measures', 'M'), - ('colunit-small_molecule', 'colunit_small_molecule', ()), - ('colunit-small_molecule_feature', 'colunit_small_molecule_feature', ()), - ('colunit-small_molecule_evidence', 'colunit_small_molecule_evidence', ()), - - ('sample_processing[]', 'sample_processing', ()) - ] - - def __init__(self, path, encoding='utf8', table_format=DATA_FRAME_FORMAT): - if table_format == DATA_FRAME_FORMAT: - _require_pandas() - # Must be defined in order for metadata properties to work - self.variant = None - self.file = _file_obj(path, mode='r', encoding=encoding) - self.metadata = OrderedDict() - self.comments = [] - self._table_format = table_format - self._init_tables() - self._parse() - self._determine_schema_version() - self._transform_tables() - - @property - def table_format(self): - return self._table_format - - def __getitem__(self, key): - key = key.lower().strip() - if key in ('psm', ): - return self.spectrum_match_table - if key in ('pep', ): - return self.peptide_table - if key in ('prt', ): - return self.protein_table - if key in ('sml', ): - return self.small_molecule_table - if 
key in ('smf', ): - return self.small_molecule_feature_table - if key in ('sme', ): - return self.small_molecule_evidence_table - else: - raise KeyError(key) - - def __iter__(self): - if self.variant == "P": - yield 'PRT', self.protein_table - yield 'PEP', self.peptide_table - yield 'PSM', self.spectrum_match_table - yield 'SML', self.small_molecule_table - elif self.variant == "M": - yield 'SML', self.small_molecule_table - yield 'SMF', self.small_molecule_feature_table - yield 'SME', self.small_molecule_evidence_table - - def _init_tables(self): - self.protein_table = _MzTabTable("protein") - self.peptide_table = _MzTabTable("peptide") - self.spectrum_match_table = _MzTabTable('psm') - self.small_molecule_table = _MzTabTable('small molecule') - self.small_molecule_feature_table = _MzTabTable('small molecule feature') - self.small_molecule_evidence_table = _MzTabTable('small molecule evidence') - - def _transform_tables(self): - if self._table_format == DATA_FRAME_FORMAT: - self.protein_table = self.protein_table.as_df('accession') - self.peptide_table = self.peptide_table.as_df() - self.spectrum_match_table = self.spectrum_match_table.as_df('PSM_ID') - self.small_molecule_table = self.small_molecule_table.as_df() - self.small_molecule_feature_table = self.small_molecule_feature_table.as_df() - self.small_molecule_evidence_table = self.small_molecule_evidence_table.as_df() - elif self._table_format in (DICT_FORMAT, dict): - self.protein_table = self.protein_table.as_dict() - self.peptide_table = self.peptide_table.as_dict() - self.spectrum_match_table = self.spectrum_match_table.as_dict() - self.small_molecule_table = self.small_molecule_table.as_dict() - self.small_molecule_feature_table = self.small_molecule_feature_table.as_dict() - self.small_molecule_evidence_table = self.small_molecule_evidence_table.as_dict() - elif callable(self._table_format): - self.protein_table = self._table_format(self.protein_table) - self.peptide_table = self._table_format(self.peptide_table) - self.spectrum_match_table = self._table_format(self.spectrum_match_table) - self.small_molecule_table = self._table_format(self.small_molecule_table) - self.small_molecule_feature_table = self._table_format(self.small_molecule_feature_table) - self.small_molecule_evidence_table = self._table_format(self.small_molecule_evidence_table) - - def _parse(self): - for i, line in enumerate(self.file): - line = line.strip() - tokens = line.split("\t") - if not tokens: - continue - if tokens[0] == ("MTD"): - name = tokens[1] - value = self._cast_value(tokens[2]) - self.metadata[name] = value - elif tokens[0] == 'COM': - self.comments.append(self._cast_value(tokens[1])) - # headers - elif tokens[0] == "PRH": - self.protein_table.header = tokens[1:] - elif tokens[0] == "PEH": - self.peptide_table.header = tokens[1:] - elif tokens[0] == "PSH": - self.spectrum_match_table.header = tokens[1:] - elif tokens[0] == "SMH": - self.small_molecule_table.header = tokens[1:] - elif tokens[0] == "SFH": - self.small_molecule_feature_table.header = tokens[1:] - elif tokens[0] == "SEH": - self.small_molecule_evidence_table.header = tokens[1:] - # rows - elif tokens[0] == "PRT": - self.protein_table.add(tokens[1:]) - elif tokens[0] == "PEP": - self.peptide_table.add(tokens[1:]) - elif tokens[0] == "PSM": - self.spectrum_match_table.add(tokens[1:]) - elif tokens[0] == "SML": - self.small_molecule_table.add(tokens[1:]) - elif tokens[0] == "SMF": - self.small_molecule_feature_table.add(tokens[1:]) - elif tokens[0] == "SME": - 
self.small_molecule_evidence_table.add(tokens[1:]) - - def _determine_schema_version(self): - if self.version is not None: - version = str(self.version) - else: - warnings.warn("The mzTab-version metadata header was missing. Assuming the schema version is 1.0.0") - version = "1.0.0" - self.version = version - match = re.search(r"(?P<schema_version>\d+(?:\.\d+(?:\.\d+)?)?)(?:-(?P<schema_variant>[MP]))?", version) - if match is None: - warnings.warn("mzTab-version does not match the expected pattern: %r" % version) - version_parsed = '1.0.0' - variant = 'P' - else: - version_parsed, variant = match.groups() - if variant is None: - variant = "P" - self.num_version = [int(v) for v in version_parsed.split(".")] - # Ensure self.num_version is 3-tuple - while len(self.num_version) < 3: - self.num_version.append(0) - self.variant = variant - - def keys(self): - return OrderedDict(list(self)).keys() - - def values(self): - return OrderedDict(list(self)).values() - - def items(self): - return OrderedDict(list(self)).items() diff --git a/pyteomics/mzxml.py b/pyteomics/mzxml.py deleted file mode 100644 index 28c67e357ba9e0c77ba2dafebd121cda67a2c963..0000000000000000000000000000000000000000 --- a/pyteomics/mzxml.py +++ /dev/null @@ -1,328 +0,0 @@ -""" -mzxml - reader for mass spectrometry data in mzXML format -========================================================= - -Summary -------- - -**mzXML** is a (formerly) standard XML-format for raw mass spectrometry data storage, -intended to be replaced with **mzML**. - -This module provides a minimalistic way to extract information from mzXML -files. You can use the old functional interface (:py:func:`read`) or the new -object-oriented interface (:py:class:`MzXML`) -to iterate over entries in ``<scan>`` elements. -:py:class:`MzXML` also supports direct indexing with scan IDs. - -Data access ------------ - - :py:class:`MzXML` - a class representing a single mzXML file. - Other data access functions use this class internally. - - :py:func:`read` - iterate through spectra in mzXML file. Data from a - single scan are converted to a human-readable dict. Spectra themselves are - stored under 'm/z array' and 'intensity array' keys. - - :py:func:`chain` - read multiple mzXML files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - -Deprecated functions --------------------- - - :py:func:`version_info` - get version information about the mzXML file. - You can just read the corresponding attribute of the :py:class:`MzXML` object. - - :py:func:`iterfind` - iterate over elements in an mzXML file. - You can just call the corresponding method of the :py:class:`MzXML` object. - -Dependencies ------------- - -This module requires :py:mod:`lxml` and :py:mod:`numpy`. - -------------------------------------------------------------------------------- -""" - -# Copyright 2016 Joshua Klein, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import heapq - -from . 
import xml, auxiliary as aux, _schema_defaults -import numpy as np - - -def _decode_peaks(info, peaks_data): - """Decode the interleaved base 64 encoded, potentially - compressed, raw data points. - - Parameters - ---------- - info : dict - The current context - peaks_data : str - The textually encoded peak data - - Returns - ------- - tuple of np.array - A pair of NumPy arrays containing - m/z and intensity values. - """ - compressed = (info.get('compressionType') == 'zlib') - dt = np.float32 if info['precision'] == '32' else np.float64 - dtype = np.dtype([('m/z array', dt), ('intensity array', dt)]).newbyteorder('>') - data = aux._decode_base64_data_array(peaks_data, dtype, compressed) - return data - - -class IteratorQueue(object): - def __init__(self, iterator): - q = list() - heapq.heapify(q) - self.queue = q - self.iterator = iterator - self.last_index = -1 - self.producer = self.consume(iterator) - - def insert_item(self, scan): - heapq.heappush(self.queue, (int(scan['num']), scan)) - - def __iter__(self): - return self.producer - - def consume(self, iterator): - for scan in iterator: - scan.pop("scan", None) - if scan['msLevel'] != 1: - self.insert_item(scan) - else: - self.insert_item(scan) - barrier = int(scan['num']) - while True: - idx, item = heapq.heappop(self.queue) - if idx >= barrier: - self.insert_item(item) - break - yield item - while self.queue: - idx, item = heapq.heappop(self.queue) - yield item - - -class MzXML(aux.BinaryArrayConversionMixin, aux.TimeOrderedIndexedReaderMixin, xml.MultiProcessingXML, xml.IndexSavingXML): - """Parser class for mzXML files.""" - _root_element = 'mzXML' - _default_iter_tag = 'scan' - _indexed_tags = {'scan'} - _indexed_tag_keys = {'scan': 'num'} - _default_version = None - _default_schema = _schema_defaults._mzxml_schema_defaults - _default_id_attr = 'num' - - def __init__(self, *args, **kwargs): - self.decode_binary = kwargs.pop('decode_binary', True) - super(MzXML, self).__init__(*args, **kwargs) - - def __getstate__(self): - state = super(MzXML, self).__getstate__() - state['decode_binary'] = self.decode_binary - return state - - def __setstate__(self, state): - super(MzXML, self).__setstate__(state) - self.decode_binary = state['decode_binary'] - - def _get_info_smart(self, element, **kw): - name = xml._local_name(element) - - kwargs = dict(kw) - rec = kwargs.pop('recursive', None) - if name in {'mzXML'}: - info = self._get_info(element, - recursive=( - rec if rec is not None else False), - **kwargs) - else: - info = self._get_info(element, - recursive=(rec if rec is not None else True), - **kwargs) - if 'num' in info and isinstance(info, dict): - info['id'] = info['num'] - if 'peaks' in info and isinstance(info, dict): - self._decode_peaks(info) - return info - - def _determine_compression(self, info): - if info.get('compressionType') == 'zlib': - return 'zlib compression' - return "no compression" - - def _determine_dtype(self, info): - dt = np.float32 if info['precision'] == '32' else np.float64 - endianess = ">" if info['byteOrder'] in ('network', "big") else "<" - dtype = np.dtype( - [('m/z array', dt), ('intensity array', dt)]).newbyteorder(endianess) - return dtype - - def _finalize_record_conversion(self, array, record): - key = record.key - return self._convert_array(key, array[key]) - - def _decode_peaks(self, info): - # handle cases where peaks is the encoded binary data which must be - # unpacked - if not isinstance(info['peaks'], (dict, list)): - compression_type = self._determine_compression(info) - dtype = 
self._determine_dtype(info) - binary = info.pop('peaks') - if not self.decode_binary: - for k in self._array_keys: - record = self._make_record(binary, compression_type, dtype, k) - info[k] = record - else: - peak_data = self.decode_data_array(binary, compression_type, dtype) - for k in self._array_keys: - info[k] = self._convert_array(k, peak_data[k]) - # otherwise we've already decoded the arrays and we're just passing - # them up the hierarchy - else: - if not self.decode_binary: - arrays = info.pop('peaks')[0] - for k in self._array_keys: - info[k] = arrays[k] - else: - peak_data = info.pop('peaks')[0] - for k in self._array_keys: - info[k] = self._convert_array(k, peak_data.get(k, np.array([]))) - - def iterfind(self, path, **kwargs): - if path == 'scan': - generator = super(MzXML, self).iterfind(path, **kwargs) - for item in IteratorQueue(generator): - yield item - else: - for item in super(MzXML, self).iterfind(path, **kwargs): - yield item - - def _get_time(self, scan): - return scan['retentionTime'] - - -def read(source, read_schema=False, iterative=True, use_index=False, dtype=None, - huge_tree=False, decode_binary=True): - """Parse `source` and iterate through spectra. - - Parameters - ---------- - source : str or file - A path to a target mzML file or the file object itself. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the mzML header. Otherwise, use default - parameters. Not recommended without Internet connection or - if you don't like to get the related warnings. - - iterative : bool, optional - Defines whether iterative parsing should be used. It helps reduce - memory usage at almost the same parsing speed. Default is - :py:const:`True`. - - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - spectrum elements. Default is :py:const:`False`. - - decode_binary : bool, optional - Defines whether binary data should be decoded and included in the output - (under "m/z array", "intensity array", etc.). - Default is :py:const:`True`. - - huge_tree : bool, optional - This option is passed to the `lxml` parser and defines whether - security checks for XML tree depth and node size should be disabled. - Default is :py:const:`False`. - Enable this option for trusted files to avoid XMLSyntaxError exceptions - (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`). - - Returns - ------- - out : iterator - An iterator over the dicts with spectrum properties. - """ - - return MzXML(source, read_schema=read_schema, iterative=iterative, - use_index=use_index, dtype=dtype, huge_tree=huge_tree, - decode_binary=decode_binary) - - -def iterfind(source, path, **kwargs): - """Parse `source` and yield info on elements with specified local - name or by specified XPath. - - .. note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`iterfind` calls on one file, you should - create an :py:class:`MzXML` object and use its - :py:meth:`!iterfind` method. - - Parameters - ---------- - source : str or file - File name or file-like object. - - path : str - Element name or XPath-like expression. Only local names separated - with slashes are accepted. An asterisk (`*`) means any element. - You can specify a single condition in the end, such as: - ``"/path/to/element[some_value>1.5]"`` - Note: you can do much more powerful filtering using plain Python. - The path can be absolute or "free". Please don't specify - namespaces. 
-
-    recursive : bool, optional
-        If :py:const:`False`, subelements will not be processed when
-        extracting info from elements. Default is :py:const:`True`.
-
-    iterative : bool, optional
-        Specifies whether iterative XML parsing should be used. Iterative
-        parsing significantly reduces memory usage and may be just a little
-        slower. Default value is :py:const:`True`.
-
-    read_schema : bool, optional
-        If :py:const:`True`, attempt to extract information from the XML schema
-        mentioned in the mzXML header (default). Otherwise, use default
-        parameters. Disable this to avoid waiting on slow network connections or
-        if you don't like to get the related warnings.
-
-    decode_binary : bool, optional
-        Defines whether binary data should be decoded and included in the output
-        (under "m/z array", "intensity array", etc.).
-        Default is :py:const:`True`.
-
-    Returns
-    -------
-    out : iterator
-    """
-    return MzXML(source, **kwargs).iterfind(path, **kwargs)
-
-version_info = xml._make_version_info(MzXML)
-
-
-# chain = aux._make_chain(read, 'read')
-chain = aux.ChainBase._make_chain(MzXML)
diff --git a/pyteomics/openms/__init__.py b/pyteomics/openms/__init__.py
deleted file mode 100644
index 8b9c0338f46370b775594e9fe369eaa9e3d790a9..0000000000000000000000000000000000000000
--- a/pyteomics/openms/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from . import featurexml, trafoxml, idxml
diff --git a/pyteomics/openms/featurexml.py b/pyteomics/openms/featurexml.py
deleted file mode 100644
index 0dcfd099d01652443b076fccf6dedd12daaf934f..0000000000000000000000000000000000000000
--- a/pyteomics/openms/featurexml.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""
-featurexml - reader for featureXML files
-========================================
-
-Summary
--------
-
-**featureXML** is a format specified in the
-`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
-It defines a list of LC-MS features observed in an experiment.
-
-This module provides a minimalistic way to extract information from **featureXML**
-files. You can use the old functional interface (:py:func:`read`) or the new
-object-oriented interface (:py:class:`FeatureXML`)
-to iterate over entries in ``<feature>`` elements.
-:py:class:`FeatureXML` also supports direct indexing with feature IDs.
-
-Data access
------------
-
-  :py:class:`FeatureXML` - a class representing a single featureXML file.
-  Other data access functions use this class internally.
-
-  :py:func:`read` - iterate through features in a featureXML file. Data from a
-  single feature are converted to a human-readable dict.
-
-  :py:func:`chain` - read multiple featureXML files at once.
-
-  :py:func:`chain.from_iterable` - read multiple files at once, using an
-  iterable of files.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml`.
-
--------------------------------------------------------------------------------
-"""
-
-from .. 
import xml, auxiliary as aux, _schema_defaults, version - -class FeatureXML(xml.MultiProcessingXML): - """Parser class for featureXML files.""" - file_format = 'featureXML' - _root_element = 'featureMap' - _default_schema = _schema_defaults._featurexml_schema_defaults - _default_version = '1.9' - _default_iter_tag = 'feature' - _structures_to_flatten = {} - _indexed_tags = {'feature'} - _schema_location_param = 'noNamespaceSchemaLocation' - - _offending_keys = {'ints': { - ('PeptideIdentification', 'spectrum_reference'), - ('UnassignedPeptideIdentification', 'spectrum_reference'), - ('quality', 'quality') - }} - _missing_keys = {'floats': {('quality', 'quality')}} - - def _get_info_smart(self, element, **kw): - kw['recursive'] = kw.get('recursive', True) - info = self._get_info(element, **kw) - return info - - @xml._keepstate - def _get_schema_info(self, read_schema=True): - schema_info = super(FeatureXML, self)._get_schema_info(read_schema) - if not read_schema: - return schema_info - file_version, schema = self.version_info - if version.VersionInfo(file_version) < version.VersionInfo(self._default_version): - for k, s in self._offending_keys.items(): - if k in schema_info: - for elem in s: - try: - schema_info[k].remove(elem) - except KeyError: - pass - for t, s in self._missing_keys.items(): - schema_info.setdefault(t, set()).update(s) - return schema_info - - -def read(source, read_schema=True, iterative=True, use_index=False): - """Parse `source` and iterate through features. - - Parameters - ---------- - source : str or file - A path to a target featureXML file or the file object itself. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the file header (default). Otherwise, use default - parameters. Disable this to avoid waiting on slow network connections or - if you don't like to get the related warnings. - - iterative : bool, optional - Defines whether iterative parsing should be used. It helps reduce - memory usage at almost the same parsing speed. Default is - :py:const:`True`. - - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - spectrum elements. Default is :py:const:`False`. - - Returns - ------- - out : iterator - An iterator over the dicts with feature properties. - """ - - return FeatureXML(source, read_schema=read_schema, iterative=iterative, use_index=use_index) - -chain = aux._make_chain(read, 'read') diff --git a/pyteomics/openms/idxml.py b/pyteomics/openms/idxml.py deleted file mode 100644 index 0d71d7a4e61ef46c8343f902819874fa09abe572..0000000000000000000000000000000000000000 --- a/pyteomics/openms/idxml.py +++ /dev/null @@ -1,430 +0,0 @@ -""" -idxml - idXML file reader -========================= - -Summary -------- - -**idXML** is a format specified in the -`OpenMS <http://open-ms.sourceforge.net/about/>`_ project. -It defines a list of peptide identifications. - -This module provides a minimalistic way to extract information from idXML -files. You can use the old functional interface (:py:func:`read`) or the new -object-oriented interface (:py:class:`IDXML`) to iterate over entries in -``<PeptideIdentification>`` elements. Note that each entry can contain more than one PSM -(peptide-spectrum match). They are accessible with ``'PeptideHit'`` key. -:py:class:`IDXML` objects also support direct indexing by element ID. - -Data access ------------ - - :py:class:`IDXML` - a class representing a single idXML file. 
-  Other data access functions use this class internally.
-
-  :py:func:`read` - iterate through peptide-spectrum matches in an idXML
-  file. Data from a single PSM group are converted to a human-readable dict.
-  Basically creates an :py:class:`IDXML` object and reads it.
-
-  :py:func:`chain` - read multiple files at once.
-
-  :py:func:`chain.from_iterable` - read multiple files at once, using an
-  iterable of files.
-
-  :py:func:`DataFrame` - read idXML files into a :py:class:`pandas.DataFrame`.
-
-Target-decoy approach
----------------------
-
-  :py:func:`filter` - read a chain of idXML files and filter to a certain
-  FDR using TDA.
-
-  :py:func:`filter.chain` - chain a series of filters applied independently to
-  several files.
-
-  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
-  independently to an iterable of files.
-
-  :py:func:`filter_df` - filter idXML files and return a :py:class:`pandas.DataFrame`.
-
-  :py:func:`is_decoy` - determine if a "PeptideIdentification" entry should be
-  considered decoy.
-
-  :py:func:`fdr` - estimate the false discovery rate of a set of identifications
-  using the target-decoy approach.
-
-  :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
-  set using the target-decoy approach.
-
-Deprecated functions
---------------------
-
-  :py:func:`version_info` - get information about idXML version and schema.
-  You can just read the corresponding attribute of the :py:class:`IDXML`
-  object.
-
-  :py:func:`get_by_id` - get an element by its ID and extract the data from it.
-  You can just call the corresponding method of the :py:class:`IDXML`
-  object.
-
-  :py:func:`iterfind` - iterate over elements in an idXML file.
-  You can just call the corresponding method of the :py:class:`IDXML`
-  object.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml`.
-
--------------------------------------------------------------------------------
-"""
-
-# Copyright 2020 Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import warnings
-from .. import auxiliary as aux
-from .. import xml, _schema_defaults
-
-
-class IDXML(xml.IndexedXML):
-    """Parser class for idXML files."""
-    file_format = 'idXML'
-    _root_element = 'IdXML'
-    _default_schema = _schema_defaults._idxml_schema_defaults
-    _default_version = '1.5'
-    _default_iter_tag = 'PeptideIdentification'
-    _structures_to_flatten = {}
-    _indexed_tags = {'ProteinHit'}
-    _schema_location_param = 'noNamespaceSchemaLocation'
-
-    def __init__(self, *args, **kwargs):
-        kwargs.setdefault('retrieve_refs', True)
-        super(IDXML, self).__init__(*args, **kwargs)
-
-    def _get_info_smart(self, element, **kwargs):
-        """Extract the info in a smart way depending on the element type"""
-        name = xml._local_name(element)
-        kwargs = dict(kwargs)
-        rec = kwargs.pop("recursive", None)
-
-        # Try not to recursively unpack the root element
-        # unless the user really wants to.
- if name == self._root_element: - info = self._get_info(element, recursive=(rec if rec is not None else False), **kwargs) - else: - info = self._get_info(element, recursive=(rec if rec is not None else True), **kwargs) - for k in ['start', 'end']: - v = info.get(k) - if isinstance(v, list) and len(v) == 2: - info[k] = [int(x) for x in v[0].split()] - for k in ['aa_before', 'aa_after']: - if k in info: - info[k] = info[k].split() - return info - - def _retrieve_refs(self, info, **kwargs): - """Retrieves and embeds the data for each attribute in `info` that - ends in _ref. Removes the id attribute from `info`""" - for k, v in dict(info).items(): - if k[-5:] == '_refs': - try: - by_id = [self.get_by_id(x, retrieve_refs=True) for x in v.split()] - except KeyError: - warnings.warn('Ignoring unresolved reference: ' + v) - else: - for x in by_id: - x.pop('id', None) - info[k[:-5]] = by_id - del info[k] - - -def read(source, **kwargs): - """Parse `source` and iterate through peptide-spectrum matches. - - .. note:: This function is provided for backward compatibility only. - It simply creates an :py:class:`IDXML` instance using - provided arguments and returns it. - - Parameters - ---------- - source : str or file - A path to a target IDXML file or the file object itself. - - recursive : bool, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - retrieve_refs : bool, optional - If :py:const:`True`, additional information from references will be - automatically added to the results. The file processing time will - increase. Default is :py:const:`True`. - - iterative : bool, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the IDXML header (default). Otherwise, use default - parameters. Disable this to avoid waiting on slow network connections or - if you don't like to get the related warnings. - - build_id_cache : bool, optional - Defines whether a cache of element IDs should be built and stored on the - created :py:class:`IDXML` instance. Default value is the value of - `retrieve_refs`. - - .. note:: This parameter is ignored when ``use_index`` is ``True`` (default). - - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - the indexed elements. If :py:const:`True` (default), `build_id_cache` is ignored. - - indexed_tags : container of bytes, optional - Defines which elements need to be indexed. Empty set by default. - - Returns - ------- - out : IDXML - An iterator over the dicts with PSM properties. - """ - kwargs = kwargs.copy() - kwargs.setdefault('retrieve_refs', True) - kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs')) - return IDXML(source, **kwargs) - - -def iterfind(source, path, **kwargs): - """Parse `source` and yield info on elements with specified local - name or by specified "XPath". - - .. note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`iterfind` calls on one file, you should - create an :py:class:`IDXML` object and use its - :py:meth:`!iterfind` method. 
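As a hedged sketch of the object-oriented interface described above (the file
name is a placeholder; the keys are the ones documented for PSM entries):

.. code-block:: python

    from pyteomics.openms import idxml

    with idxml.IDXML('example.idXML') as reader:
        for entry in reader:
            # Each 'PeptideIdentification' entry can hold several PSMs.
            for hit in entry.get('PeptideHit', []):
                print(hit.get('sequence'), hit.get('score'))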
- - Parameters - ---------- - source : str or file - File name or file-like object. - - path : str - Element name or XPath-like expression. Only local names separated - with slashes are accepted. An asterisk (`*`) means any element. - You can specify a single condition in the end, such as: - ``"/path/to/element[some_value>1.5]"`` - Note: you can do much more powerful filtering using plain Python. - The path can be absolute or "free". Please don't specify - namespaces. - - recursive : bool, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - retrieve_refs : bool, optional - If :py:const:`True`, additional information from references will be - automatically added to the results. The file processing time will - increase. Default is :py:const:`False`. - - iterative : bool, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the IDXML header (default). Otherwise, use default - parameters. Disable this to avoid waiting on slow network connections or - if you don't like to get the related warnings. - - build_id_cache : bool, optional - Defines whether a cache of element IDs should be built and stored on the - created :py:class:`IDXML` instance. Default value is the value of - `retrieve_refs`. - - Returns - ------- - out : iterator - """ - kwargs = kwargs.copy() - kwargs['build_id_cache'] = kwargs.get('build_id_cache', kwargs.get('retrieve_refs')) - return IDXML(source, **kwargs).iterfind(path, **kwargs) - - -version_info = xml._make_version_info(IDXML) - - -def get_by_id(source, elem_id, **kwargs): - """Parse `source` and return the element with `id` attribute equal - to `elem_id`. Returns :py:const:`None` if no such element is found. - - .. note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`get_by_id` calls on one file, you should - create an :py:class:`IDXML` object and use its - :py:meth:`!get_by_id` method. - - Parameters - ---------- - source : str or file - A path to a target mzIdentML file of the file object itself. - - elem_id : str - The value of the `id` attribute to match. - - Returns - ------- - out : :py:class:`dict` or :py:const:`None` - """ - return IDXML(source, **kwargs).get_by_id(elem_id, **kwargs) - - -chain = aux.ChainBase._make_chain(IDXML) - - -def is_decoy(psm, prefix=None): - """Given a PSM dict, return :py:const:`True` if it is marked as decoy, - and :py:const:`False` otherwise. - - Parameters - ---------- - psm : dict - A dict, as yielded by :py:func:`read`. - prefix : ignored - - Returns - ------- - out : bool - """ - return psm['PeptideHit'][0]['target_decoy'] == 'decoy' - - -def DataFrame(*args, **kwargs): - """Read idXML files into a :py:class:`pandas.DataFrame`. - - Requires :py:mod:`pandas`. - - .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'. - - Parameters - ---------- - *args - Passed to :py:func:`chain` - - **kwargs - Passed to :py:func:`chain` - - sep : str or None, keyword only, optional - Some values related to PSMs (such as protein information) are variable-length - lists. 
If `sep` is a :py:class:`str`, they will be packed into single string using - this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is - :py:const:`None`. - - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - data = [] - - sep = kwargs.pop('sep', None) - with chain(*args, **kwargs) as f: - for item in f: - info = {} - for k, v in item.items(): - if isinstance(v, (str, int, float)): - info[k] = v - peptide_hit = item.get('PeptideHit', [None])[0] - if peptide_hit is not None: - info.update((k, v) for k, v in peptide_hit.items() if isinstance(v, (str, int, float))) - protein = peptide_hit.get('protein') - if protein: - accessions, isd, starts, ends, scores, aa_bs, aa_as = [], [], [], [], [], [], [] - for d, start, end, aab, aaa in zip(protein, peptide_hit['start'], peptide_hit['end'], peptide_hit['aa_before'], peptide_hit['aa_after']): - accessions.append(d.get('accession')) - isd.append(d.get('target_decoy')) - scores.append(d.get('score')) - starts.append(start) - ends.append(end) - aa_bs.append(aab) - aa_as.append(aaa) - - isd = all(x == 'decoy' for x in isd) - if sep is not None: - if all(isinstance(acc, str) for acc in accessions): - accessions = sep.join(accessions) - if all(isinstance(aaa, str) for aaa in aa_as): - aa_as = sep.join(aa_as) - if all(isinstance(aab, str) for aab in aa_bs): - aa_bs = sep.join(aa_bs) - if all(acc is None for acc in accessions): - accessions = None - - info.update((k, v) for k, v in protein[0].items() if isinstance(v, (str, int, float, list))) - info['accession'] = accessions - info['is decoy'] = isd - info['start'] = starts - info['end'] = ends - info['aa_before'] = aa_bs - info['aa_after'] = aa_as - data.append(info) - df = pd.DataFrame(data) - return df - - -def filter_df(*args, **kwargs): - """Read idXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. - Positional arguments can be idXML files or DataFrames. - - Requires :py:mod:`pandas`. - - .. warning :: Only the first 'PeptideHit' element is considered in every 'PeptideIdentification'. - - Parameters - ---------- - key : str / iterable / callable, keyword only, optional - Peptide identification score. Default is 'score'. You will probably need to change it. - is_decoy : str / iterable / callable, keyword only, optional - Default is 'is decoy'. - *args - Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. - **kwargs - Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. 
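A minimal call sketch, assuming an idXML file at a placeholder path; the `fdr`
keyword is forwarded to :py:func:`auxiliary.filter` along with the defaults above:

.. code-block:: python

    # Keep PSMs at 1% FDR, scored by 'score' with 'is decoy' as the flag.
    psms = filter_df('example.idXML', fdr=0.01)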
-
- Returns
- -------
- out : pandas.DataFrame
- """
- import pandas as pd
- kwargs.setdefault('key', 'score')
- if all(isinstance(arg, pd.DataFrame) for arg in args):
- df = pd.concat(args)
- else:
- df = DataFrame(*args, **kwargs)
- if 'is_decoy' not in kwargs:
- kwargs['is_decoy'] = 'is decoy'
- return aux.filter(df, **kwargs)
-
-
-fdr = aux._make_fdr(is_decoy, None)
-_key = lambda x: x['PeptideHit'][0]['score']
-qvalues = aux._make_qvalues(chain, is_decoy, None, _key)
-filter = aux._make_filter(chain, is_decoy, None, _key, qvalues)
-filter.chain = aux._make_chain(filter, 'filter', True)
diff --git a/pyteomics/openms/trafoxml.py b/pyteomics/openms/trafoxml.py
deleted file mode 100644
index d42c56979dc2a3ddc7c0076f467091c2da3c0563..0000000000000000000000000000000000000000
--- a/pyteomics/openms/trafoxml.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-trafoxml - reader for trafoXML files
-========================================
-
-Summary
--------
-
-**trafoXML** is a format specified in the
-`OpenMS <http://open-ms.sourceforge.net/about/>`_ project.
-It defines a transformation, which is the result of retention time alignment.
-
-This module provides a minimalistic way to extract information from **trafoXML**
-files. You can use the old functional interface (:py:func:`read`) or the new
-object-oriented interface (:py:class:`TrafoXML`)
-to iterate over entries in ``<Pair>`` elements.
-
-Data access
------------
-
- :py:class:`TrafoXML` - a class representing a single trafoXML file.
- Other data access functions use this class internally.
-
- :py:func:`read` - iterate through pairs in a trafoXML file. Data from a
- single pair are converted to a human-readable dict.
-
- :py:func:`chain` - read multiple trafoXML files at once.
-
- :py:func:`chain.from_iterable` - read multiple files at once, using an
- iterable of files.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml`.
-
--------------------------------------------------------------------------------
-
-"""
-
-from .. import xml, auxiliary as aux, _schema_defaults
-
-class TrafoXML(xml.XML):
- """Parser class for trafoXML files."""
- file_format = 'trafoXML'
- _root_element = 'TrafoXML'
- _default_schema = _schema_defaults._trafoxml_schema_defaults
- _default_version = '1.0'
- _default_iter_tag = 'Pair'
- _schema_location_param = 'noNamespaceSchemaLocation'
-
- def _get_info_smart(self, element, **kw):
- kw['recursive'] = kw.get('recursive', True)
- info = self._get_info(element, **kw)
- return info
-
-def read(source, read_schema=True, iterative=True):
- """Parse `source` and iterate through pairs.
-
- Parameters
- ----------
- source : str or file
- A path to a target trafoXML file or the file object itself.
-
- read_schema : bool, optional
- If :py:const:`True`, attempt to extract information from the XML schema
- mentioned in the file header (default). Otherwise, use default
- parameters. Disable this to avoid waiting on slow network connections or
- if you don't want to see the related warnings.
-
- iterative : bool, optional
- Defines whether iterative parsing should be used. It helps reduce
- memory usage at almost the same parsing speed. Default is
- :py:const:`True`.
-
- Returns
- -------
- out : iterator
- An iterator over the dicts with pair properties. 
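For orientation, a minimal usage sketch (the path 'aligned.trafoXML' is a placeholder, and the 'from'/'to' keys are assumed to mirror the attributes of a <Pair> element):

    from pyteomics.openms import trafoxml

    # Each yielded dict should carry the Pair attributes: an observed
    # retention time and its aligned counterpart.
    with trafoxml.read('aligned.trafoXML') as pairs:
        for pair in pairs:
            print(pair['from'], pair['to'])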
- """
-
- return TrafoXML(source, read_schema=read_schema, iterative=iterative)
-
-chain = aux._make_chain(read, 'read')
\ No newline at end of file
diff --git a/pyteomics/parser.py b/pyteomics/parser.py
deleted file mode 100644
index 72ef9fc2ad8cced5dcb500041357b450214b863b..0000000000000000000000000000000000000000
--- a/pyteomics/parser.py
+++ /dev/null
@@ -1,1148 +0,0 @@
-"""
-parser - operations on modX peptide sequences
-=============================================
-
-modX is a simple extension of the `IUPAC one-letter peptide sequence
-representation <http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html>`_.
-
-The labels (or codes) for the 20 standard amino acids in modX are the same as
-in IUPAC nomenclature. A label for a modified amino acid has a general
-form of 'modX', i.e.:
-
-- it starts with an arbitrary number of lower-case symbols or numbers
- (a modification);
-
-- it ends with a single upper-case symbol (an amino acid residue).
-
-The valid examples of modX amino acid labels are: 'G', 'pS', 'oxM'. This rule
-keeps the labels both readable and parseable.
-
-Besides the sequence of amino acid residues, modX has a rule to specify
-terminal modifications of a polypeptide. Such a label should start or
-end with a hyphen. The default N-terminal amine group and C-terminal
-carboxyl group may not be shown explicitly.
-
-Therefore, valid examples of peptide sequences in modX are: "GAGA",
-"H-PEPTIDE-OH", "H-TEST-NH2". It is not recommended to specify only one
-terminal group.
-
-Operations on polypeptide sequences
------------------------------------
-
- :py:func:`parse` - convert a sequence string into a list of
- amino acid residues.
-
- :py:func:`to_string` - convert a parsed sequence to a string.
-
- :py:func:`to_proforma` - convert a (parsed) *modX* sequence to ProForma.
-
- :py:func:`amino_acid_composition` - get numbers of each amino acid
- residue in a peptide.
-
- :py:func:`cleave`, :py:func:`icleave`, :py:func:`xcleave` - cleave a polypeptide using a given rule of
- enzymatic digestion.
-
- :py:func:`num_sites` - count the number of cleavage sites in a sequence.
-
- :py:func:`isoforms` - generate all unique modified peptide sequences
- given the initial sequence and modifications.
-
-Auxiliary commands
-------------------
-
- :py:func:`coverage` - calculate the sequence coverage of a protein by peptides.
-
- :py:func:`length` - calculate the number of amino acid
- residues in a polypeptide.
-
- :py:func:`valid` - check if a sequence can be parsed successfully.
-
- :py:func:`fast_valid` - check if a sequence consists of known one-letter
- codes.
-
- :py:func:`is_modX` - check if supplied code corresponds to a modX label.
-
- :py:func:`is_term_mod` - check if supplied code corresponds to a
- terminal modification.
-
-Data
-----
-
- :py:data:`std_amino_acids` - a list of the 20 standard amino acid IUPAC codes.
-
- :py:data:`std_nterm` - the standard N-terminal modification (the
- unmodified group is a single atom of hydrogen).
-
- :py:data:`std_cterm` - the standard C-terminal modification (the
- unmodified group is hydroxyl).
-
- :py:data:`std_labels` - a list of all standard sequence
- elements, amino acid residues and terminal modifications.
-
- :py:data:`expasy_rules` and :py:data:`psims_rules` - two dicts with the regular expressions of
- cleavage rules for the most popular proteolytic enzymes. 
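Tying these together, a short sketch of a typical session (expected outputs shown as comments; the protein string is an arbitrary example):

    from pyteomics import parser

    # Parse a modX string into residue labels.
    print(parser.parse('oxMPEPTIDE'))
    # ['oxM', 'P', 'E', 'P', 'T', 'I', 'D', 'E']

    # In-silico tryptic digest; the C-terminal R is not cleaved because
    # the trypsin regex requires a residue after the cleavage site.
    print(sorted(parser.cleave('MKWVTFISLLLLFSSAYSR', parser.expasy_rules['trypsin'])))
    # ['MK', 'WVTFISLLLLFSSAYSR']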
- -------------------------------------------------------------------------------- - -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import re -from collections import deque -import itertools as it -import warnings -from .auxiliary import PyteomicsError, memoize, BasicComposition, cvstr, cvquery - - -std_amino_acids = ['Q', 'W', 'E', 'R', 'T', 'Y', 'I', 'P', 'A', 'S', - 'D', 'F', 'G', 'H', 'K', 'L', 'C', 'V', 'N', 'M'] -"""modX labels for the 20 standard amino acids.""" - -std_nterm = 'H-' -"""modX label for the unmodified N-terminus.""" - -std_cterm = '-OH' -"""modX label for the unmodified C-terminus.""" - -std_labels = std_amino_acids + [std_nterm, std_cterm] -"""modX labels for the standard amino acids and unmodified termini.""" - -_nterm_mod = r'[^-]+-$' -_cterm_mod = r'-[^-]+$' - - -def is_term_mod(label): - """Check if `label` corresponds to a terminal modification. - - Parameters - ---------- - label : str - - Returns - ------- - out : bool - - Examples - -------- - >>> is_term_mod('A') - False - >>> is_term_mod('Ac-') - True - >>> is_term_mod('-customGroup') - True - >>> is_term_mod('this-group-') - False - >>> is_term_mod('-') - False - """ - return (re.match(_nterm_mod, label) or re.match(_cterm_mod, label)) is not None - - -def match_modX(label): - """Check if `label` is a valid 'modX' label. - - Parameters - ---------- - label : str - - Returns - ------- - out : re.match or None - """ - return re.match(_modX_single, label) - - -def is_modX(label): - """Check if `label` is a valid 'modX' label. - - Parameters - ---------- - label : str - - Returns - ------- - out : bool - - Examples - -------- - >>> is_modX('M') - True - >>> is_modX('oxM') - True - >>> is_modX('oxMet') - False - >>> is_modX('160C') - True - """ - return bool(match_modX(label)) - - -def length(sequence, **kwargs): - """Calculate the number of amino acid residues in a polypeptide - written in modX notation. - - Parameters - ---------- - sequence : str or list or dict - A string with a polypeptide sequence, a list with a parsed sequence or - a dict of amino acid composition. - labels : list, optional - A list of allowed labels for amino acids and terminal modifications. 
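The dict case mentioned above is easy to miss: a composition mapping counts residues while terminal groups are skipped. A small sketch:

    from pyteomics import parser

    # Composition of 'PEPTIDE' with explicit unmodified termini;
    # length() ignores the 'H-' and '-OH' entries when counting.
    comp = parser.amino_acid_composition('PEPTIDE', show_unmodified_termini=True)
    print(parser.length(comp))
    # 7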
- - Returns - ------- - out : int - - Examples - -------- - >>> length('PEPTIDE') - 7 - >>> length('H-PEPTIDE-OH') - 7 - """ - if not sequence: - return 0 - - if isinstance(sequence, str) or isinstance(sequence, list): - if isinstance(sequence, str): - parsed_sequence = parse(sequence, **kwargs) - else: - parsed_sequence = sequence - num_term_groups = 0 - if is_term_mod(parsed_sequence[0]): - num_term_groups += 1 - if is_term_mod(parsed_sequence[-1]): - num_term_groups += 1 - return len(parsed_sequence) - num_term_groups - elif isinstance(sequence, dict): - return sum(amount for aa, amount in sequence.items() if not is_term_mod(aa)) - - raise PyteomicsError('Unsupported type of sequence.') - - -def _split_label(label): - try: - mod, X = match_modX(label).groups() - except AttributeError: - raise PyteomicsError('Cannot split a non-modX label: %s' % label) - if not mod: - return (X,) - else: - return mod, X - - -_modX_sequence = re.compile(r'^([^-]+-)?((?:[^A-Z-]*[A-Z])+)(-[^-]+)?$') -_modX_group = re.compile(r'[^A-Z-]*[A-Z]') -_modX_split = re.compile(r'([^A-Z-]*)([A-Z])') -_modX_single = re.compile(r'^([^A-Z-]*)([A-Z])$') - - -def parse(sequence, show_unmodified_termini=False, split=False, allow_unknown_modifications=False, **kwargs): - """Parse a sequence string written in modX notation into a list of - labels or (if `split` argument is :py:const:`True`) into a list of - tuples representing amino acid residues and their modifications. - - Parameters - ---------- - sequence : str - The sequence of a polypeptide. - show_unmodified_termini : bool, optional - If :py:const:`True` then the unmodified N- and C-termini are explicitly - shown in the returned list. Default value is :py:const:`False`. - split : bool, optional - If :py:const:`True` then the result will be a list of tuples with 1 to 4 - elements: terminal modification, modification, residue. Default value is - :py:const:`False`. - allow_unknown_modifications : bool, optional - If :py:const:`True` then do not raise an exception when an unknown - modification of a known amino acid residue is found in the sequence. - This also includes terminal groups. - Default value is :py:const:`False`. - - .. note:: - Since version 2.5, this parameter has effect only if `labels` - are provided. - labels : container, optional - A container of allowed labels for amino acids, - modifications and terminal modifications. - If not provided, no checks will be done. - Separate labels for modifications (such as 'p' or 'ox') - can be supplied, which means they are applicable to all residues. - - .. warning:: - If `show_unmodified_termini` is set to :py:const:`True`, standard - terminal groups need to be present in `labels`. - - .. warning:: - Avoid using sequences with only one terminal group, as they are - ambiguous. If you provide one, `labels` (or :py:const:`std_labels`) - will be used to resolve the ambiguity. - - Returns - ------- - out : list - List of tuples with labels of modifications and amino acid residues. 
- - Examples - -------- - >>> parse('PEPTIDE', split=True) - [('P',), ('E',), ('P',), ('T',), ('I',), ('D',), ('E',)] - >>> parse('H-PEPTIDE') - ['P', 'E', 'P', 'T', 'I', 'D', 'E'] - >>> parse('PEPTIDE', show_unmodified_termini=True) - ['H-', 'P', 'E', 'P', 'T', 'I', 'D', 'E', '-OH'] - >>> parse('TEpSToxM', labels=std_labels + ['pS', 'oxM']) - ['T', 'E', 'pS', 'T', 'oxM'] - >>> parse('zPEPzTIDzE', True, True, labels=std_labels+['z']) - [('H-', 'z', 'P'), ('E',), ('P',), ('z', 'T'), ('I',), ('D',), ('z', 'E', '-OH')] - >>> parse('Pmod1EPTIDE') - ['P', 'mod1E', 'P', 'T', 'I', 'D', 'E'] - """ - sequence = str(sequence) - - try: - n, body, c = re.match(_modX_sequence, sequence).groups() - except AttributeError: - raise PyteomicsError('Not a valid modX sequence: ' + sequence) - - # Check for allowed labels, if they were explicitly given - labels = kwargs.get('labels') - # labels help save the day when only one terminal group is given - if c is None and n is not None: - if labels is None: - labels = std_labels - # we can try to resolve the ambiguity - if n != std_nterm and n not in labels: - # n is the body then - c = '-' + body - body = n[:-1] - n = None - - # Actual parsing - if split: - parsed_sequence = [g if g[0] else (g[1],) for g in re.findall( - _modX_split, body)] - else: - parsed_sequence = re.findall(_modX_group, body) - nterm, cterm = (n or std_nterm), (c or std_cterm) - - # Check against `labels` if given - if labels is not None: - labels = set(labels) - for term, std_term in zip([n, c], [std_nterm, std_cterm]): - if term and term not in labels and not allow_unknown_modifications: - raise PyteomicsError('Unknown label: {}'.format(term)) - for group in parsed_sequence: - if split: - mod, X = group if len(group) == 2 else ('', group[0]) - else: - mod, X = re.match(_modX_split, group).groups() - if ((not mod) and X not in labels) or not ((mod + X in labels) or ( - X in labels and ( - mod in labels or allow_unknown_modifications))): - raise PyteomicsError('Unknown label: {}'.format(group)) - - # Append terminal labels - if show_unmodified_termini or nterm != std_nterm: - if split: - parsed_sequence[0] = (nterm,) + parsed_sequence[0] - else: - parsed_sequence.insert(0, nterm) - if show_unmodified_termini or cterm != std_cterm: - if split: - parsed_sequence[-1] = parsed_sequence[-1] + (cterm,) - else: - parsed_sequence.append(cterm) - - return parsed_sequence - - -def valid(*args, **kwargs): - """Try to parse sequence and catch the exceptions. - All parameters are passed to :py:func:`parse`. - - Returns - ------- - out : bool - :py:const:`True` if the sequence was parsed successfully, and - :py:const:`False` otherwise. - """ - try: - parse(*args, **kwargs) - except PyteomicsError: - return False - return True - - -def fast_valid(sequence, labels=set(std_labels)): - """Iterate over `sequence` and check if all items are in `labels`. - With strings, this only works as expected on sequences without - modifications or terminal groups. - - Parameters - ---------- - sequence : iterable (expectedly, str) - The sequence to check. A valid sequence would be a string of - labels, all present in `labels`. - labels : iterable, optional - An iterable of known labels. - - Returns - ------- - out : bool - """ - return set(sequence).issubset(labels) - - -def to_string(parsed_sequence, show_unmodified_termini=True): - """Create a string from a parsed sequence. - - Parameters - ---------- - parsed_sequence : iterable - Expected to be in one of the formats returned by - :py:func:`parse`, i.e. 
list of labels or list of tuples. - show_unmodified_termini : bool, optional - Defines the behavior towards standard terminal groups in the input. - :py:const:`True` means that they will be preserved if present (default). - :py:const:`False` means that they will be removed. Standard terminal - groups will not be added if not shown in `parsed_sequence`, - regardless of this setting. - - Returns - ------- - sequence : str - """ - parsed_sequence = list(parsed_sequence) - labels = [] - nterm = parsed_sequence[0] - cterm = parsed_sequence[-1] - - if isinstance(nterm, str): - if nterm != std_nterm or show_unmodified_termini: - labels.append(nterm) - labels.extend(parsed_sequence[1:-1]) - if len(parsed_sequence) > 1 and (cterm != std_cterm or show_unmodified_termini): - labels.append(cterm) - else: - if len(parsed_sequence) == 1: - g = nterm - if nterm[0] == std_nterm and not show_unmodified_termini: - g = g[1:] - if nterm[-1] == std_cterm and not show_unmodified_termini: - g = g[:-1] - return ''.join(g) - if nterm[0] != std_nterm or show_unmodified_termini: - labels.append(''.join(nterm)) - else: - labels.append(''.join(nterm[1:])) - labels.extend(''.join(g) for g in parsed_sequence[1:-1]) - if len(parsed_sequence) > 1: - if cterm[-1] != std_cterm or show_unmodified_termini: - labels.append(''.join(cterm)) - else: - labels.append(''.join(cterm[:-1])) - return ''.join(labels) - - -tostring = to_string - - -def to_proforma(sequence, **kwargs): - """Converts a (parsed) *modX* sequence to a basic ProForma string. - Modifications are represented as masses, if those are given in :arg:`aa_mass`, - as chemical formulas (via :arg:`aa_comp`) or as names (using :arg:`mod_names`). - - Parameters - ---------- - sequence : str or list - A *modX* sequence, possibly in the parsed form. - aa_mass : dict, keyword only, optional - Used to render modifications as mass shifts. - aa_comp : dict, keyword only, optional - Used to render modifications as chemical formulas. - mod_names : dict or callable, keyword only, optional - Used to get the rendered name of modification from the mod label. - prefix : str, keyword only, optional - Prepend all modification names with the given prefix. - - Returns - ------- - out : str - A ProForma sequence. - - Examples - -------- - >>> to_proforma('PEPTIDE') - 'PEPTIDE' - >>> to_proforma('Ac-oxMYPEPTIDE-OH', aa_mass={'Ac-': 42.010565}, mod_names={'ox': 'Oxidation'}, prefix='U:') - '[+42.0106]-M[U:Oxidation]YPEPTIDE' - >>> to_proforma('oxidationMYPEPTIDE') # last fallback is to just capitalize the label - 'M[Oxidation]YPEPTIDE' - """ - from . import proforma - from .mass.mass import std_aa_mass, std_aa_comp - - if isinstance(sequence, str): - return to_proforma(parse(sequence), **kwargs) - - aa_mass = kwargs.get('aa_mass', std_aa_mass) - aa_comp = kwargs.get('aa_comp', std_aa_comp) - mod_names = kwargs.get('mod_names', {}) - prefix = kwargs.get('prefix', '') - - if isinstance(mod_names, dict): - get_name = mod_names.get - else: - get_name = mod_names - - def get_tag(label): - if label in aa_mass: - return [proforma.MassModification(aa_mass[label])] - if label in aa_comp: - return [proforma.FormulaModification(''.join('{}{}'.format(k, v if v not in {0, 1} else '') for k, v in aa_comp[label].items()))] - name = get_name(label) - if not name: - warnings.warn("Unable to resolve label `{}`. " - "The ProForma string may be invalid. 
Specify `mod_names`, `aa_mass` or `aa_comp`.".format(label)) - name = label.capitalize() - return [proforma.GenericModification(prefix + name)] - - i, j = 0, len(sequence) - nterm = cterm = None - pro_sequence = [] - if isinstance(sequence[0], str): # regular parsed sequence - if is_term_mod(sequence[0]) and sequence[0] != std_nterm: - nterm = get_tag(sequence[0]) - i = 1 - if is_term_mod(sequence[-1]) and sequence[-1] != std_cterm: - cterm = get_tag(sequence[-1]) - j -= 1 - for label in sequence[i:j]: - if len(label) == 1: - pro_sequence.append((label, None)) - else: - mod, aa = _split_label(label) - pro_sequence.append((aa, get_tag(mod))) - else: # split sequence - if is_term_mod(sequence[0][0]) and sequence[0][0] != std_nterm: - nterm = get_tag(sequence[0][0]) - if is_term_mod(sequence[-1][-1]) and sequence[-1][-1] != std_cterm: - cterm = get_tag(sequence[-1][-1]) - if len(sequence) == 1: - pro_sequence = [(sequence[0][-2] if cterm else sequence[0][-1], get_tag(sequence[0][1]) if len(sequence[0]) == 4 else None)] - else: - pro_sequence.append((sequence[0][-1], get_tag(sequence[0][-2]) if len(sequence[0]) == 3 else None)) - for group in sequence[1:-1]: - pro_sequence.append((group[-1], get_tag(group[0]) if len(group) == 2 else None)) - if len(sequence[-1]) == 1 or (len(sequence[-1]) == 2 and cterm): - pro_sequence.append((sequence[-1][0], None)) - else: - pro_sequence.append((sequence[-1][1], get_tag(sequence[-1][0]))) - - return proforma.to_proforma(pro_sequence, n_term=nterm, c_term=cterm) - - -def amino_acid_composition(sequence, show_unmodified_termini=False, term_aa=False, allow_unknown_modifications=False, **kwargs): - """Calculate amino acid composition of a polypeptide. - - Parameters - ---------- - sequence : str or list - The sequence of a polypeptide or a list with a parsed sequence. - show_unmodified_termini : bool, optional - If :py:const:`True` then the unmodified N- and C-terminus are explicitly - shown in the returned dict. Default value is :py:const:`False`. - term_aa : bool, optional - If :py:const:`True` then the terminal amino acid residues are - artificially modified with `nterm` or `cterm` modification. - Default value is :py:const:`False`. - allow_unknown_modifications : bool, optional - If :py:const:`True` then do not raise an exception when an unknown - modification of a known amino acid residue is found in the sequence. - Default value is :py:const:`False`. - labels : list, optional - A list of allowed labels for amino acids and terminal modifications. - - Returns - ------- - out : dict - A dictionary of amino acid composition. - - Examples - -------- - >>> amino_acid_composition('PEPTIDE') == \ - {'I': 1, 'P': 2, 'E': 2, 'T': 1, 'D': 1} - True - >>> amino_acid_composition('PEPTDE', term_aa=True) == \ - {'ctermE': 1, 'E': 1, 'D': 1, 'P': 1, 'T': 1, 'ntermP': 1} - True - >>> amino_acid_composition('PEPpTIDE', labels=std_labels+['pT']) == \ - {'I': 1, 'P': 2, 'E': 2, 'D': 1, 'pT': 1} - True - """ - labels = kwargs.get('labels') - - if isinstance(sequence, str): - parsed_sequence = parse(sequence, show_unmodified_termini, - allow_unknown_modifications=allow_unknown_modifications, - labels=labels) - elif isinstance(sequence, list): - if sequence and isinstance(sequence[0], tuple): - parsed_sequence = parse(tostring(sequence, True), - show_unmodified_termini, - allow_unknown_modifications=allow_unknown_modifications, - labels=labels) - else: - parsed_sequence = sequence - else: - raise PyteomicsError('Unsupported type of a sequence.' 
- ' Must be str or list, not %s' % type(sequence))
-
- aa_dict = BasicComposition()
-
- # Process terminal amino acids.
- if term_aa:
- nterm_aa_position = 1 if is_term_mod(parsed_sequence[0]) else 0
- cterm_aa_position = (
- len(parsed_sequence) - 2 if is_term_mod(parsed_sequence[-1])
- else len(parsed_sequence) - 1)
- if len(parsed_sequence) > 1:
- aa_dict['cterm' + parsed_sequence.pop(cterm_aa_position)] = 1
- aa_dict['nterm' + parsed_sequence.pop(nterm_aa_position)] = 1
-
- # Process core amino acids.
- for aa in parsed_sequence:
- aa_dict[aa] += 1
-
- return aa_dict
-
-
-@memoize()
-def cleave(*args, **kwargs):
- """Cleaves a polypeptide sequence using a given rule.
-
- .. seealso::
- :func:`icleave` and :func:`xcleave`, which produce both peptides and their indices.
-
- Parameters
- ----------
- sequence : str
- The sequence of a polypeptide.
-
- .. note::
- The sequence is expected to be in one-letter uppercase notation.
- Otherwise, some of the cleavage rules in :py:data:`expasy_rules`
- will not work as expected.
-
- rule : str or compiled regex
- A key present in :py:data:`expasy_rules`, :py:data:`psims_rules` (or an MS ontology accession) or a
- `regular expression <https://docs.python.org/library/re.html#regular-expression-syntax>`_
- describing the site of cleavage. It is recommended
- to design the regex so that it matches only the residue whose C-terminal
- bond is to be cleaved. All additional requirements should be specified
- using `lookaround assertions
- <http://www.regular-expressions.info/lookaround.html>`_.
- :py:data:`expasy_rules` contains cleavage rules for popular cleavage agents.
-
- .. seealso:: The `regex` argument.
-
- missed_cleavages : int, optional
- Maximum number of allowed missed cleavages. Defaults to 0.
- min_length : int or None, optional
- Minimum peptide length. Defaults to :py:const:`None`.
-
- .. note ::
- This checks for string length, which is only correct for one-letter
- notation and not for full *modX*. Use :py:func:`length` manually if
- you know what you are doing and apply :py:func:`cleave` to *modX*
- sequences.
-
- max_length : int or None, optional
- Maximum peptide length. Defaults to :py:const:`None`. See note above.
-
- semi : bool, optional
- Include products of semi-specific cleavage. Default is :py:const:`False`.
- This effectively cuts every peptide at every position and adds results to the output.
-
- exception : str or compiled RE or None, optional
- Exceptions to the cleavage rule. If specified, should be a key present in :py:const:`expasy_rules`
- or a regular expression. Cleavage sites matching `rule` will be checked against `exception` and omitted
- if they match.
-
- regex : bool, optional
- If :py:const:`True`, the cleavage rule is always interpreted as a regex. Otherwise, a matching value
- is looked up in :py:data:`expasy_rules` and :py:data:`psims_rules`.
-
- Returns
- -------
- out : set
- A set of unique (!) peptides. 
- - Examples - -------- - >>> cleave('AKAKBK', expasy_rules['trypsin'], 0) == {'AK', 'BK'} - True - >>> cleave('AKAKBK', 'trypsin', 0) == {'AK', 'BK'} - True - >>> cleave('AKAKBK', 'MS:1001251', 0) == {'AK', 'BK'} - True - >>> cleave('GKGKYKCK', 'Trypsin/P', 2) == \ - {'CK', 'GKYK', 'YKCK', 'GKGK', 'GKYKCK', 'GK', 'GKGKYK', 'YK'} - True - - """ - return set(p for i, p in icleave(*args, **kwargs)) - - -def icleave(sequence, rule, missed_cleavages=0, min_length=None, max_length=None, semi=False, exception=None, regex=False): - """Like :py:func:`cleave`, but the result is an iterator and includes peptide indices. - Refer to :py:func:`cleave` for explanation of parameters. - - Returns - ------- - out : iterator - An iterator over (index, sequence) pairs. - - """ - if not regex: - if rule in expasy_rules: - rule = expasy_rules[rule] - elif rule in psims_rules: - rule = psims_rules[rule] - elif rule in _psims_index: - rule = _psims_index[rule] - elif re.search(r'[a-z]', rule): - warnings.warn('Interpreting the rule as a regular expression: {}. Did you mistype the rule? ' - 'Specify `regex=True` to silence this warning.'.format(rule)) - exception = expasy_rules.get(exception, exception) - ml = missed_cleavages + 2 - trange = range(ml) - cleavage_sites = deque([0], maxlen=ml) - if min_length is None: - min_length = 1 - if max_length is None: - max_length = len(sequence) - cl = 1 - if exception is not None: - exceptions = {x.end() for x in re.finditer(exception, sequence)} - for end in it.chain([x.end() for x in re.finditer(rule, sequence)], [None]): - if exception is not None and end in exceptions: - continue - cleavage_sites.append(end) - if cl < ml: - cl += 1 - for j in trange[:cl - 1]: - seq = sequence[cleavage_sites[j]:cleavage_sites[-1]] - lenseq = len(seq) - if end is not None: - start = end - lenseq - else: - start = len(sequence) - lenseq - if seq and min_length <= lenseq <= max_length: - yield (start, seq) - if semi: - for k in range(min_length, min(lenseq, max_length)): - yield (start, seq[:k]) - for k in range(max(1, lenseq - max_length), lenseq - min_length + 1): - yield (start + k, seq[k:]) - - -def xcleave(*args, **kwargs): - """Like :py:func:`icleave`, but returns a list. - - Returns - ------- - out : list - A list of (index, sequence) pairs. - - Examples - -------- - >>> xcleave('AKAKBK', 'trypsin', 1) - [(0, 'AK'), (0, 'AKAK'), (2, 'AK'), (2, 'AKBK'), (4, 'BK')] - """ - return list(icleave(*args, **kwargs)) - - -def num_sites(sequence, rule, **kwargs): - """Count the number of sites where `sequence` can be cleaved using - the given `rule` (e.g. number of miscleavages for a peptide). - - Parameters - ---------- - sequence : str - The sequence of a polypeptide. - rule : str or compiled regex - A regular expression describing the site of cleavage. It is recommended - to design the regex so that it matches only the residue whose C-terminal - bond is to be cleaved. All additional requirements should be specified - using `lookaround assertions - <http://www.regular-expressions.info/lookaround.html>`_. - labels : list, optional - A list of allowed labels for amino acids and terminal modifications. - exception : str or compiled RE or None, optional - Exceptions to the cleavage rule. If specified, should be a regular expression. - Cleavage sites matching `rule` will be checked against `exception` and omitted - if they match. - - Returns - ------- - out : int - Number of cleavage sites. 
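Since this function ships without a doctest, a minimal sketch (the count is the number of sites, i.e. one less than the number of peptides from full cleavage):

    from pyteomics import parser

    # 'PEPKTIDEKAR' has two internal tryptic sites (after each K);
    # the C-terminal R is not counted because no residue follows it.
    print(parser.num_sites('PEPKTIDEKAR', 'trypsin'))
    # 2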
- """ - return sum(1 for _ in icleave(sequence, rule, **kwargs)) - 1 - - -expasy_rules = { - 'arg-c': r'R', - 'asp-n': r'\w(?=D)', - 'bnps-skatole' : r'W', - 'caspase 1': r'(?<=[FWYL]\w[HAT])D(?=[^PEDQKR])', - 'caspase 2': r'(?<=DVA)D(?=[^PEDQKR])', - 'caspase 3': r'(?<=DMQ)D(?=[^PEDQKR])', - 'caspase 4': r'(?<=LEV)D(?=[^PEDQKR])', - 'caspase 5': r'(?<=[LW]EH)D', - 'caspase 6': r'(?<=VE[HI])D(?=[^PEDQKR])', - 'caspase 7': r'(?<=DEV)D(?=[^PEDQKR])', - 'caspase 8': r'(?<=[IL]ET)D(?=[^PEDQKR])', - 'caspase 9': r'(?<=LEH)D', - 'caspase 10': r'(?<=IEA)D', - 'chymotrypsin high specificity' : r'([FY](?=[^P]))|(W(?=[^MP]))', - 'chymotrypsin low specificity': - r'([FLY](?=[^P]))|(W(?=[^MP]))|(M(?=[^PY]))|(H(?=[^DMPW]))', - 'clostripain': r'R', - 'cnbr': r'M', - 'enterokinase': r'(?<=[DE]{3})K', - 'factor xa': r'(?<=[AFGILTVM][DE]G)R', - 'formic acid': r'D', - 'glutamyl endopeptidase': r'E', - 'granzyme b': r'(?<=IEP)D', - 'hydroxylamine': r'N(?=G)', - 'iodosobenzoic acid': r'W', - 'lysc': r'K', - 'ntcb': r'\w(?=C)', - 'pepsin ph1.3': r'((?<=[^HKR][^P])[^R](?=[FL][^P]))|' - r'((?<=[^HKR][^P])[FL](?=\w[^P]))', - 'pepsin ph2.0': r'((?<=[^HKR][^P])[^R](?=[FLWY][^P]))|' - r'((?<=[^HKR][^P])[FLWY](?=\w[^P]))', - 'proline endopeptidase': r'(?<=[HKR])P(?=[^P])', - 'proteinase k': r'[AEFILTVWY]', - 'staphylococcal peptidase i': r'(?<=[^E])E', - 'thermolysin': r'[^DE](?=[AFILMV][^P])', - 'thrombin': r'((?<=G)R(?=G))|' - r'((?<=[AFGILTVM][AFGILTVWA]P)R(?=[^DE][^DE]))', - 'trypsin': r'([KR](?=[^P]))|((?<=W)K(?=P))|((?<=M)R(?=P))', - 'trypsin_exception': r'((?<=[CD])K(?=D))|((?<=C)K(?=[HY]))|((?<=C)R(?=K))|((?<=R)R(?=[HR]))', -} -""" -This dict contains regular expressions for cleavage rules of the most -popular proteolytic enzymes. The rules were taken from the -`PeptideCutter tool -<http://ca.expasy.org/tools/peptidecutter/peptidecutter_enzymes.html>`_ -at Expasy. - -.. note:: - 'trypsin_exception' can be used as `exception` argument when calling - :py:func:`cleave` with 'trypsin' `rule`:: - - >>> parser.cleave('PEPTIDKDE', parser.expasy_rules['trypsin']) - {'DE', 'PEPTIDK'} - >>> parser.cleave('PEPTIDKDE', parser.expasy_rules['trypsin'], \ -exception=parser.expasy_rules['trypsin_exception']) - {'PEPTIDKDE'} -""" - - -psims_rules = { - cvstr('2-iodobenzoate', 'MS:1001918'): r'(?<=W)', - cvstr('Arg-C', 'MS:1001303'): r'(?<=R)(?!P)', - cvstr('Asp-N', 'MS:1001304'): r'(?=[BD])', - cvstr('Asp-N ambic', 'MS:1001305'): r'(?=[DE])', - cvstr('CNBr', 'MS:1001307'): r'(?<=M)', - cvstr('Chymotrypsin', 'MS:1001306'): r'(?<=[FYWL])(?!P)', - cvstr('Formic acid', 'MS:1001308'): r'((?<=D))|((?=D))', - cvstr('Lys-C', 'MS:1001309'): r'(?<=K)(?!P)', - cvstr('Lys-C/P', 'MS:1001310'): r'(?<=K)', - cvstr('PepsinA', 'MS:1001311'): r'(?<=[FL])', - cvstr('TrypChymo', 'MS:1001312'): r'(?<=[FYWLKR])(?!P)', - cvstr('Trypsin', 'MS:1001251'): r'(?<=[KR])(?!P)', - cvstr('Trypsin/P', 'MS:1001313'): r'(?<=[KR])', - cvstr('V8-DE', 'MS:1001314'): r'(?<=[BDEZ])(?!P)', - cvstr('V8-E', 'MS:1001315'): r'(?<=[EZ])(?!P)', - cvstr('glutamyl endopeptidase', 'MS:1001917'): r'(?<=[^E]E)', - cvstr('leukocyte elastase', 'MS:1001915'): r'(?<=[ALIV])(?!P)', - cvstr('proline endopeptidase', 'MS:1001916'): r'(?<=[HKR]P)(?!P)', -} -""" -This dict contains regular expressions for cleavage rules of the most -popular proteolytic enzymes. The rules were taken from the PSI `MS ontology -<http://purl.obolibrary.org/obo/MS_1001045>`_. - -You can use names or accessions to access the rules. 
-Use :py:func:`pyteomics.auxiliary.cvquery` for accession access::
-
- >>> from pyteomics.auxiliary import cvquery
- >>> from pyteomics.parser import psims_rules
- >>> cvquery(psims_rules, 'MS:1001918')
- '(?<=W)'
-
-"""
-
-_psims_index = cvquery(psims_rules)
-
-def isoforms(sequence, **kwargs):
- """
- Apply variable and fixed modifications to the polypeptide and yield
- the unique modified sequences.
-
- Parameters
- ----------
-
- sequence : str
- Peptide sequence to modify.
-
- variable_mods : dict, optional
- A dict of variable modifications in the following format:
- :py:const:`{'label1': ['X', 'Y', ...], 'label2': ['X', 'A', 'B', ...]}`
-
- Keys in the dict are modification labels (terminal modifications allowed).
- Values are iterables of residue labels (one letter each) or
- :py:const:`True`. If a value for a modification is :py:const:`True`,
- it is applicable to any residue (useful for terminal modifications).
- You can use values such as 'ntermX' or 'ctermY' to specify that a
- modification only occurs when the residue is in the terminal position.
- This is *not needed* for terminal modifications.
-
- .. note:: Several variable modifications can occur on amino acids of the
- same type, but in the output each amino acid residue will be
- modified at most once (apart from terminal modifications).
-
- fixed_mods : dict, optional
- A dict of fixed modifications in the same format.
-
- **Note**: if a residue is affected by a fixed modification, no variable
- modifications will be applied to it (apart from terminal modifications).
-
- labels : list, optional
- A list of amino acid labels containing all the labels present in
- `sequence`. Modified entries will be added automatically.
- Defaults to :py:data:`std_labels`.
- Not required since version 2.5.
-
- max_mods : int or None, optional
- Number of modifications that can occur simultaneously on a peptide,
- excluding fixed modifications. If :py:const:`None` or if ``max_mods``
- is greater than the number of modification sites, all possible
- isoforms are generated. Default is :py:const:`None`.
-
- override : bool, optional
- Defines how to handle the residues that are modified in the input.
- :py:const:`False` means that they will be preserved (default).
- :py:const:`True` means they will be treated as unmodified.
-
- show_unmodified_termini : bool, optional
- If :py:const:`True` then the unmodified N- and C-termini are explicitly
- shown in the returned sequences. Default value is :py:const:`False`.
-
- format : str, optional
- If :py:const:`'str'` (default), an iterator over sequences is returned.
- If :py:const:`'split'`, the iterator will yield results in the same
- format as :py:func:`parse` with the 'split' option, with unmodified
- terminal groups shown.
-
- Returns
- -------
-
- out : iterator over strings or lists
- All possible unique polypeptide sequences resulting from
- the specified modifications are yielded one by one. 
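A minimal sketch of the variable-modification machinery ('p' is an arbitrary lowercase label standing in for phosphorylation):

    from pyteomics import parser

    # At most one variable modification per isoform.
    forms = parser.isoforms('PESTS', variable_mods={'p': ['S', 'T']}, max_mods=1)
    print(sorted(forms))
    # ['PESTS', 'PESTpS', 'PESpTS', 'PEpSTS']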
- """ - def main(group): # index of the residue (capital letter) in `group` - if group[-1][0] == '-': - i = -2 - else: - i = -1 - return len(group) + i, group[i] - - def apply_mod(label, mod): - # `label` is assumed to be a tuple (see split option of `parse`) - # unmodified termini are assumed shown - # if the modification is not applicable, `None` is returned - group = list(label) - m = main(group)[0] - c = True # whether the change is applied in the end - if m == 0 and not is_term_mod(mod): - group.insert(0, mod) - elif mod[0] == '-' and (group[-1] == std_cterm or (group[-1][0] == '-' and override)): - group[-1] = mod - elif mod[-1] == '-' and (group[0] == std_nterm or (group[0][-1] == '-' and override)): - group[0] = mod - elif not is_term_mod(mod): - if m and group[m - 1][-1] != '-': - if override: - group[m - 1] = mod - else: - c = False - else: - group.insert(m, mod) - else: - c = False - if c: - return tuple(group) - - variable_mods = kwargs.get('variable_mods', {}) - varmods_term, varmods_non_term = [], [] - for m, r in sorted(variable_mods.items()): - if is_term_mod(m): - varmods_term.append((m, r)) - else: - varmods_non_term.append((m, r)) - fixed_mods = kwargs.get('fixed_mods', {}) - parse_kw = {} - if 'labels' in kwargs: - parse_kw['labels'] = list(kwargs['labels']) + list(fixed_mods) - parsed = parse(sequence, True, True, **parse_kw) - override = kwargs.get('override', False) - show_unmodified_termini = kwargs.get('show_unmodified_termini', False) - max_mods = kwargs.get('max_mods') - format_ = kwargs.get('format', 'str') - - # Apply fixed modifications - for cmod, res in fixed_mods.items(): - for i, group in enumerate(parsed): - if res is True or main(group)[1] in res: - parsed[i] = apply_mod(group, cmod) or parsed[i] - - # Create a list of possible states for each group - # Start with N-terminal mods and regular mods on the N-terminal residue - states = [[parsed[0]]] - m0 = main(parsed[0])[1] - for m, r in varmods_non_term: - if r is True or m0 in r or 'nterm' + m0 in r or len(parsed) == 1 and 'cterm' + m0 in r: - applied = apply_mod(parsed[0], m) - if applied is not None: - states[0].append(applied) - more_states = [] - for m, r in varmods_term: - if r is True or m0 in r: - if m[-1] == '-' or len(parsed) == 1: - for group in states[0]: - applied = apply_mod(group, m) - if applied is not None: - more_states.append(applied) - states[0].extend(more_states) - - # Continue with regular mods - for group in parsed[1:-1]: - gstates = [group] - for m, r in varmods_non_term: - if r is True or group[-1] in r: - applied = apply_mod(group, m) - if applied is not None: - gstates.append(applied) - states.append(gstates) - - # Finally add C-terminal mods and regular mods on the C-terminal residue - if len(parsed) > 1: - states.append([parsed[-1]]) - m1 = main(parsed[-1])[1] - for m, r in varmods_non_term: - if r is True or m1 in r or 'cterm' + m1 in r or len(parsed) == 1 and 'nterm' + m1 in r: - applied = apply_mod(parsed[-1], m) - if applied is not None: - states[-1].append(applied) - more_states = [] - for m, r in varmods_term: - if r is True or m1 in r: - if m[0] == '-' or len(parsed) == 1: - for group in states[-1]: - applied = apply_mod(group, m) - if applied is not None: - more_states.append(applied) - states[-1].extend(more_states) - - sites = [s for s in enumerate(states) if len(s[1]) > 1] - if max_mods is None or max_mods > len(sites): - possible_states = it.product(*states) - else: - def state_lists(): - for m in range(max_mods + 1): - for comb in it.combinations(sites, m): - 
skel = [[s[0]] for s in states]
- for i, e in comb:
- skel[i] = e[1:]
- yield skel
- possible_states = it.chain.from_iterable(it.product(*skel) for skel in state_lists())
-
- if format_ == 'split':
- def strip_std_terms():
- for ps in possible_states:
- ps = list(ps)
- if not show_unmodified_termini:
- if ps[0][0] == std_nterm:
- ps[0] = ps[0][1:]
- if ps[-1][-1] == std_cterm:
- ps[-1] = ps[-1][:-1]
- yield ps
- return strip_std_terms()
- elif format_ == 'str':
- return (tostring(form, show_unmodified_termini)
- for form in possible_states)
- else:
- raise PyteomicsError('Unsupported value of "format": {}'.format(format_))
-
-
-def coverage(protein, peptides):
- """Calculate how much of `protein` is covered by `peptides`.
- Peptides can overlap. If a peptide is found multiple times in `protein`,
- it contributes more to the overall coverage.
-
- Requires :py:mod:`numpy`.
-
- .. note::
- Modifications and terminal groups are discarded.
-
- Parameters
- ----------
- protein : str
- A protein sequence.
- peptides : iterable
- An iterable of peptide sequences.
-
- Returns
- -------
- out : float
- The sequence coverage, between 0 and 1.
-
- Examples
- --------
- >>> coverage('PEPTIDES'*100, ['PEP', 'EPT'])
- 0.5
- """
- import numpy as np
- protein = re.sub(r'[^A-Z]', '', protein)
- mask = np.zeros(len(protein), dtype=np.int8)
- for peptide in peptides:
- indices = [m.start() for m in re.finditer(
- '(?={})'.format(re.sub(r'[^A-Z]', '', peptide)), protein)]
- for i in indices:
- mask[i:i + len(peptide)] = 1
- return mask.sum(dtype=float) / mask.size
-
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
diff --git a/pyteomics/peff.py b/pyteomics/peff.py
deleted file mode 100644
index 90ffcc5a992256521f2e01af7822ec80406cfe0f..0000000000000000000000000000000000000000
--- a/pyteomics/peff.py
+++ /dev/null
@@ -1,277 +0,0 @@
-"""
-peff - PSI Extended FASTA Format
-================================
-
-PEFF is a forthcoming standard from PSI-HUPO formalizing and extending the
-encoding of protein features and annotations for building search spaces for
-proteomics. See `The PEFF specification <http://www.psidev.info/peff>`_ for
-more up-to-date information on the standard.
-
-Data manipulation
------------------
-
-Classes
-.......
-
-The PEFF parser inherits several properties from the implementation in the :mod:`~.fasta` module,
-building on top of the :class:`~.TwoLayerIndexedFASTA` reader.
-
-Available classes:
-
- :py:class:`IndexedPEFF` - Parse a PEFF format file in binary-mode, supporting
- direct indexing by header string or by tag.
-
-"""
-
-# Copyright 2018 Joshua Klein, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. 
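A minimal usage sketch before the implementation (the path 'proteins.peff' is a placeholder; only attributes defined in this module are touched):

    from pyteomics import peff

    # Open a PEFF database and inspect the parsed file-level metadata.
    reader = peff.IndexedPEFF('proteins.peff')
    print(reader.version, reader.number_of_entries)
    for block in reader.header_blocks:
        print(dict(block))  # each block is a Header, i.e. a Mapping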
- -import re -try: - from collections.abc import Sequence as SequenceABC, Mapping -except ImportError: - from collections import Sequence as SequenceABC, Mapping -from collections import OrderedDict, defaultdict - -from .fasta import TwoLayerIndexedFASTA - - -class Header(Mapping): - """Hold parsed properties of a key-value pair like a sequence's - definition line. - - This object supports the :class:`Mapping` interface, and - keys may be accessed by attribute access notation. - """ - def __init__(self, mapping, original=None): - self._mapping = mapping - - def __getitem__(self, key): - return self._mapping[key] - - def __iter__(self): - return iter(self._mapping) - - def items(self): - return self._mapping.items() - - def keys(self): - return self._mapping.keys() - - def values(self): - return self._mapping.values() - - def __len__(self): - return len(self._mapping) - - def __contains__(self, key): - return key in self._mapping - - def __getattr__(self, key): - if key == "_mapping": - raise AttributeError(key) - try: - return self._mapping[key] - except KeyError: - raise AttributeError(key) - - def __repr__(self): - return "{self.__class__.__name__}({mapping})".format(self=self, mapping=dict(self._mapping)) - - def __hash__(self): - return hash(self.defline) - - def __eq__(self, other): - try: - return self._mapping == other._mapping - except AttributeError: - return str(self) == str(other) - - def __ne__(self, other): - return not (self == other) - - def __dir__(self): - base = set(dir(super(Header, self))) - keys = set(self._mapping.keys()) - return list(base | keys) - - -class IndexedPEFF(TwoLayerIndexedFASTA): - """Creates an :py:class:`IndexedPEFF` object. - - Parameters - ---------- - source : str or file - The file to read. If a file object, it needs to be in *rb* mode. - parse : bool, optional - Defines whether the descriptions should be parsed in the produced tuples. - Default is :py:const:`True`. - kwargs : passed to the :py:class:`TwoLayerIndexedFASTA` constructor. 
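The Header objects surfaced by this class allow attribute-style access to parsed keys; a self-contained sketch using only the Header class above (the key names and values are arbitrary examples):

    from collections import OrderedDict

    from pyteomics.peff import Header

    # Keys of the wrapped mapping double as attributes via __getattr__.
    h = Header(OrderedDict([('Prefix', 'nxp'), ('Tag', 'NX_P02452-1')]))
    print(h.Prefix, h['Tag'])
    # nxp NX_P02452-1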
- """ - - kv_pattern = re.compile(r"\\(?P<key>\S+)=(?P<value>.+?)(?:\s(?=\\)|$)") - header_pattern = re.compile(r"^>?(\S+):(\S+)") - has_feature_index = re.compile(r"^\(?(\d+):") - header_group = 2 - - class _PEFFFeature(SequenceABC): - def __init__(self, *fields, **kwargs): - self.fields = tuple(fields) - self.id = kwargs.get('id') - self.feature_type = kwargs.get("feature_type") - - def __eq__(self, other): - return tuple(self) == tuple(other) - - def __ne__(self, other): - return not (self == other) - - def __getitem__(self, i): - return self.fields[i] - - def __len__(self): - return len(self.fields) - - def __repr__(self): - return repr(tuple(self)) - - def __str__(self): - return "(%s%s)" % ( - '%r:' % self.id if self.id is not None else '', - '|'.join(map(str, self)), ) - - def __init__(self, source, ignore_comments=False, **kwargs): - super(IndexedPEFF, self).__init__( - source, ignore_comments=ignore_comments, parser=self.parser, - header_pattern=self.header_pattern, **kwargs) - self.header_blocks = [] - self.comments = [] - self.version = None - self.number_of_entries = 0 - self._parse_header() - - def _parse_header(self): - self.seek(0) - line = self.readline().decode("ascii") - if not line.startswith("# PEFF"): - raise ValueError("Not a PEFF File") - self.version = tuple(map(int, line.strip()[7:].split("."))) - current_block = defaultdict(list) - in_header = True - while in_header: - line = self.readline().decode("ascii") - if not line.startswith("#"): - in_header = False - line = line.strip()[2:] - if '=' in line: - key, value = line.split("=", 1) - if key == "GeneralComment": - self.comments.append(value) - else: - current_block[key].append(value) - if line.startswith("//"): - if current_block: - self.header_blocks.append( - Header(OrderedDict((k, v if len(v) > 1 else v[0]) - for k, v in current_block.items()))) - current_block = defaultdict(list) - number_of_entries = 0 - for block in self.header_blocks: - try: - number_of_entries += int(block['NumberOfEntries']) - except KeyError: - pass - self.number_of_entries = number_of_entries - - def _extract_parenthesis_list(self, text): - chunks = [] - chunk = [] - paren_level = 0 - i = 0 - n = len(text) - while i < n: - c = text[i] - i += 1 - if c == "(": - if paren_level > 0: - chunk.append(c) - paren_level += 1 - elif c == ")": - if paren_level > 1: - chunk.append(c) - paren_level -= 1 - if paren_level == 0: - if chunk: - chunks.append(chunk) - chunk = [] - else: - chunk.append(c) - chunks = list(map(''.join, chunks)) - return chunks - - def _split_pipe_separated_tuple(self, text): - parts = text.split("|") - return parts - - def _coerce_types(self, key, value): - value = value.strip() - feature_id_match = self.has_feature_index.search(value) - if feature_id_match: - feature_id = int(feature_id_match.group(1)) - value = self.has_feature_index.sub('', value) - else: - feature_id = None - if "|" in value: - value = self._split_pipe_separated_tuple(value) - result = [] - for i, v in enumerate(value): - result.append(self._coerce_value(key, v, i)) - return self._PEFFFeature(*result, feature_type=key, id=feature_id) - else: - return self._coerce_value(key, value, 0) - - def _coerce_value(self, key, value, index): - try: - return int(value) - except ValueError: - pass - try: - return float(value) - except ValueError: - pass - return str(value) - - def parser(self, line): - match = self.header_pattern.match(line) - if not match: - raise ValueError( - "Failed to parse {!r} using {!r}".format( - line, self)) - storage = OrderedDict() - 
prefix = None
- db_uid = None
- if line.startswith(">"):
- line = line[1:]
- prefix, line = line.split(":", 1)
- db_uid, line = line.split(" ", 1)
- storage['Prefix'] = prefix
- storage['Tag'] = db_uid
- for key, value in self.kv_pattern.findall(line):
- if not (value.startswith("(") or " (" in value):
- storage[key] = self._coerce_types(key, value)
- else:
- # multi-value
- storage[key] = [self._coerce_types(key, v) for v in self._extract_parenthesis_list(value)]
- return Header(storage)
diff --git a/pyteomics/pepxml.py b/pyteomics/pepxml.py
deleted file mode 100644
index 813f5749fcabc19c150717230c1e85e0b570c4c2..0000000000000000000000000000000000000000
--- a/pyteomics/pepxml.py
+++ /dev/null
@@ -1,573 +0,0 @@
-"""
-pepxml - pepXML file reader
-===========================
-
-Summary
--------
-
-`pepXML <http://tools.proteomecenter.org/wiki/index.php?title=Formats:pepXML>`_
-was the first widely accepted format for proteomics search engines' output.
-Even though it is to be replaced by the community standard
-`mzIdentML <http://www.psidev.info/index.php?q=node/454>`_, it is still in
-common use.
-
-This module provides minimalistic infrastructure for access to data stored in
-pepXML files. The most important function is :py:func:`read`, which
-reads peptide-spectrum matches and related information and saves them into
-human-readable dicts. This function relies on the terminology of the underlying
-`lxml library <http://lxml.de/>`_.
-
-Data access
------------
-
- :py:class:`PepXML` - a class representing a single pepXML file.
- Other data access functions use this class internally.
-
- :py:func:`read` - iterate through peptide-spectrum matches in a pepXML
- file. Data for a single spectrum are converted to an easy-to-use dict.
-
- :py:func:`chain` - read multiple files at once.
-
- :py:func:`chain.from_iterable` - read multiple files at once, using an
- iterable of files.
-
- :py:func:`DataFrame` - read pepXML files into a :py:class:`pandas.DataFrame`.
-
-Target-decoy approach
----------------------
-
- :py:func:`filter` - filter PSMs from a chain of pepXML files to a specific FDR
- using TDA.
-
- :py:func:`filter.chain` - chain a series of filters applied independently to
- several files.
-
- :py:func:`filter.chain.from_iterable` - chain a series of filters applied
- independently to an iterable of files.
-
- :py:func:`filter_df` - filter pepXML files and return a :py:class:`pandas.DataFrame`.
-
- :py:func:`fdr` - estimate the false discovery rate of a PSM set using the
- target-decoy approach.
-
- :py:func:`qvalues` - get an array of scores and local FDR values for a PSM
- set using the target-decoy approach.
-
- :py:func:`is_decoy` - determine whether a PSM is decoy or not.
-
-Miscellaneous
--------------
-
- :py:func:`roc_curve` - get a receiver-operator curve (min PeptideProphet
- probability in a sample vs. false discovery rate) of PeptideProphet analysis.
-
-Deprecated functions
---------------------
-
- :py:func:`iterfind` - iterate over elements in a pepXML file.
- You can just call the corresponding method of the :py:class:`PepXML`
- object.
-
- :py:func:`version_info` - get information about pepXML version and schema.
- You can just read the corresponding attribute of the :py:class:`PepXML`
- object.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml`. 
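A minimal reading sketch ('interact.pep.xml' is a placeholder file name; the keys used are the standard spectrum_query and search_hit fields):

    from pyteomics import pepxml

    # Stream PSMs one spectrum query at a time.
    with pepxml.read('interact.pep.xml') as reader:
        for psm in reader:
            hit = psm['search_hit'][0]
            print(psm['spectrum'], hit['peptide'], hit['search_score'])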
- -------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from lxml import etree -from . import xml, auxiliary as aux, _schema_defaults - - -class PepXML(xml.MultiProcessingXML, xml.IndexSavingXML): - """Parser class for pepXML files.""" - file_format = 'pepXML' - _root_element = 'msms_pipeline_analysis' - _default_schema = _schema_defaults._pepxml_schema_defaults - _default_version = '1.15' - _default_iter_tag = 'spectrum_query' - _indexed_tags = {'spectrum_query'} - _indexed_tag_keys = {'spectrum_query': 'spectrum'} - _default_id_attr = 'spectrum' - _structures_to_flatten = {'search_score_summary', 'modification_info'} - # attributes which contain unconverted values - _convert_items = {'float': {'calc_neutral_pep_mass', 'massdiff', - 'probability', 'variable', 'static'}, - 'int': {'start_scan', 'end_scan', 'index', 'num_matched_peptides'}, - 'bool': {'is_rejected'}, - 'floatarray': {'all_ntt_prob'}}.items() - - def _get_info_smart(self, element, **kwargs): - """Extract the info in a smart way depending on the element type""" - try: - name = kwargs.pop('ename') - except KeyError: - name = xml._local_name(element) - rec = kwargs.pop('recursive', None) - if name == self._root_element: - info = self._get_info(element, ename=name, recursive=(rec if rec is not None else False), **kwargs) - else: - info = self._get_info(element, ename=name, recursive=(rec if rec is not None else True), **kwargs) - - def safe_float(s): - try: - return float(s) - except ValueError: - if s.startswith('+-0'): - return 0 - return s - - converters = {'float': safe_float, 'int': int, - 'bool': lambda x: x.lower() in {'1', 'true'}, - 'floatarray': lambda x: list(map(float, x[1:-1].split(',')))} - for k, v in dict(info).items(): - for t, s in self._convert_items: - if k in s: - del info[k] - info[k] = converters[t](v) - for k in {'search_score', 'parameter'}: - if k in info and isinstance(info[k], list) and all( - isinstance(x, dict) and len(x) == 1 for x in info[k]): - scores = {} - for score in info[k]: - name, value = score.popitem() - try: - scores[name] = float(value) - except ValueError: - scores[name] = value - info[k] = scores - if 'search_result' in info and len(info['search_result']) == 1: - info.update(info['search_result'][0]) - del info['search_result'] - if 'protein' in info and 'peptide' in info: - info['proteins'] = [{'protein': info.pop('protein'), - 'protein_descr': info.pop('protein_descr', None)}] - for add_key in {'peptide_prev_aa', 'peptide_next_aa', 'protein_mw'}: - if add_key in info: - info['proteins'][0][add_key] = info.pop(add_key) - info['proteins'][0]['num_tol_term'] = info.pop('num_tol_term', 0) - if 'alternative_protein' in info: - info['proteins'].extend(info['alternative_protein']) - del info['alternative_protein'] - if 'peptide' in info and not 'modified_peptide' in info: - info['modified_peptide'] = info['peptide'] - if 'peptide' in info: - 
info['modifications'] = info.pop('mod_aminoacid_mass', [])
- if 'mod_nterm_mass' in info:
- info['modifications'].insert(0, {'position': 0,
- 'mass': float(info.pop('mod_nterm_mass'))})
- if 'mod_cterm_mass' in info:
- info['modifications'].append({'position': 1 + len(info['peptide']),
- 'mass': float(info.pop('mod_cterm_mass'))})
- if 'modified_peptide' in info and info['modified_peptide'] == info.get(
- 'peptide'):
- if not info.get('modifications'):
- info['modifications'] = []
- else:
- mp = info['modified_peptide']
- for mod in sorted(info['modifications'],
- key=lambda m: m['position'],
- reverse=True):
- if mod['position'] not in {0, 1+len(info['peptide'])}:
- p = mod['position']
- mp = mp[:p] + '[{}]'.format(int(mod['mass'])) + mp[p:]
- info['modified_peptide'] = mp
- if 'search_hit' in info:
- info['search_hit'].sort(key=lambda x: x['hit_rank'])
- return info
-
-
-def read(source, read_schema=False, iterative=True, **kwargs):
- """Parse `source` and iterate through peptide-spectrum matches.
-
- Parameters
- ----------
- source : str or file
- A path to a target pepXML file or the file object itself.
-
- read_schema : bool, optional
- If :py:const:`True`, attempt to extract information from the XML schema
- mentioned in the pepXML header. Otherwise, use default parameters.
- Not recommended without an Internet connection or
- if you don't want to see the related warnings.
-
- iterative : bool, optional
- Defines whether iterative parsing should be used. It helps reduce
- memory usage at almost the same parsing speed. Default is
- :py:const:`True`.
-
- Returns
- -------
- out : PepXML
- An iterator over dicts with PSM properties.
- """
-
- return PepXML(source, read_schema=read_schema, iterative=iterative)
-
-
-def iterfind(source, path, **kwargs):
- """Parse `source` and yield info on elements with specified local
- name or by specified "XPath".
-
- .. note:: This function is provided for backward compatibility only.
- If you do multiple :py:func:`iterfind` calls on one file, you should
- create an :py:class:`PepXML` object and use its
- :py:meth:`!iterfind` method.
-
- Parameters
- ----------
- source : str or file
- File name or file-like object.
-
- path : str
- Element name or XPath-like expression. Only local names separated
- with slashes are accepted. An asterisk (`*`) means any element.
- You can specify a single condition in the end, such as:
- ``"/path/to/element[some_value>1.5]"``
- Note: you can do much more powerful filtering using plain Python.
- The path can be absolute or "free". Please don't specify
- namespaces.
-
- recursive : bool, keyword only, optional
- If :py:const:`False`, subelements will not be processed when
- extracting info from elements. Default is :py:const:`True`.
-
- iterative : bool, keyword only, optional
- Specifies whether iterative XML parsing should be used. Iterative
- parsing significantly reduces memory usage and may be just a little
- slower. Default value is :py:const:`True`.
-
- read_schema : bool, keyword only, optional
- If :py:const:`True`, attempt to extract information from the XML schema
- mentioned in the pepXML header. Otherwise, use default parameters.
- Not recommended without an Internet connection or
- if you don't want to see the related warnings. 
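For the common target-decoy workflow, the module-level `filter` defined below is usually more convenient than `iterfind`; a sketch ('interact.pep.xml' is again a placeholder):

    from pyteomics import pepxml

    # Keep PSMs at 1% FDR; by default decoys are recognized by the
    # 'DECOY_' prefix and ranking uses the 'expect' search score.
    passing = list(pepxml.filter('interact.pep.xml', fdr=0.01))
    print(len(passing), 'PSMs pass at 1% FDR')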
- - Returns - ------- - out : iterator - """ - return PepXML(source, **kwargs).iterfind(path, **kwargs) - - -version_info = xml._make_version_info(PepXML) - - -def roc_curve(source): - """Parse source and return a ROC curve for peptideprophet analysis. - - Parameters - ---------- - source : str or file - A path to a target pepXML file or the file object itself. - - Returns - ------- - out : list - A list of ROC points. - """ - - parser = etree.XMLParser(remove_comments=True, ns_clean=True) - tree = etree.parse(source, parser=parser) - - roc_curve = [] - for roc_error_data in tree.xpath( - "/*[local-name()='msms_pipeline_analysis'] \ - //*[local-name()='analysis_summary' and @analysis='peptideprophet'] \ - //*[local-name()='peptideprophet_summary'] \ - //*[local-name()='roc_error_data']"): - for element in roc_error_data.xpath("*[local-name()='roc_data_point' or local-name()='error_point']"): - data_point = dict(element.attrib) - for key in data_point: - data_point[key] = float(data_point[key]) - data_point["charge"] = roc_error_data.attrib["charge"] - data_point["tag"] = etree.QName(element).localname - roc_curve.append(data_point) - - return roc_curve - - -# chain = aux._make_chain(read, 'read') -chain = aux.ChainBase._make_chain(read) - - -def _is_decoy_prefix(psm, prefix='DECOY_'): - """Given a PSM dict, return :py:const:`True` if all protein names for - the PSM start with ``prefix``, and :py:const:`False` otherwise. This - function might not work for some pepXML flavours. Use the source to get the - idea and suit it to your needs. - - Parameters - ---------- - psm : dict - A dict, as yielded by :py:func:`read`. - prefix : str, optional - A prefix used to mark decoy proteins. Default is `'DECOY_'`. - - Returns - ------- - out : bool - """ - return all(protein['protein'].startswith(prefix) - for protein in psm['search_hit'][0]['proteins']) - - -def _is_decoy_suffix(psm, suffix='_DECOY'): - return all(protein['protein'].endswith(suffix) - for protein in psm['search_hit'][0]['proteins']) - - -is_decoy = _is_decoy_prefix -fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix) -_key = lambda x: min(sh['search_score']['expect'] for sh in x['search_hit']) -qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, _key) -filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, _key, qvalues) -filter.chain = aux._make_chain(filter, 'filter', True) - - -def DataFrame(*args, **kwargs): - """Read pepXML output files into a :py:class:`pandas.DataFrame`. - - Requires :py:mod:`pandas`. - - Parameters - ---------- - *args - pepXML file names or objects. Passed to :py:func:`chain`. - - **kwargs - Passed to :py:func:`chain`. - - sep : str or None, keyword only, optional - Some values related to PSMs (such as protein information) are variable-length - lists. If `sep` is a :py:class:`str`, they will be packed into single string using - this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is - :py:const:`None`. - - recursive : bool, keyword only, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - iterative : bool, keyword only, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. 
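
Usage sketches for the module-level helpers defined above (`filter`, `roc_curve`). The input path is a placeholder; with the defaults, decoys are recognized by the `DECOY_` protein-name prefix and the best `expect` value per query serves as the score.

```python
from pyteomics import pepxml

# Retain PSMs at 1% FDR; by default the result is collected rather than streamed.
psms = pepxml.filter('interact.pep.xml', fdr=0.01)

# PeptideProphet ROC points, if an analysis summary is present in the file.
points = pepxml.roc_curve('interact.pep.xml')
```
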
- - read_schema : bool, keyword only, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the mzIdentML header. Otherwise, use default parameters. - Not recommended without Internet connection or - if you don't like to get the related warnings. - - pd_kwargs : dict, optional - Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor. - - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - kwargs = kwargs.copy() - sep = kwargs.pop('sep', None) - pd_kwargs = kwargs.pop('pd_kwargs', {}) - def gen_items(): - with chain(*args, **kwargs) as f: - for item in f: - info = {} - for k, v in item.items(): - if isinstance(v, (str, int, float)): - info[k] = v - if 'search_hit' in item: - sh = item['search_hit'][0] - proteins = sh.pop('proteins') - prot_dict = {} - for p in proteins: - for k in p: - prot_dict[k] = [] - for p in proteins: - for k, v in prot_dict.items(): - v.append(p.get(k)) - if sep is None: - info.update(prot_dict) - else: - for k, v in prot_dict.items(): - info[k] = sep.join(str(val) if val is not None else '' for val in v) - info.update(sh.pop('search_score')) - mods = sh.pop('modifications', []) - formatted_mods = ['{0[mass]:.3f}@{0[position]}'.format(x) for x in mods] - if sep is not None: - info['modifications'] = sep.join(formatted_mods) - else: - info['modifications'] = formatted_mods - for k, v in sh.items(): - if isinstance(v, (str, int, float)): - info[k] = v - if 'analysis_result' in sh: - for ar in sh['analysis_result']: - if ar['analysis'] == 'peptideprophet': - try: - info.update(ar['peptideprophet_result']['parameter']) - except KeyError: - pass - info['peptideprophet_probability'] = ar['peptideprophet_result']['probability'] - info['peptideprophet_ntt_prob'] = ar['peptideprophet_result']['all_ntt_prob'] - elif ar['analysis'] == 'interprophet': - info.update(ar['interprophet_result']['parameter']) - info['interprophet_probability'] = ar['interprophet_result']['probability'] - info['interprophet_ntt_prob'] = ar['interprophet_result']['all_ntt_prob'] - yield info - return pd.DataFrame(gen_items(), **pd_kwargs) - - -def filter_df(*args, **kwargs): - """Read pepXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. - Positional arguments can be pepXML files or DataFrames. Keyword parameter `fdr` is also required. - Other parameters are optional. - - Requires :py:mod:`pandas`. - - Parameters - ---------- - positional args - pepXML file names, file objects, or DataFrames. Passed to :py:func:`DataFrame`. - fdr : float, keyword only, 0 <= fdr <= 1 - Desired FDR level. - key : str / iterable / callable, keyword only, optional - PSM score. Default is 'expect'. - is_decoy : str / iterable / callable, keyword only, optional - Default is to check if all strings in the "protein" column start with `'DECOY_'`. - sep : str or None, keyword only, optional - Some values related to PSMs (such as protein information) are variable-length - lists. If `sep` is a :py:class:`str`, they will be packed into single string using - this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is - :py:const:`None`. - reverse : bool, keyword only, optional - If :py:const:`True`, then PSMs are sorted in descending order, - i.e. the value of the key function is higher for better PSMs. - Default is :py:const:`False`. - decoy_prefix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name prefix to use to detect decoy matches. 
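
A sketch of the `DataFrame` helper above; the path is a placeholder, and score columns such as `expect` depend on which search engine produced the file.

```python
from pyteomics import pepxml

# sep=';' packs variable-length protein lists into single delimited strings.
df = pepxml.DataFrame('interact.pep.xml', sep=';')
print(df[['peptide', 'protein', 'expect']].head())
```
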
If you provide your own - `is_decoy`, or if you specify `decoy_suffix`, this parameter has no effect. - Default is `"DECOY_"`. - decoy_suffix : str, optional - If the default `is_decoy` function works for you, this parameter specifies which - protein name suffix to use to detect decoy matches. If you provide your own - `is_decoy`, this parameter has no effect. Mutually exclusive with `decoy_prefix`. - remove_decoy : bool, keyword only, optional - Defines whether decoy matches should be removed from the output. - Default is :py:const:`True`. - - .. note:: If set to :py:const:`False`, then by default the decoy - PSMs will be taken into account when estimating FDR. Refer to the - documentation of :py:func:`fdr` for math; basically, if - `remove_decoy` is :py:const:`True`, then formula 1 is used - to control output FDR, otherwise it's formula 2. This can be - changed by overriding the `formula` argument. - - formula : int, keyword only, optional - Can be either 1 or 2, defines which formula should be used for FDR - estimation. Default is 1 if `remove_decoy` is :py:const:`True`, - else 2 (see :py:func:`fdr` for definitions). - ratio : float, keyword only, optional - The size ratio between the decoy and target databases. Default is - 1. In theory, the "size" of the database is the number of - theoretical peptides eligible for assignment to spectra that are - produced by *in silico* cleavage of that database. - correction : int or float, keyword only, optional - Possible values are 0, 1 and 2, or floating point numbers between 0 and 1. - - 0 (default): no correction; - - 1: enable "+1" correction. This accounts for the probability that a false - positive scores better than the first excluded decoy PSM; - - 2: this also corrects that probability for finite size of the sample, - so the correction will be slightly less than "+1". - - If a floating point number - is given, then instead of the expectation value for the number of false PSMs, - the confidence value is used. The value of `correction` is then interpreted as - desired confidence level. E.g., if correction=0.95, then the calculated q-values - do not exceed the "real" q-values with 95% probability. - - See `this paper <http://dx.doi.org/10.1021/acs.jproteome.6b00144>`_ for further explanation. - - pep : callable / array-like / iterable / str, keyword only, optional - If callable, a function used to determine the posterior error probability (PEP). - Should accept exactly one argument (PSM) and return a float. - If array-like, should contain float values for all given PSMs. - If string, it is used as a field name (PSMs must be in a record array - or a :py:class:`DataFrame`). - - .. note:: If this parameter is given, then PEP values will be used to calculate - q-values. Otherwise, decoy PSMs will be used instead. This option conflicts with: - `is_decoy`, `remove_decoy`, `formula`, `ratio`, `correction`. - `key` can still be provided. Without `key`, PSMs will be sorted by PEP. - - q_label : str, optional - Field name for q-value in the output. Default is ``'q'``. - - score_label : str, optional - Field name for score in the output. Default is ``'score'``. - - decoy_label : str, optional - Field name for the decoy flag in the output. Default is ``'is decoy'``. - - pep_label : str, optional - Field name for PEP in the output. Default is ``'PEP'``. 
- - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - sep = kwargs.get('sep') - kwargs.setdefault('key', 'expect') - if all(isinstance(arg, pd.DataFrame) for arg in args): - if len(args) > 1: - df = pd.concat(args) - else: - df = args[0] - else: - read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs} - df = DataFrame(*args, **read_kw) - if 'is_decoy' not in kwargs: - if sep is not None: - if 'decoy_suffix' in kwargs: - kwargs['is_decoy'] = df['protein'].str.split(';').apply( - lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) - else: - kwargs['is_decoy'] = df['protein'].str.split(';').apply( - lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) - else: - if 'decoy_suffix' in kwargs: - kwargs['is_decoy'] = df['protein'].apply( - lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) - else: - kwargs['is_decoy'] = df['protein'].apply( - lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) - return aux.filter(df, **kwargs) diff --git a/pyteomics/proforma.py b/pyteomics/proforma.py deleted file mode 100644 index c24792baf8c2005fd618ff33ed0a79eba920591b..0000000000000000000000000000000000000000 --- a/pyteomics/proforma.py +++ /dev/null @@ -1,2372 +0,0 @@ -''' -proforma - Proteoform and Peptidoform Notation -============================================== - -ProForma is a notation for defining modified amino acid sequences using -a set of controlled vocabularies, as well as encoding uncertain or partial -information about localization. See `ProForma specification <https://www.psidev.info/proforma>`_ -for more up-to-date information. - -For more details, see the :mod:`pyteomics.proforma` online. -''' - -import re -import warnings -from collections import deque, namedtuple -from functools import partial -from array import array as _array - -try: - from enum import Enum -except ImportError: - # Python 2 doesn't have a builtin Enum type - Enum = object - -from .mass import Composition, std_aa_mass, Unimod, nist_mass, calculate_mass, std_ion_comp, mass_charge_ratio -from .auxiliary import PyteomicsError, BasicComposition -from .auxiliary.utils import add_metaclass - -try: - import numpy as np -except ImportError: - np = None - -try: - from psims.controlled_vocabulary.controlled_vocabulary import (load_psimod, load_xlmod, load_gno, obo_cache, load_unimod) - _has_psims = True -except ImportError: - def _needs_psims(name): - raise ImportError("Loading %s requires the `psims` library. To access it, please install `psims`" % name) - - load_psimod = partial(_needs_psims, 'PSIMOD') - load_xlmod = partial(_needs_psims, 'XLMOD') - load_gno = partial(_needs_psims, 'GNO') - load_unimod = partial(_needs_psims, 'UNIMOD') - obo_cache = None - _has_psims = False - -_WATER_MASS = calculate_mass(formula="H2O") - -std_aa_mass = std_aa_mass.copy() -std_aa_mass['X'] = 0 - -element_symbols = set(nist_mass) -element_symbols.remove("e*") -element_symbols.add('e') - - -class ProFormaError(PyteomicsError): - def __init__(self, message, index=None, parser_state=None, **kwargs): - super(ProFormaError, self).__init__(PyteomicsError, message, index, parser_state) - self.message = message - self.index = index - self.parser_state = parser_state - - -class PrefixSavingMeta(type): - '''A subclass-registering-metaclass that provides easy - lookup of subclasses by prefix attributes. 
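
Before the `proforma.py` deletion begins, a one-call sketch for the `filter_df` helper above, which reads, computes q-values, and filters in one step (placeholder path; `'expect'` is the default score key).

```python
from pyteomics import pepxml

# Equivalent to DataFrame(...) followed by q-value filtering at 1% FDR.
filtered = pepxml.filter_df('interact.pep.xml', fdr=0.01)
```
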
- ''' - - def __new__(mcs, name, parents, attrs): - new_type = type.__new__(mcs, name, parents, attrs) - prefix = attrs.get("prefix_name") - if prefix: - new_type.prefix_map[prefix.lower()] = new_type - short = attrs.get("short_prefix") - if short: - new_type.prefix_map[short.lower()] = new_type - return new_type - - def find_by_tag(self, tag_name): - if tag_name is None: - raise ValueError("tag_name cannot be None!") - tag_name = tag_name.lower() - return self.prefix_map[tag_name] - - -class TagTypeEnum(Enum): - unimod = 0 - psimod = 1 - massmod = 2 - generic = 3 - info = 4 - gnome = 5 - xlmod = 6 - - formula = 7 - glycan = 8 - - localization_marker = 9 - position_label = 10 - group_placeholder = 999 - - -class ModificationTagStyle(Enum): - Unset = 0 - ShortId = 1 - LongId = 2 - ShortName = 3 - LongName = 4 - - -_sentinel = object() - - -class ModificationMassNotFoundError(ProFormaError): - pass - - -class UnknownMonosaccharideError(ProFormaError): - pass - - -@add_metaclass(PrefixSavingMeta) -class TagBase(object): - '''A base class for all tag types. - - Attributes - ---------- - type: Enum - An element of :class:`TagTypeEnum` saying what kind of tag this is. - value: object - The data stored in this tag, usually an externally controlled name - extra: list - Any extra tags that were nested within this tag. Usually limited to INFO - tags but may be other synonymous controlled vocabulary terms. - group_id: str or None - A short label denoting which group, if any, this tag belongs to - ''' - __slots__ = ("type", "value", "extra", "group_id") - - prefix_name = None - short_prefix = None - prefix_map = {} - - def __init__(self, type, value, extra=None, group_id=None): - self.type = type - self.value = value - self.extra = extra - self.group_id = group_id - - def __str__(self): - part = self._format_main() - had_marker = False - if self.extra: - rest = [] - for e in self.extra: - rest.append(str(e)) - had_marker |= isinstance(e, GroupLabelBase) and e.group_id == self.group_id - label = '|'.join([part] + rest) - else: - label = part - if self.group_id and not had_marker: - label = '%s%s' % (label, self.group_id) - return '%s' % label - - def __repr__(self): - template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})" - return template.format(self=self) - - def __eq__(self, other): - if other is None: - return False - if isinstance(other, str): - return str(self) == other - return (self.type == other.type) and (self.value == other.value) and (self.extra == other.extra) \ - and (self.group_id == other.group_id) - - def __ne__(self, other): - return not self == other - - def find_tag_type(self, tag_type): - '''Search this tag or tag collection for elements with a particular - tag type and return them. - - Parameters - ---------- - tag_type : TagTypeEnum - A label from :class:`TagTypeEnum`, or an equivalent type. - - Returns - ------- - matches : list - The list of all tags in this object which match the requested tag type. 
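
The metaclass above registers every tag subclass under its `prefix_name` and `short_prefix`, lowercased. A small sketch of the case-insensitive lookup (the classes referenced are defined later in this module):

```python
from pyteomics import proforma

# Both the long and the one-letter prefix resolve to the same class.
assert proforma.TagBase.find_by_tag('UNIMOD') is proforma.UnimodModification
assert proforma.TagBase.find_by_tag('u') is proforma.UnimodModification
```
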
- ''' - out = [] - if self.type == tag_type: - out.append(self) - if not self.extra: - return out - for e in self.extra: - if e.type == tag_type: - out.append(e) - return out - - @classmethod - def parse(cls, buffer): - return process_tag_tokens(buffer) - - -class GroupLabelBase(TagBase): - __slots__ = () - - def __str__(self): - part = self._format_main() - if self.extra: - rest = [str(e) for e in self.extra] - label = '|'.join([part] + rest) - else: - label = part - return '%s' % label - - -class PositionLabelTag(GroupLabelBase): - '''A tag to mark that a position is involved in a group in some way, but does - not imply any specific semantics. - ''' - __slots__ = () - - def __init__(self, value=None, extra=None, group_id=None): - assert group_id is not None - value = group_id - super(PositionLabelTag, self).__init__( - TagTypeEnum.position_label, value, extra, group_id) - - def _format_main(self): - return "{self.group_id}".format(self=self) - - -class LocalizationMarker(GroupLabelBase): - '''A tag to mark a particular localization site - ''' - __slots__ = () - - def __init__(self, value, extra=None, group_id=None): - assert group_id is not None - super(LocalizationMarker, self).__init__( - TagTypeEnum.localization_marker, float(value), extra, group_id) - - def _format_main(self): - return "{self.group_id}({self.value:.4g})".format(self=self) - - -class InformationTag(TagBase): - '''A tag carrying free text describing the location - ''' - __slots__ = () - - prefix_name = "INFO" - - def __init__(self, value, extra=None, group_id=None): - super(InformationTag, self).__init__( - TagTypeEnum.info, str(value), extra, group_id) - - def _format_main(self): - return str(self.value) - - -class ModificationResolver(object): - def __init__(self, name, **kwargs): - self.name = name.lower() - self.symbol = self.name[0] - self._database = None - - def load_database(self): - raise NotImplementedError() - - @property - def database(self): - if not self._database: - self._database = self.load_database() - return self._database - - @database.setter - def database(self, database): - self._database = database - - def parse_identifier(self, identifier): - """Parse a string that is either a CV prefixed identifier or name. - - Parameters - ---------- - identifier : str - The identifier string to parse, removing CV prefix as needed. - - Returns - ------- - name : str, optional - A textual identifier embedded in the qualified identifier, if any, otherwise - :const:`None`. - id : int, optional - An integer ID embedded in the qualified identifier, if any, otherwise - :const:`None`. 
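
A sketch of the marker tags above; `'#g1'` is an arbitrary group label, and the `{:.4g}` format in `_format_main` governs how the localization score is rendered.

```python
from pyteomics import proforma

marker = proforma.LocalizationMarker(0.952, group_id='#g1')
str(marker)  # '#g1(0.952)'
```
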
- """ - tokens = identifier.split(":", 1) - if len(tokens) > 1: - prefix = tokens[0].lower() - if prefix == self.name or prefix == self.symbol: - identifier = tokens[1] - - if identifier.isdigit(): - id = int(identifier) - name = None - else: - name = identifier - id = None - return name, id - - def resolve(self, name=None, id=None, **kwargs): - raise NotImplementedError() - - def __call__(self, name=None, id=None, **kwargs): - return self.resolve(name, id, **kwargs) - - def __eq__(self, other): - return self.name == other.name - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash(self.name) - - -class UnimodResolver(ModificationResolver): - def __init__(self, **kwargs): - super(UnimodResolver, self).__init__("unimod", **kwargs) - self._database = kwargs.get("database") - self.strict = kwargs.get("strict", True) - - def load_database(self): - if _has_psims: - return obo_cache.resolve("http://www.unimod.org/obo/unimod.obo") - return Unimod() - - def resolve(self, name=None, id=None, **kwargs): - strict = kwargs.get("strict", self.strict) - exhaustive = kwargs.get("exhaustive", True) - if name is not None: - defn = self.database.by_title(name, strict=strict) - if not defn: - defn = self.database.by_name(name, strict=strict) - if not defn and exhaustive and strict: - defn = self.database.by_title(name, strict=False) - if not defn: - defn = self.database.by_name(name, strict=False) - if defn and isinstance(defn, list): - warnings.warn( - "Multiple matches found for {!r} in Unimod, taking the first, {}.".format( - name, defn[0]['record_id'])) - defn = defn[0] - if not defn: - raise KeyError(name) - elif id is not None: - defn = self.database[id] - if not defn: - raise KeyError(id) - else: - raise ValueError("Must provide one of `name` or `id`") - if isinstance(defn, dict): - return { - 'composition': defn['composition'], - 'name': defn['title'], - 'id': defn['record_id'], - 'mass': defn['mono_mass'], - 'provider': self.name, - "source": self - } - else: - name = defn.ex_code_name - if not name: - name = defn.code_name - return { - "composition": defn.composition, - "name": name, - "id": defn.id, - "mass": defn.monoisotopic_mass, - "provider": self.name, - "source": self - } - - -class PSIModResolver(ModificationResolver): - def __init__(self, **kwargs): - super(PSIModResolver, self).__init__('psimod', **kwargs) - self._database = kwargs.get("database") - - def load_database(self): - return load_psimod() - - def resolve(self, name=None, id=None, **kwargs): - if name is not None: - defn = self.database[name] - elif id is not None: - defn = self.database['MOD:{:05d}'.format(id)] - else: - raise ValueError("Must provide one of `name` or `id`") - try: - mass = float(defn.DiffMono) - except (KeyError, TypeError, ValueError): - raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn)) - if defn.DiffFormula is not None: - composition = Composition() - diff_formula_tokens = defn.DiffFormula.strip().split(" ") - for i in range(0, len(diff_formula_tokens), 2): - element = diff_formula_tokens[i] - count = diff_formula_tokens[i + 1] - if count: - count = int(count) - if element.startswith("("): - j = element.index(")") - isotope = element[1:j] - element = "%s[%s]" % (element[j + 1:], isotope) - composition[element] += count - else: - composition = None - warnings.warn("No formula was found for %r in PSI-MOD, composition will be missing" % ((name, id), )) - return { - 'mass': mass, - 'composition': composition, - 'name': 
defn.name, - 'id': defn.id, - 'provider': self.name, - "source": self - } - - -class XLMODResolver(ModificationResolver): - def __init__(self, **kwargs): - super(XLMODResolver, self).__init__('xlmod', **kwargs) - self._database = kwargs.get("database") - - def load_database(self): - return load_xlmod() - - def resolve(self, name=None, id=None, **kwargs): - if name is not None: - defn = self.database[name] - elif id is not None: - defn = self.database['XLMOD:{:05d}'.format(id)] - else: - raise ValueError("Must provide one of `name` or `id`") - try: - mass = float(defn['monoIsotopicMass']) - except (KeyError, TypeError, ValueError): - raise ModificationMassNotFoundError("Could not resolve the mass of %r from %r" % ((name, id), defn)) - if 'deadEndFormula' in defn: - composition = Composition(defn['deadEndFormula'].replace(" ", '').replace("D", "H[2]")) - elif 'bridgeFormula' in defn: - composition = Composition( - defn['bridgeFormula'].replace(" ", '').replace("D", "H[2]")) - return { - 'mass': mass, - 'composition': composition, - 'name': defn.name, - 'id': defn.id, - 'provider': self.name, - "source": self - } - -# TODO: Implement resolve walking up the graph to get the mass. Can't really -# get any more information without glypy/glyspace interaction -class GNOResolver(ModificationResolver): - mass_pattern = re.compile(r"(\d+(:?\.\d+)) Da") - - def __init__(self, **kwargs): - super(GNOResolver, self).__init__('gnome', **kwargs) - self._database = kwargs.get("database") - - def load_database(self): - return load_gno() - - def get_mass_from_glycan_composition(self, term): - '''Parse the Byonic-style glycan composition from property GNO:00000202 - to get the counts of each monosaccharide and use that to calculate mass. - - The mass computed here is exact and dehydrated, distinct from the rounded-off - mass that :meth:`get_mass_from_term` will produce by walking up the CV term - hierarchy. However, not all glycan compositions are representable in GNO:00000202 - format, so this may silently be absent or incomplete, hence the double-check in - :meth:`get_mass_from_term`. - - Parameters - ---------- - term : psims.controlled_vocabulary.Entity - The CV entity being parsed. - - Returns - ------- - mass : float or :const:`None` - If a glycan composition is found on the term, the computed - mass will be returned. Otherwise the :const:`None` is returned - ''' - val = term.get('GNO:00000202') - monosaccharides = BasicComposition() - composition = Composition() - if val: - tokens = re.findall(r"([A-Za-z0-9]+)\((\d+)\)", val) - mass = 0.0 - for symbol, count in tokens: - count = int(count) - try: - mono_mass, mono_comp, symbol = GlycanModification.valid_monosaccharides[symbol] - mass += mono_mass * count - composition += mono_comp * count - monosaccharides[symbol] += count - except KeyError: - continue - return mass, monosaccharides, composition - return None, None, None - - def get_mass_from_term(self, term, raw_mass): - '''Walk up the term hierarchy and find the mass group - term near the root of the tree, and return the most accurate - mass available for the provided term. - - The mass group term's mass is rounded to two decimal places, leading - to relatively large errors. - - Parameters - ---------- - term : psims.controlled_vocabulary.Entity - The CV entity being parsed. - - Returns - ------- - mass : float or :const:`None` - If a root node is found along the term's lineage, computed - mass will be returned. Otherwise the :const:`None` is returned. 
-            The mass may only be a rough approximation when the term's
-            glycan composition is unavailable.
-        '''
-        root_id = 'GNO:00000001'
-        parent = term.parent()
-        if isinstance(parent, list):
-            parent = parent[0]
-        while parent.id != root_id:
-            next_parent = parent.parent()
-            if isinstance(next_parent, list):
-                next_parent = next_parent[0]
-            if next_parent.id == root_id:
-                break
-            parent = next_parent
-        match = self.mass_pattern.search(parent.name)
-        if not match:
-            return None
-        # This will have a small mass error.
-        rough_mass = float(match.group(1)) - _WATER_MASS
-        if raw_mass is not None and abs(rough_mass - raw_mass) < 1:
-            return raw_mass
-        warnings.warn(
-            ("An accurate glycan composition could not be inferred from %s. "
-             "Only a rough approximation is available.") % (term, ))
-        return rough_mass
-
-    def resolve(self, name=None, id=None, **kwargs):
-        if name is not None:
-            term = self.database[name]
-        elif id is not None:
-            term = self.database[id]
-        else:
-            raise ValueError("Must provide one of `name` or `id`")
-        raw_mass, monosaccharides, composition = self.get_mass_from_glycan_composition(term)
-
-        rec = {
-            "name": term.name,
-            "id": term.id,
-            "provider": self.name,
-            "composition": composition,
-            "monosaccharides": monosaccharides,
-            "mass": self.get_mass_from_term(term, raw_mass),
-            "source": self
-        }
-        return rec
-
-
-class GenericResolver(ModificationResolver):
-
-    def __init__(self, resolvers, **kwargs):
-        super(GenericResolver, self).__init__('generic', **kwargs)
-        self.resolvers = list(resolvers)
-
-    def load_database(self):
-        return None
-
-    def parse_identifier(self, identifier):
-        """Parse a string that is either a CV prefixed identifier or name.
-
-        Does no parsing, as a :class:`GenericModification` is never qualified.
-
-        Parameters
-        ----------
-        identifier : str
-            The identifier string to parse, removing CV prefix as needed.
-
-        Returns
-        -------
-        name : str, optional
-            A textual identifier embedded in the qualified identifier, if any, otherwise
-            :const:`None`.
-        id : int, optional
-            An integer ID embedded in the qualified identifier, if any, otherwise
-            :const:`None`.
-        """
-        return identifier, None
-
-    def resolve(self, name=None, id=None, **kwargs):
-        defn = None
-        for resolver in self.resolvers:
-            try:
-                defn = resolver(name=name, id=id, **kwargs)
-                break
-            except KeyError:
-                continue
-            except ModificationMassNotFoundError:
-                warnings.warn("Could not resolve the mass for %r in %r" % ((name, id), resolver))
-                continue
-        if defn is None:
-            if name is None:
-                raise KeyError(id)
-            elif id is None:
-                raise KeyError(name)
-            else:
-                raise ValueError("Must provide one of `name` or `id`")
-        return defn
-
-
-class ModificationBase(TagBase):
-    '''A base class for all modification tags with marked prefixes.
-
-    While :class:`ModificationBase` is hashable, its equality testing
-    brings in additional tag-related information. For pure modification
-    identity comparison, use :attr:`key` to get a :class:`ModificationToken`
-    free of these concerns.
-    '''
-
-    _tag_type = None
-    __slots__ = ('_definition', 'style')
-
-    def __init__(self, value, extra=None, group_id=None, style=None):
-        if style is None:
-            style = ModificationTagStyle.Unset
-        super(ModificationBase, self).__init__(
-            self._tag_type, value, extra, group_id)
-        self._definition = None
-        self.style = style
-
-    def __eq__(self, other):
-        if isinstance(other, ModificationToken):
-            return other == self
-        return super(ModificationBase, self).__eq__(other)
-
-    def __hash__(self):
-        return hash((self.id, self.provider))
-
-    @property
-    def key(self):
-        '''Get a safe-to-hash-and-compare :class:`ModificationToken`
-        representing this modification without tag-like properties.
-
-        Returns
-        -------
-        ModificationToken
-        '''
-        return ModificationToken(self.value, self.id, self.provider, self.__class__)
-
-    @property
-    def definition(self):
-        '''A :class:`dict` of properties describing this modification, given
-        by the providing controlled vocabulary. This value is cached, and
-        should not be modified.
-
-        Returns
-        -------
-        dict
-        '''
-        if self._definition is None:
-            self._definition = self.resolve()
-        return self._definition
-
-    @property
-    def mass(self):
-        '''The monoisotopic mass shift this modification applies.
-
-        Returns
-        -------
-        float
-        '''
-        return self.definition['mass']
-
-    @property
-    def composition(self):
-        '''The chemical composition shift this modification applies.'''
-        return self.definition.get('composition')
-
-    @property
-    def id(self):
-        '''The unique identifier given to this modification by its provider.
-
-        Returns
-        -------
-        str or int
-        '''
-        return self.definition.get('id')
-
-    @property
-    def name(self):
-        '''The primary name of this modification from its provider.
-
-        Returns
-        -------
-        str
-        '''
-        return self.definition.get('name')
-
-    @property
-    def provider(self):
-        '''The name of the controlled vocabulary that provided this
-        modification.
-
-        Returns
-        -------
-        str
-        '''
-        return self.definition.get('provider')
-
-    def _populate_from_definition(self, definition):
-        self._definition = definition
-
-    def _format_main(self):
-        if self.style == ModificationTagStyle.Unset or self.style is None:
-            return "{self.prefix_name}:{self.value}".format(self=self)
-        elif self.style == ModificationTagStyle.LongId:
-            return "{self.prefix_name}:{self.id}".format(self=self)
-        elif self.style == ModificationTagStyle.ShortId:
-            return "{self.short_prefix}:{self.id}".format(self=self)
-        elif self.style == ModificationTagStyle.LongName:
-            return "{self.prefix_name}:{self.name}".format(self=self)
-        elif self.style == ModificationTagStyle.ShortName:
-            return "{self.short_prefix}:{self.name}".format(self=self)
-        else:
-            warnings.warn("Unknown formatting style {!r}".format(self.style))
-            return "{self.prefix_name}:{self.value}".format(self=self)
-
-    def resolve(self):
-        '''Find the term and return its properties.
-        '''
-        keys = self.resolver.parse_identifier(self.value)
-        return self.resolver(*keys)
-
-
-class MassModification(TagBase):
-    '''A modification defined purely by a signed mass shift in Daltons.
- - The value of a :class:`MassModification` is always a :class:`float` - ''' - __slots__ = ('_significant_figures', ) - - prefix_name = "Obs" - - def __init__(self, value, extra=None, group_id=None): - if isinstance(value, str): - sigfigs = len(value.split('.')[-1].rstrip('0')) - else: - sigfigs = 4 - self._significant_figures = sigfigs - super(MassModification, self).__init__( - TagTypeEnum.massmod, float(value), extra, group_id) - - def _format_main(self): - if self.value >= 0: - return ('+{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.') - else: - return ('{0:0.{1}f}'.format(self.value, self._significant_figures)).rstrip('0').rstrip('.') - - @property - def provider(self): - return None - - @property - def id(self): - return self._format_main() - - @property - def key(self): - '''Get a safe-to-hash-and-compare :class:`ModificationToken` - representing this modification without tag-like properties. - - Returns - -------- - ModificationToken - ''' - return ModificationToken(self.value, self.id, self.provider, self.__class__) - - @property - def mass(self): - return self.value - - def __eq__(self, other): - if isinstance(other, ModificationToken): - return other == self - return super(MassModification, self).__eq__(other) - - def __hash__(self): - return hash((self.id, self.provider)) - - -class FormulaModification(ModificationBase): - prefix_name = "Formula" - - isotope_pattern = re.compile(r'\[(?P<isotope>\d+)(?P<element>[A-Z][a-z]*)(?P<quantity>[\-+]?\d+)\]') - _tag_type = TagTypeEnum.formula - - def _normalize_isotope_notation(self, match): - '''Rewrite ProForma isotope notation to Pyteomics-compatible - isotope notation. - - Parameters - ---------- - match : Match - The matched isotope notation string parsed by the regular expression. - - Returns - reformatted : str - The re-written isotope notation - ''' - parts = match.groupdict() - return "{element}[{isotope}]{quantity}".format(**parts) - - def resolve(self): - normalized = self.value.replace(' ', '') - # If there is a [ character in the formula, we know there are isotopes which - # need to be normalized. 
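
A sketch of `MassModification` above; note how the significant figures counted from the input string survive round-tripping back to text.

```python
from pyteomics import proforma

m = proforma.MassModification('+15.995')
m.mass   # 15.995
str(m)   # '+15.995', rendered with the 3 decimal places seen in the input
```
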
- if '[' in normalized: - normalized = self.isotope_pattern.sub(self._normalize_isotope_notation, normalized) - composition = Composition(formula=normalized) - return { - "mass": composition.mass(), - "composition": composition, - "name": self.value - } - - -monosaccharide_description = namedtuple('monosaccharide_description', ('mass', 'composition', "symbol")) - - -class GlycanModification(ModificationBase): - prefix_name = "Glycan" - - _tag_type = TagTypeEnum.glycan - - valid_monosaccharides = { - "Hex": monosaccharide_description(162.0528, Composition("C6H10O5"), 'Hex'), - "HexNAc": monosaccharide_description(203.0793, Composition("C8H13N1O5"), 'HexNAc'), - "HexS": monosaccharide_description(242.009, Composition("C6H10O8S1"), 'HexS'), - "HexP": monosaccharide_description(242.0191, Composition("C6H11O8P1"), 'HexP'), - "HexNAcS": monosaccharide_description(283.0361, Composition("C8H13N1O8S1"), 'HexNAcS'), - "dHex": monosaccharide_description(146.0579, Composition("C6H10O4"), 'dHex'), - "NeuAc": monosaccharide_description(291.0954, Composition("C11H17N1O8"), 'NeuAc'), - "NeuGc": monosaccharide_description(307.0903, Composition("C11H17N1O9"), 'NeuGc'), - "Pen": monosaccharide_description(132.0422, Composition("C5H8O4"), 'Pen'), - "Fuc": monosaccharide_description(146.0579, Composition("C6H10O4"), 'Fuc') - } - - valid_monosaccharides['Neu5Ac'] = valid_monosaccharides['NeuAc'] - valid_monosaccharides['Neu5Gc'] = valid_monosaccharides['NeuGc'] - valid_monosaccharides['Pent'] = valid_monosaccharides['Pen'] - valid_monosaccharides['d-Hex'] = valid_monosaccharides['dHex'] - - monomer_tokenizer = re.compile( - r"|".join(sorted(valid_monosaccharides.keys(), key=len, reverse=True))) - tokenizer = re.compile(r"(%s|[A-Za-z]+)\s*(\d*)\s*" % monomer_tokenizer.pattern) - - @property - def monosaccharides(self): - return self.definition.get('monosaccharides') - - def resolve(self): - composite = BasicComposition() - for tok, cnt in self.tokenizer.findall(self.value): - if cnt: - cnt = int(cnt) - else: - cnt = 1 - if tok not in self.valid_monosaccharides: - parts = self.monomer_tokenizer.findall(tok) - t = 0 - for p in parts: - if p not in self.valid_monosaccharides: - break - t += len(p) - if t != len(tok): - raise ValueError("{tok!r} is not a valid monosaccharide name".format(tok=tok)) - else: - for p in parts[:-1]: - sym = self.valid_monosaccharides[p].symbol - composite[sym] += 1 - sym = self.valid_monosaccharides[parts[-1]].symbol - composite[sym] += cnt - else: - sym = self.valid_monosaccharides[tok].symbol - composite[sym] += cnt - mass = 0 - chemcomp = Composition() - for key, cnt in composite.items(): - try: - m, c, sym = self.valid_monosaccharides[key] - except KeyError: - raise UnknownMonosaccharideError(key) - mass += m * cnt - chemcomp += c * cnt - return { - "mass": mass, - "composition": chemcomp, - "name": self.value, - "monosaccharides": composite - } - - -class UnimodModification(ModificationBase): - __slots__ = () - - resolver = UnimodResolver() - - prefix_name = "UNIMOD" - short_prefix = "U" - _tag_type = TagTypeEnum.unimod - - -class PSIModModification(ModificationBase): - __slots__ = () - - resolver = PSIModResolver() - - prefix_name = "MOD" - short_prefix = 'M' - _tag_type = TagTypeEnum.psimod - - -class GNOmeModification(ModificationBase): - __slots__ = () - - resolver = GNOResolver() - - prefix_name = "GNO" - short_prefix = 'G' - _tag_type = TagTypeEnum.gnome - - @property - def monosaccharides(self): - return self.definition.get('monosaccharides') - - -class 
XLMODModification(ModificationBase): - __slots__ = () - - resolver = XLMODResolver() - - prefix_name = "XLMOD" - # short_prefix = 'XL' - _tag_type = TagTypeEnum.xlmod - - -class GenericModification(ModificationBase): - __slots__ = () - _tag_type = TagTypeEnum.generic - resolver = GenericResolver([ - # Do exact matching here first. Then default to non-strict matching as a final - # correction effort. - partial(UnimodModification.resolver, exhaustive=False), - PSIModModification.resolver, - XLMODModification.resolver, - GNOmeModification.resolver, - # Some really common names aren't actually found in the XML exactly, so default - # to non-strict matching now to avoid masking other sources here. - partial(UnimodModification.resolver, strict=False) - ]) - - def __init__(self, value, extra=None, group_id=None): - super(GenericModification, self).__init__( - value, extra, group_id) - - def _format_main(self): - return self.value - - def resolve(self): - '''Find the term, searching through all available vocabularies and - return the first match's properties - ''' - keys = self.resolver.parse_identifier(self.value) - defn = self.resolver(*keys) - if defn is not None: - return defn - raise KeyError(keys) - - -def set_unimod_path(path): - '''Set the path to load the Unimod database from for resolving - ProForma Unimod modifications. - - .. note:: - - This method ensures that the Unimod modification database loads - quickly from a local database file instead of downloading a new - copy from the internet. - - Parameters - ---------- - path : str or file-like object - A path to or file-like object for the "unimod.xml" file. - - Returns - ------- - :class:`~pyteomics.mass.mass.Unimod` - ''' - db = Unimod(path) - UnimodModification.resolver.database = db - return db - - -class ModificationToken(object): - '''Describes a particular modification from a particular provider, independent - of a :class:`TagBase`'s state. - - This class is meant to be used in place of a :class:`ModificationBase` object - when equality testing and hashing is desired, but do not want extra properties - to be involved. - - :class:`ModificationToken` is comparable and hashable, and can be compared with - :class:`ModificationBase` subclass instances safely. It can be called to create - a new instance of the :class:`ModificationBase` it is equal to. - - Attributes - ---------- - name : str - The name of the modification being represented, as the user specified it. - id : int or str - Whatever unique identifier the providing controlled vocabulary gave to this - modification - provider : str - The name of the providing controlled vocabulary. - source_cls : type - A sub-class of :class:`ModificationBase` that will be used to fulfill this - token if requested, providing it a resolver. - ''' - __slots__ = ('name', 'id', 'provider', 'source_cls') - - def __init__(self, name, id, provider, source_cls): - self.name = name - self.id = id - self.provider = provider - self.source_cls = source_cls - - def __eq__(self, other): - if other is None: - return False - if isinstance(other, (ModificationToken, ModificationBase, MassModification)): - return self.id == other.id and self.provider == other.provider - return False - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash((self.id, self.provider)) - - def __call__(self): - '''Create a new :class:`ModificationBase` - instance from the provided :attr:`name` - against :attr:`source_cls`'s resolver. 
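
Two sketches for the modification classes above. `GlycanModification` resolves offline from the monosaccharide table; `set_unimod_path` points the Unimod resolver at a local copy ('unimod.xml' is a placeholder path) so that `.mass` does not trigger a download.

```python
from pyteomics import proforma

g = proforma.GlycanModification('Hex5HexNAc2')
round(g.mass, 4)     # 1216.4226, summed from the monosaccharide table above
g.monosaccharides    # counts: {'Hex': 5, 'HexNAc': 2}

proforma.set_unimod_path('unimod.xml')  # hypothetical local database file
ox = proforma.UnimodModification('Oxidation')
ox.mass              # resolved lazily against the local database
```
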
- - Returns - ------- - ModificationBase - ''' - return self.source_cls(self.name) - - def __repr__(self): - template = "{self.__class__.__name__}({self.name!r}, {self.id!r}, {self.provider!r}, {self.source_cls})" - return template.format(self=self) - - -def split_tags(tokens): - '''Split a token array into discrete sets of tag - tokens. - - Parameters - ---------- - tokens: list - The characters of the tag token buffer - - Returns - ------- - list of list: - The tokens for each contained tag - ''' - starts = [0] - ends = [] - for i, c in enumerate(tokens): - if c == '|': - ends.append(i) - starts.append(i + 1) - elif (i != 0 and c == '#'): - ends.append(i) - starts.append(i) - ends.append(len(tokens)) - out = [] - for i, start in enumerate(starts): - end = ends[i] - tag = tokens[start:end] - if len(tag) == 0: - continue - # Short circuit on INFO tags which can't be broken - # if (tag[0] == 'i' and tag[:5] == ['i', 'n', 'f', 'o', ':']) or (tag[0] == 'I' and tag[:5] == ['I', 'N', 'F', 'O', ':']): - # tag = tokens[start:] - # out.append(tag) - # break - out.append(tag) - return out - - -def find_prefix(tokens): - '''Find the prefix, if any of the tag defined by `tokens` - delimited by ":". - - Parameters - ---------- - tokens: list - The tag tokens to search - - Returns - ------- - prefix: str or None - The prefix string, if found - rest: str - The rest of the tokens, merged as a string - ''' - for i, c in enumerate(tokens): - if c == ':': - return ''.join(tokens[:i]), ''.join(tokens[i + 1:]) - return None, ''.join(tokens) - - -def process_marker(tokens): - '''Process a marker, which is a tag whose value starts with #. - - Parameters - ---------- - tokens: list - The tag tokens to parse - - Returns - ------- - PositionLabelTag or LocalizationMarker - ''' - if tokens[1:3] == 'XL': - return PositionLabelTag(None, group_id=''.join(tokens)) - else: - group_id = None - value = None - for i, c in enumerate(tokens): - if c == '(': - group_id = ''.join(tokens[:i]) - if tokens[-1] != ')': - raise Exception( - "Localization marker with score missing closing parenthesis") - value = float(''.join(tokens[i + 1:-1])) - return LocalizationMarker(value, group_id=group_id) - else: - group_id = ''.join(tokens) - return PositionLabelTag(group_id=group_id) - - -def process_tag_tokens(tokens): - '''Convert a tag token buffer into a parsed :class:`TagBase` instance - of the appropriate sub-type with zero or more sub-tags. 
- - Parameters - ---------- - tokens: list - The tokens to parse - - Returns - ------- - TagBase: - The parsed tag - ''' - parts = split_tags(tokens) - main_tag = parts[0] - if main_tag[0] in ('+', '-'): - main_tag = ''.join(main_tag) - main_tag = MassModification(main_tag) - elif main_tag[0] == '#': - main_tag = process_marker(main_tag) - else: - prefix, value = find_prefix(main_tag) - if prefix is None: - main_tag = GenericModification(''.join(value)) - else: - try: - tag_type = TagBase.find_by_tag(prefix) - main_tag = tag_type(value) - except KeyError: - main_tag_str = ''.join(main_tag) - main_tag = GenericModification(main_tag_str) - - if len(parts) > 1: - extras = [] - for part in parts[1:]: - prefix, value = find_prefix(part) - if prefix is None: - if value[0] == "#": - marker = process_marker(value) - if isinstance(marker, PositionLabelTag): - main_tag.group_id = ''.join(value) - else: - main_tag.group_id = marker.group_id - extras.append(marker) - else: - extras.append(GenericModification(''.join(value))) - else: - try: - tag_type = TagBase.find_by_tag(prefix) - extra_tag = tag_type(value) - except KeyError: - part_str = ''.join(part) - extra_tag = GenericModification(part_str) - extras.append(extra_tag) - main_tag.extra = extras - return main_tag - - -class ModificationRule(object): - '''Define a fixed modification rule which dictates a modification tag is - always applied at one or more amino acid residues. - - Attributes - ---------- - modification_tag: TagBase - The modification to apply - targets: list - The list of amino acids this applies to - ''' - __slots__ = ('modification_tag', 'targets') - - def __init__(self, modification_tag, targets=None): - self.modification_tag = modification_tag - self.targets = targets - - def __eq__(self, other): - if other is None: - return False - return self.modification_tag == other.modification_tag and self.targets == other.targets - - def __ne__(self, other): - return not self == other - - def __str__(self): - targets = ','.join(self.targets) - return "<[{self.modification_tag}]@{targets}>".format(self=self, targets=targets) - - def __repr__(self): - return "{self.__class__.__name__}({self.modification_tag!r}, {self.targets})".format(self=self) - - -class StableIsotope(object): - '''Define a fixed isotope that is applied globally to all amino acids. - - Attributes - ---------- - isotope: str - The stable isotope string, of the form [<isotope-number>]<element> or a special - isotopoform's name. - ''' - __slots__ = ('isotope', ) - - def __init__(self, isotope): - self.isotope = isotope - - def __eq__(self, other): - if other is None: - return False - return self.isotope == other.isotope - - def __ne__(self, other): - return not self == other - - def __str__(self): - return "<{self.isotope}>".format(self=self) - - def __repr__(self): - return "{self.__class__.__name__}({self.isotope})".format(self=self) - - -class IntersectionEnum(Enum): - no_overlap = 0 - full_contains_interval = 1 - full_contained_in_interval = 2 - start_overlap = 3 - end_overlap = 4 - - -class TaggedInterval(object): - '''Define a fixed interval over the associated sequence which contains the localization - of the associated tag or denotes a region of general sequence order ambiguity. 
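
A sketch of `process_tag_tokens` above on a tag with a nested INFO sub-tag; the input is the character buffer accumulated between `[` and `]` during parsing.

```python
from pyteomics import proforma

tag = proforma.process_tag_tokens(list('UNIMOD:35|INFO:oxidized in sample'))
type(tag)           # UnimodModification
tag.extra[0].value  # 'oxidized in sample'
```
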
- - Attributes - ---------- - start: int - The starting position (inclusive) of the interval along the primary sequence - end: int - The ending position (exclusive) of the interval along the primary sequence - tags: list[TagBase] - The tags being localized - ambiguous : bool - Whether the interval is ambiguous or not - ''' - __slots__ = ('start', 'end', 'tags', 'ambiguous') - - def __init__(self, start, end=None, tags=None, ambiguous=False): - self.start = start - self.end = end - self.tags = tags - self.ambiguous = ambiguous - - def __eq__(self, other): - if other is None: - return False - return self.start == other.start and self.end == other.end and self.tags == other.tags - - def __ne__(self, other): - return not self == other - - def __str__(self): - return "({self.start}-{self.end}){self.tags!r}".format(self=self) - - def __repr__(self): - return "{self.__class__.__name__}({self.start}, {self.end}, {self.tags})".format(self=self) - - def as_slice(self): - return slice(self.start, self.end) - - def contains(self, i): - return self.start <= i < self.end - - def __contains__(self, i): - return self.contains(i) - - def copy(self): - return self.__class__(self.start, self.end, self.tags) - - def _check_slice(self, qstart, qend, warn_ambiguous): - # Fully contained interval - valid = qstart <= self.start and qend >= self.end - case = IntersectionEnum.full_contained_in_interval if valid else IntersectionEnum.no_overlap - if not valid: - # Spans the beginning but not the end - valid = qstart <= self.start and qend > self.start - if valid: - case = IntersectionEnum.start_overlap - if warn_ambiguous: - warnings.warn("Slice bisecting interval %s" % (self, )) - - if not valid: - # Spans the end but not the beginning - valid = qstart < self.end and qend > self.end - if valid: - case = IntersectionEnum.end_overlap - if warn_ambiguous: - warnings.warn("Slice bisecting interval %s" % (self, )) - - if not valid: - # Contained interval - valid = qstart >= self.start and qend < self.end - if valid: - case = IntersectionEnum.full_contains_interval - if warn_ambiguous: - warnings.warn("Slice bisecting interval %s" % (self, )) - return valid, case - - def _update_coordinates_sliced(self, start=None, end=None, warn_ambiguous=True): - if end is None: - qend = self.end + 1 - else: - qend = end - if start is None: - qstart = self.start - 1 - else: - qstart = start - - valid, intersection_type = self._check_slice(qstart, qend, warn_ambiguous) - if self.ambiguous and intersection_type not in (IntersectionEnum.full_contained_in_interval, IntersectionEnum.no_overlap): - raise ValueError("Cannot bisect an ambiguous interval") - if not valid: - return None - new = self.copy() - if start is not None: - diff = self.start - start - if diff < 0: - diff = 0 - new.start = diff - if end is not None: - width = min(new.end, end) - self.start - else: - width = self.end - max(start, self.start) - new.end = new.start + width - return new - - -class ChargeState(object): - '''Describes the charge and adduct types of the structure. - - Attributes - ---------- - charge : int - The total charge state as a signed number. - adducts : list[str] - Each charge carrier associated with the molecule. 
- ''' - __slots__ = ("charge", "adducts") - - def __init__(self, charge, adducts=None): - if adducts is None: - adducts = [] - self.charge = charge - self.adducts = adducts - - def __str__(self): - tokens = [str(self.charge)] - if self.adducts: - tokens.append("[") - tokens.append(','.join(str(adduct) for adduct in self.adducts)) - tokens.append("]") - return ''.join(tokens) - - def __repr__(self): - template = "{self.__class__.__name__}({self.charge}, {self.adducts})" - return template.format(self=self) - - -class TokenBuffer(object): - '''A token buffer that wraps the accumulation and reset logic - of a list of :class:`str` objects. - - Implements a subset of the Sequence protocol. - - Attributes - ---------- - buffer: list - The list of tokens accumulated since the last parsing. - ''' - def __init__(self, initial=None): - self.buffer = list(initial or []) - self.boundaries = [] - - def append(self, c): - '''Append a new character to the buffer. - - Parameters - ---------- - c: str - The character appended - ''' - self.buffer.append(c) - - def reset(self): - '''Discard the content of the current buffer. - ''' - if self.buffer: - self.buffer = [] - if self.boundaries: - self.boundaries = [] - - def __bool__(self): - return bool(self.buffer) - - def __iter__(self): - return iter(self.buffer) - - def __getitem__(self, i): - return self.buffer[i] - - def __len__(self): - return len(self.buffer) - - def tokenize(self): - i = 0 - pieces = [] - for k in self.boundaries + [len(self)]: - piece = self.buffer[i:k] - i = k - pieces.append(piece) - return pieces - - def _transform(self, value): - return value - - def process(self): - if self.boundaries: - value = [self._transform(v) for v in self.tokenize()] - else: - value = self._transform(self.buffer) - self.reset() - return value - - def bound(self): - k = len(self) - self.boundaries.append(k) - return k - - def __call__(self): - return self.process() - - -class NumberParser(TokenBuffer): - '''A buffer which accumulates tokens until it is asked to parse them into - :class:`int` instances. - ''' - - def _transform(self, value): - return int(''.join(value)) - - -class StringParser(TokenBuffer): - '''A buffer which accumulates tokens until it is asked to parse them into - :class:`str` instances. - ''' - - def _transform(self, value): - return ''.join(value) - - -class TagParser(TokenBuffer): - '''A buffer which accumulates tokens until it is asked to parse them into - :class:`TagBase` instances. - - Implements a subset of the Sequence protocol. - - Attributes - ---------- - buffer: list - The list of tokens accumulated since the last parsing. - group_ids: set - The set of all group IDs that have been produced so far. 
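
Small sketches for `ChargeState` and the token buffers above:

```python
from pyteomics import proforma

str(proforma.ChargeState(2, ['+H+', '+Na+']))  # '2[+H+,+Na+]'

buf = proforma.NumberParser('12')
buf.append('3')
buf()  # 123 -- process() joins the buffered characters and casts to int
```
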
- ''' - - def __init__(self, initial=None, group_ids=None): - super(TagParser, self).__init__(initial) - if group_ids: - self.group_ids = set(group_ids) - else: - self.group_ids = set() - - def _transform(self, value): - tag = process_tag_tokens(value) - if tag.group_id: - self.group_ids.add(tag.group_id) - return tag - - def process(self): - value = super(TagParser, self).process() - if not isinstance(value, list): - value = [value] - return value - - -class ParserStateEnum(Enum): - before_sequence = 0 - tag_before_sequence = 1 - global_tag = 2 - fixed_spec = 3 - labile_tag = 4 - sequence = 5 - tag_in_sequence = 6 - interval_tag = 7 - tag_after_sequence = 8 - stable_isotope = 9 - post_tag_before = 10 - unlocalized_count = 11 - post_global = 12 - post_global_aa = 13 - post_interval_tag = 14 - post_tag_after = 15 - charge_state_start = 16 - charge_state_number = 17 - charge_state_adduct_start = 18 - charge_state_adduct_end = 19 - inter_chain_cross_link_start = 20 - chimeric_start = 21 - interval_initial = 22 - done = 999 - - -BEFORE = ParserStateEnum.before_sequence -TAG_BEFORE = ParserStateEnum.tag_before_sequence -FIXED = ParserStateEnum.fixed_spec -GLOBAL = ParserStateEnum.global_tag -ISOTOPE = ParserStateEnum.stable_isotope -LABILE = ParserStateEnum.labile_tag -SEQ = ParserStateEnum.sequence -TAG = ParserStateEnum.tag_in_sequence -INTERVAL_TAG = ParserStateEnum.interval_tag -INTERVAL_INIT = ParserStateEnum.interval_initial -TAG_AFTER = ParserStateEnum.tag_after_sequence -POST_TAG_BEFORE = ParserStateEnum.post_tag_before -POST_TAG_AFTER = ParserStateEnum.post_tag_after -UNLOCALIZED_COUNT = ParserStateEnum.unlocalized_count -POST_GLOBAL = ParserStateEnum.post_global -POST_GLOBAL_AA = ParserStateEnum.post_global_aa -POST_INTERVAL_TAG = ParserStateEnum.post_interval_tag -CHARGE_START = ParserStateEnum.charge_state_start -CHARGE_NUMBER = ParserStateEnum.charge_state_number -ADDUCT_START = ParserStateEnum.charge_state_adduct_start -ADDUCT_END = ParserStateEnum.charge_state_adduct_end -DONE = ParserStateEnum.done - -VALID_AA = set("QWERTYIPASDFGHKLCVNMXUOJZB") - -def parse(sequence): - '''Tokenize a ProForma sequence into a sequence of amino acid+tag positions, and a - mapping of sequence-spanning modifiers. - - .. note:: - This is a state machine parser, but with certain sub-state paths - unrolled to avoid an explosion of formal intermediary states. - - Parameters - ---------- - sequence: str - The sequence to parse - - Returns - ------- - parsed_sequence: list[tuple[str, list[TagBase]]] - The (amino acid: str, TagBase or None) pairs denoting the positions along the primary sequence - modifiers: dict - A mapping listing the labile modifications, fixed modifications, stable isotopes, unlocalized - modifications, tagged intervals, and group IDs - ''' - labile_modifications = [] - fixed_modifications = [] - unlocalized_modifications = [] - intervals = [] - isotopes = [] - - n_term = None - c_term = None - - i = 0 - n = len(sequence) - - positions = [] - state = BEFORE - depth = 0 - - current_aa = None - current_tag = TagParser() - current_interval = None - current_unlocalized_count = NumberParser() - current_aa_targets = TokenBuffer() - - charge_buffer = None - adduct_buffer = None - - # A mostly context free finite state machine unrolled - # by hand. 
- while i < n: - c = sequence[i] - i += 1 - # Initial state prior to sequence content - if state == BEFORE: - if c == '[': - state = TAG_BEFORE - depth = 1 - elif c == '{': - state = LABILE - depth = 1 - elif c == '<': - state = FIXED - elif c in VALID_AA: - current_aa = c - state = SEQ - else: - raise ProFormaError( - "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) - # The body of the amino acid sequence. - elif state == SEQ or state == INTERVAL_INIT: - if state == INTERVAL_INIT: - state = SEQ - if c == '?': - if current_interval is not None: - current_interval.ambiguous = True - continue - if c in VALID_AA: - if current_aa is not None: - positions.append((current_aa, current_tag() if current_tag else None)) - current_aa = c - elif c == '[': - state = TAG - if current_tag: - current_tag.bound() - depth = 1 - elif c == '(': - if current_interval is not None: - raise ProFormaError( - ("Error In State {state}, nested range found at index {i}. " - "Nested ranges are not yet supported by ProForma.").format( - **locals()), i, state) - current_interval = TaggedInterval(len(positions) + 1) - state = INTERVAL_INIT - elif c == ')': - positions.append( - (current_aa, current_tag() if current_tag else None)) - current_aa = None - if current_interval is None: - raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) - else: - current_interval.end = len(positions) - if i < n and sequence[i] == '[': - i += 1 - depth = 1 - state = INTERVAL_TAG - else: - intervals.append(current_interval) - current_interval = None - elif c == '-': - if current_aa: - positions.append((current_aa, current_tag() if current_tag else None)) - current_aa = None - state = TAG_AFTER - if i >= n or sequence[i] != '[': - raise ProFormaError("Missing Closing Tag", i, state) - i += 1 - depth = 1 - elif c == '/': - state = CHARGE_START - charge_buffer = NumberParser() - elif c == '+': - raise ProFormaError( - "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state) - else: - raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state) - # Tag parsing which rely on `current_tag` to buffer tokens. - elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER or state == GLOBAL or state == INTERVAL_TAG: - if c == '[': - depth += 1 - current_tag.append(c) - elif c == ']': - depth -= 1 - if depth <= 0: - depth = 0 - if state == TAG: - state = SEQ - elif state == TAG_BEFORE: - state = POST_TAG_BEFORE - elif state == TAG_AFTER: - c_term = current_tag() - state = POST_TAG_AFTER - elif state == GLOBAL: - state = POST_GLOBAL - elif state == INTERVAL_TAG: - state = POST_INTERVAL_TAG - depth = 0 - else: - current_tag.append(c) - else: - current_tag.append(c) - # Handle transition to fixed modifications or isotope labeling from opening signal. 
-        elif state == FIXED:
-            if c == '[':
-                state = GLOBAL
-            else:
-                # Do validation here
-                state = ISOTOPE
-                current_tag.reset()
-                current_tag.append(c)
-        # Handle fixed isotope rules, which rely on `current_tag` to buffer tokens
-        elif state == ISOTOPE:
-            if c != '>':
-                current_tag.append(c)
-            else:
-                # Not technically a tag, but exploits the current buffer
-                isotopes.append(StableIsotope(''.join(current_tag)))
-                current_tag.reset()
-                state = BEFORE
-        # Handle labile modifications, which rely on `current_tag` to buffer tokens
-        elif state == LABILE:
-            if c == '{':
-                depth += 1
-            elif c == '}':
-                depth -= 1
-                if depth <= 0:
-                    depth = 0
-                    labile_modifications.append(current_tag()[0])
-                    state = BEFORE
-            else:
-                current_tag.append(c)
-        # The intermediate state between an interval tag and returning to sequence parsing.
-        # A new tag may start immediately, leading to it being appended to the interval
-        # instead of returning to the primary sequence. Because this state may also occur at the
-        # end of a sequence, it must also handle sequence-terminal transitions like C-terminal tags,
-        # charge states, and the like.
-        elif state == POST_INTERVAL_TAG:
-            if c == '[':
-                current_tag.bound()
-                state = INTERVAL_TAG
-            elif c in VALID_AA:
-                current_aa = c
-                current_interval.tags = current_tag()
-                intervals.append(current_interval)
-                current_interval = None
-                state = SEQ
-            elif c == '-':
-                state = TAG_AFTER
-                if i >= n or sequence[i] != '[':
-                    raise ProFormaError("Missing Closing Tag", i, state)
-                i += 1
-                depth = 1
-            elif c == '/':
-                state = CHARGE_START
-                charge_buffer = NumberParser()
-            elif c == '+':
-                raise ProFormaError(
-                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
-            else:
-                raise ProFormaError(
-                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-        # An intermediate state for discriminating which type of tag-before-sequence type
-        # we just finished parsing.
-        elif state == POST_TAG_BEFORE:
-            if c == '?':
-                unlocalized_modifications.append(current_tag()[0])
-                state = BEFORE
-            elif c == '-':
-                n_term = current_tag()
-                state = BEFORE
-            elif c == '^':
-                state = UNLOCALIZED_COUNT
-            else:
-                raise ProFormaError(
-                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-        elif state == UNLOCALIZED_COUNT:
-            if c.isdigit():
-                current_unlocalized_count.append(c)
-            elif c == '[':
-                state = TAG_BEFORE
-                depth = 1
-                tag = current_tag()[0]
-                multiplicity = current_unlocalized_count()
-                for i in range(multiplicity):
-                    unlocalized_modifications.append(tag)
-            elif c == '?':
-                state = BEFORE
-                tag = current_tag()[0]
-                multiplicity = current_unlocalized_count()
-                for i in range(multiplicity):
-                    unlocalized_modifications.append(tag)
-            else:
-                raise ProFormaError(
-                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-        elif state == POST_GLOBAL:
-            if c == '@':
-                state = POST_GLOBAL_AA
-            else:
-                raise ProFormaError(
-                    ("Error In State {state}, fixed modification detected without "
-                     "target amino acids found at index {i}").format(**locals()), i, state)
-        elif state == POST_GLOBAL_AA:
-            if c in VALID_AA:
-                current_aa_targets.append(c)
-            elif c == ',':
-                # the next character should be another amino acid
-                pass
-            elif c == '>':
-                fixed_modifications.append(
-                    ModificationRule(current_tag()[0], current_aa_targets()))
-                state = BEFORE
-            else:
-                raise ProFormaError(
-                    ("Error In State {state}, unclosed fixed modification rule").format(**locals()), i, state)
-        elif state == POST_TAG_AFTER:
-            if c == '/':
-                state = CHARGE_START
-                charge_buffer = NumberParser()
-            elif c == '+':
-                raise ProFormaError(
-                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
-        elif state == CHARGE_START:
-            if c in '+-':
-                charge_buffer.append(c)
-                state = CHARGE_NUMBER
-            elif c.isdigit():
-                charge_buffer.append(c)
-                state = CHARGE_NUMBER
-            elif c == '/':
-                state = ParserStateEnum.inter_chain_cross_link_start
-                raise ProFormaError("Inter-chain cross-linked peptides are not yet supported", i, state)
-            else:
-                raise ProFormaError(
-                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-        elif state == CHARGE_NUMBER:
-            if c.isdigit():
-                charge_buffer.append(c)
-            elif c == "[":
-                state = ADDUCT_START
-                adduct_buffer = StringParser()
-            else:
-                raise ProFormaError(
-                    "Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-        elif state == ADDUCT_START:
-            if c.isdigit() or c in "+-" or c in element_symbols:
-                adduct_buffer.append(c)
-            elif c == ',':
-                adduct_buffer.bound()
-            elif c == ']':
-                state = ADDUCT_END
-        elif state == ADDUCT_END:
-            if c == '+':
-                raise ProFormaError(
-                    "Error In State {state}, {c} found at index {i}. Chimeric representation not supported".format(**locals()), i, state)
-            else:
-                raise ProFormaError("Error In State {state}, unexpected {c} found at index {i}".format(**locals()), i, state)
-    if charge_buffer:
-        charge_number = charge_buffer()
-        if adduct_buffer:
-            adducts = adduct_buffer()
-        else:
-            adducts = None
-        charge_state = ChargeState(charge_number, adducts)
-    else:
-        charge_state = None
-    if current_aa:
-        positions.append((current_aa, current_tag() if current_tag else None))
-    if state in (ISOTOPE, TAG, TAG_AFTER, TAG_BEFORE, LABILE, ):
-        raise ProFormaError("Error In State {state}, unclosed group reached end of string!".format(**locals()), i, state)
-    return positions, {
-        'n_term': n_term,
-        'c_term': c_term,
-        'unlocalized_modifications': unlocalized_modifications,
-        'labile_modifications': labile_modifications,
-        'fixed_modifications': fixed_modifications,
-        'intervals': intervals,
-        'isotopes': isotopes,
-        'group_ids': sorted(current_tag.group_ids),
-        'charge_state': charge_state,
-    }
-
-
-def to_proforma(sequence, n_term=None, c_term=None, unlocalized_modifications=None,
-                labile_modifications=None, fixed_modifications=None, intervals=None,
-                isotopes=None, charge_state=None, group_ids=None):
-    '''Convert a sequence plus modifiers into formatted text following the
-    ProForma specification.
-
-    Parameters
-    ----------
-    sequence : list[tuple[str, TagBase]]
-        The primary sequence of the peptidoform/proteoform to render
-    n_term : Optional[TagBase]
-        The N-terminal modification, if any.
-    c_term : Optional[TagBase]
-        The C-terminal modification, if any.
-    unlocalized_modifications : Optional[list[TagBase]]
-        Any modifications which aren't assigned to a specific location.
-    labile_modifications : Optional[list[TagBase]]
-        Any labile modifications
-    fixed_modifications : Optional[list[ModificationRule]]
-        Any fixed modifications
-    intervals : Optional[list[TaggedInterval]]
-        A list of modified intervals, if any
-    isotopes : Optional[list[StableIsotope]]
-        Any global stable isotope labels applied
-    charge_state : Optional[ChargeState]
-        An optional charge state value
-    group_ids : Optional[list[str]]
-        Any group identifiers. This parameter is currently not used.
-
-    Returns
-    -------
-    str
-    '''
-    primary = deque()
-    for aa, tags in sequence:
-        if not tags:
-            primary.append(str(aa))
-        else:
-            primary.append(str(aa) + ''.join(['[{0!s}]'.format(t) for t in tags]))
-    if intervals:
-        for iv in sorted(intervals, key=lambda x: x.start):
-            if iv.ambiguous:
-                primary[iv.start] = '(?' + primary[iv.start]
-            else:
-                primary[iv.start] = '(' + primary[iv.start]
-
-            terminator = '{0!s})'.format(primary[iv.end - 1])
-            if iv.tags:
-                terminator += ''.join('[{!s}]'.format(t) for t in iv.tags)
-            primary[iv.end - 1] = terminator
-    if n_term:
-        primary.appendleft(''.join("[{!s}]".format(t) for t in n_term) + '-')
-    if c_term:
-        primary.append('-' + ''.join("[{!s}]".format(t) for t in c_term))
-    if charge_state:
-        primary.append("/{!s}".format(charge_state))
-    if labile_modifications:
-        primary.extendleft(['{{{!s}}}'.format(m) for m in labile_modifications])
-    if unlocalized_modifications:
-        primary.appendleft("?")
-        primary.extendleft(['[{!s}]'.format(m) for m in unlocalized_modifications])
-    if isotopes:
-        primary.extendleft(['{!s}'.format(m) for m in isotopes])
-    if fixed_modifications:
-        primary.extendleft(['{!s}'.format(m) for m in fixed_modifications])
-    return ''.join(primary)
-
-
-class _ProFormaProperty(object):
-    def __init__(self, name):
-        self.name = name
-
-    def __get__(self, obj, cls):
-        return obj.properties[self.name]
-
-    def __set__(self, obj, value):
-        obj.properties[self.name] = value
-
-    def __repr__(self):
-        template = "{self.__class__.__name__}({self.name!r})"
-        return template.format(self=self)
-
-
-class ProForma(object):
-    '''Represent a parsed ProForma sequence.
-
-    The preferred way to instantiate this class is via the :meth:`parse`
-    method.
-
-    Attributes
-    ----------
-    sequence : list[tuple[str, List[TagBase]]]
-        The list of (amino acid, tag collection) pairs making up the primary sequence of the
-        peptide.
-    isotopes : list[StableIsotope]
-        A list of any stable isotope rules that apply to this peptide
-    charge_state : int, optional
-        An optional charge state that may have been provided
-    intervals : list[Interval]
-        Any annotated intervals that contain either sequence ambiguity or a
-        tag over that interval.
-    labile_modifications : list[ModificationBase]
-        Any modifications that were parsed as labile, and may not appear at
-        any location on the peptide primary sequence.
-    unlocalized_modifications : list[ModificationBase]
-        Any modifications that were not localized but may be attached to peptide
-        sequence evidence.
-    n_term : list[ModificationBase]
-        Any modifications on the N-terminus of the peptide
-    c_term : list[ModificationBase]
-        Any modifications on the C-terminus of the peptide
-    group_ids : set
-        The collection of all group identifiers on this sequence.
-    mass : float
-        The computed mass for the fully modified peptide, including labile
-        and unlocalized modifications. **Does not include stable isotopes at this time**
-    '''
-
-    def __init__(self, sequence, properties):
-        self.sequence = sequence
-        self.properties = properties
-
-    isotopes = _ProFormaProperty('isotopes')
-    charge_state = _ProFormaProperty('charge_state')
-
-    intervals = _ProFormaProperty('intervals')
-    fixed_modifications = _ProFormaProperty('fixed_modifications')
-    labile_modifications = _ProFormaProperty('labile_modifications')
-    unlocalized_modifications = _ProFormaProperty('unlocalized_modifications')
-
-    n_term = _ProFormaProperty('n_term')
-    c_term = _ProFormaProperty('c_term')
-
-    group_ids = _ProFormaProperty('group_ids')
-
-    def __str__(self):
-        return to_proforma(self.sequence, **self.properties)
-
-    def __repr__(self):
-        return "{self.__class__.__name__}({self.sequence}, {self.properties})".format(self=self)
-
-    def __len__(self):
-        return len(self.sequence)
-
-    def __getitem__(self, i):
-        if isinstance(i, slice):
-            props = self.properties.copy()
-            ivs = []
-            for iv in props['intervals']:
-                iv = iv._update_coordinates_sliced(
-                    i.start, i.stop)
-                if iv is None:
-                    continue
-                ivs.append(iv)
-            props['intervals'] = ivs
-
-            if not (i.start is None or i.start == 0):
-                props['n_term'] = None
-            n = len(self)
-            if not (i.stop is None or i.stop >= n):
-                props['c_term'] = None
-
-            return self.__class__(self.sequence[i], props)
-        else:
-            return self.sequence[i]
-
-    def __eq__(self, other):
-        if isinstance(other, str):
-            return str(self) == other
-        elif other is None:
-            return False
-        else:
-            return self.sequence == other.sequence and self.properties == other.properties
-
-    def __ne__(self, other):
-        return not self == other
-
-    @classmethod
-    def parse(cls, string):
-        '''Parse a ProForma string.
-
-        Parameters
-        ----------
-        string : str
-            The string to parse
-
-        Returns
-        -------
-        ProForma
-        '''
-        return cls(*parse(string))
-
-    @property
-    def mass(self):
-        mass = 0.0
-
-        fixed_modifications = self.properties['fixed_modifications']
-        fixed_rules = {}
-        for rule in fixed_modifications:
-            for aa in rule.targets:
-                fixed_rules[aa] = rule.modification_tag.mass
-
-        for position in self.sequence:
-            aa = position[0]
-            try:
-                mass += std_aa_mass[aa]
-            except KeyError:
-                warnings.warn("%r does not have an exact mass" % (aa, ))
-            if aa in fixed_rules:
-                mass += fixed_rules[aa]
-            tags = position[1]
-            if tags:
-                for tag in tags:
-                    try:
-                        mass += tag.mass
-                    except (AttributeError, KeyError):
-                        continue
-        for mod in self.properties['labile_modifications']:
-            mass += mod.mass
-        for mod in self.properties['unlocalized_modifications']:
-            mass += mod.mass
-        if self.properties.get('n_term'):
-            for mod in self.properties['n_term']:
-                try:
-                    mass += mod.mass
-                except (AttributeError, KeyError):
-                    continue
-        mass += calculate_mass(formula="H")
-        if self.properties.get('c_term'):
-            for mod in self.properties['c_term']:
-                try:
-                    mass += mod.mass
-                except (AttributeError, KeyError):
-                    continue
-
-        mass += calculate_mass(formula="OH")
-        for iv in self.properties['intervals']:
-            try:
-                mass += iv.tag.mass
-            except (AttributeError, KeyError):
-                continue
-        return mass
-
-    def fragments(self, ion_shift, charge=1, reverse=None, include_labile=True, include_unlocalized=True):
-        """
-        The function generates all possible fragments of the requested
-        series type.
-
-        Parameters
-        ----------
-        ion_shift : float or str
-            The mass shift of the ion series, or the name of the ion series
-        charge : int
-            The charge state of the theoretical fragment masses to generate.
-            Defaults to 1+. If 0 is passed, neutral masses will be returned.
-        reverse : bool, optional
-            Whether to fragment from the N-terminus (``False``) or C-terminus (``True``).
-            If ``ion_shift`` is a :class:`str`, the terminal will be inferred from
-            the series name. Otherwise, defaults to ``False``.
-        include_labile : bool, optional
-            Whether or not to include dissociated modification masses.
-            Defaults to ``True``
-        include_unlocalized : bool, optional
-            Whether or not to include unlocalized modification masses.
-            Defaults to ``True``
-
-        Returns
-        -------
-        np.ndarray
-
-        Examples
-        --------
-
-        >>> p = proforma.ProForma.parse("PEPTIDE")
-        >>> p.fragments('b', charge=1)
-        array([ 98.06004032, 227.1026334 , 324.15539725, 425.20307572,
-               538.2871397 , 653.31408272])
-        >>> p.fragments('y', charge=1)
-        array([148.06043424, 263.08737726, 376.17144124, 477.21911971,
-               574.27188356, 703.31447664])
-
-        """
-        if isinstance(ion_shift, str):
-            if ion_shift[0] in 'xyz':
-                reverse = True
-            ion_shift = std_ion_comp[ion_shift].mass(absolute=False)
-
-        n = len(self.sequence)
-        masses = _array('d')
-
-        mass = 0
-        mass += ion_shift
-
-        fixed_modifications = self.properties['fixed_modifications']
-        fixed_rules = {}
-        for rule in fixed_modifications:
-            for aa in rule.targets:
-                fixed_rules[aa] = rule.modification_tag.mass
-
-        intervals = self.intervals
-        if intervals:
-            intervals = sorted(intervals, key=lambda x: x.start)
-            intervals = deque(intervals)
-
-        if not include_labile:
-            for mod in self.properties['labile_modifications']:
-                mass += mod.mass
-
-        if not reverse:
-            if self.properties.get('n_term'):
-                for mod in self.properties['n_term']:
-                    try:
-                        mass += mod.mass
-                    except (AttributeError, KeyError):
-                        continue
-        else:
-            if self.properties.get('c_term'):
-                for mod in self.properties['c_term']:
-                    try:
-                        mass += mod.mass
-                    except (AttributeError, KeyError):
-                        continue
-
-        if include_unlocalized:
-            for mod in self.properties['unlocalized_modifications']:
-                mass += mod.mass
-
-        mass += _WATER_MASS
-
-        if not reverse:
-            iterator = (iter(range(0, n - 1)))
-        else:
-            iterator = (reversed(range(1, n)))
-
-        for i in iterator:
-            position = self.sequence[i]
-
-            aa = position[0]
-            try:
-                mass += std_aa_mass[aa]
-            except KeyError:
-                warnings.warn("%r does not have an exact mass" % (aa, ))
-
-            if aa in fixed_rules:
-                mass += fixed_rules[aa]
-
-            tags = position[1]
-            if tags:
-                for tag in tags:
-                    try:
-                        mass += tag.mass
-                    except (AttributeError, KeyError):
-                        continue
-
-            while intervals and intervals[0].contains(i):
-                iv = intervals.popleft()
-
-                try:
-                    mass += iv.tag.mass
-                except (AttributeError, KeyError):
-                    continue
-
-            masses.append(mass)
-
-        if np is not None:
-            masses = np.asarray(masses)
-            if charge != 0:
-                return mass_charge_ratio(masses, charge)
-            return masses
-        if charge != 0:
-            for i, mass in enumerate(masses):
-                masses[i] = mass_charge_ratio(mass, charge)
-        return masses
-
-    def find_tags_by_id(self, tag_id, include_position=True):
-        '''Find all occurrences of a particular tag ID
-
-        Parameters
-        ----------
-        tag_id : str
-            The tag ID to search for
-        include_position : bool
-            Whether or not to return the locations for matched
-            tag positions
-
-        Returns
-        -------
-        list[tuple[Any, TagBase]] or list[TagBase]
-        '''
-        if not tag_id.startswith("#"):
-            tag_id = "#" + tag_id
-        matches = []
-        for i, (_token, tags) in enumerate(self.sequence):
-            if tags:
-                for tag in tags:
-                    if tag.group_id == tag_id:
-                        if include_position:
-                            matches.append((i, tag))
-                        else:
-                            matches.append(tag)
-        for iv in self.properties['intervals']:
-            if iv.tag.group_id == tag_id:
-                matches.append((iv, iv.tag) if include_position else iv.tag)
-        for ulmod in self.properties['unlocalized_modifications']:
-            if ulmod.group_id == tag_id:
-                matches.append(('unlocalized_modifications', ulmod)
-                               if include_position else ulmod)
-        for lamod in self.properties['labile_modifications']:
-            if lamod.group_id == tag_id:
-                matches.append(('labile_modifications', lamod)
-                               if include_position else lamod)
-        return matches
-
-    @property
-    def tags(self):
-        return [tag for tags_at in [pos[1] for pos in self if pos[1]] for tag in tags_at]
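For orientation, the two entry points removed above compose like this; a minimal usage sketch of the deleted proforma API (the fragment values follow the doctest in `fragments` above; the modified sequence and the tag repr shown in comments are illustrative and version-dependent):

    from pyteomics import proforma

    # Low-level: split a ProForma 2.0 string into (residue, tags) pairs
    # plus a dict of sequence-spanning modifiers.
    sequence, properties = proforma.parse("PEPT[Phospho]IDE")
    print(sequence[3])                   # ('T', [<phospho tag>]); exact tag repr varies
    print(properties['charge_state'])    # None: no trailing /z in the string

    # High-level: the ProForma wrapper adds mass and fragment calculations.
    p = proforma.ProForma.parse("PEPTIDE")
    print(round(p.mass, 2))              # ~799.36, consistent with the b/y doctest
    print(p.fragments('b', charge=1)[:3])  # array([ 98.06..., 227.10..., 324.15...])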
diff --git a/pyteomics/protxml.py b/pyteomics/protxml.py
deleted file mode 100644
index 51dea034a6c61a414e04b3847841e4e2c4a8d1c2..0000000000000000000000000000000000000000
--- a/pyteomics/protxml.py
+++ /dev/null
@@ -1,309 +0,0 @@
-"""
-protxml - parsing of ProteinProphet output files
-================================================
-
-Summary
--------
-
-**protXML** is the output format of the `ProteinProphet software <http://proteinprophet.sourceforge.net/>`_.
-It contains information about identified proteins and their statistical significance.
-
-This module provides minimalistic infrastructure for access to data stored in
-protXML files. The central class is :py:class:`ProtXML`, which
-reads protein entries and related information and saves them into
-Python dicts.
-
-Data access
------------
-
-  :py:class:`ProtXML` - a class representing a single protXML file.
-  Other data access functions use this class internally.
-
-  :py:func:`read` - iterate through protein groups in a protXML
-  file. Calling the function is synonymous to instantiating the :py:class:`ProtXML` class.
-
-  :py:func:`chain` - read multiple files at once.
-
-  :py:func:`chain.from_iterable` - read multiple files at once, using an
-  iterable of files.
-
-  :py:func:`DataFrame` - read protXML files into a :py:class:`pandas.DataFrame`.
-
-Target-decoy approach
----------------------
-
-  :py:func:`filter` - filter protein groups from a chain of protXML files to a specific FDR
-  using TDA.
-
-  :py:func:`filter.chain` - chain a series of filters applied independently to
-  several files.
-
-  :py:func:`filter.chain.from_iterable` - chain a series of filters applied
-  independently to an iterable of files.
-
-  :py:func:`filter_df` - filter protXML files and return a :py:class:`pandas.DataFrame`.
-
-  :py:func:`fdr` - estimate the false discovery rate of a set of protein groups using the
-  target-decoy approach.
-
-  :py:func:`qvalues` - get an array of scores and *q* values for protein groups using the target-decoy approach.
-
-  :py:func:`is_decoy` - determine whether a protein group is decoy or not. This function may not suit your use case.
-
-Dependencies
-------------
-
-This module requires :py:mod:`lxml`.
-
--------------------------------------------------------------------------------
-"""
-
-# Copyright 2018 Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import xml, auxiliary as aux, _schema_defaults
-import operator as op
-
-class ProtXML(xml.MultiProcessingXML):
-    """Parser class for protXML files."""
-    file_format = 'protXML'
-    _root_element = 'protein_summary'
-    _default_schema = _schema_defaults._protxml_schema_defaults
-    # _default_version = None
-    _default_iter_tag = 'protein_group'
-    _indexed_tag_keys = {'protein_group': 'group_number'}
-    _default_id_attr = 'group_number'
-    _indexed_tags = {'protein_group'}
-    _structures_to_flatten = {'annotation'}
-    # attributes which contain unconverted values
-    _convert_items = {'float': {'pct_spectrum_ids'},
-                      'int': {'group_number', 'prot_length'},
-                      'bool': {'is_contributing_evidence', 'is_nondegenerate_evidence'}
-                      }.items()
-
-    def _get_info_smart(self, element, **kwargs):
-        """Extract the info in a smart way depending on the element type"""
-        try:
-            name = kwargs.pop('ename')
-        except KeyError:
-            name = xml._local_name(element)
-        rec = kwargs.pop('recursive', None)
-        if name == self._root_element:
-            info = self._get_info(element, ename=name,
-                                  recursive=(rec if rec is not None else False),
-                                  **kwargs)
-        else:
-            info = self._get_info(element, ename=name,
-                                  recursive=(rec if rec is not None else True),
-                                  **kwargs)
-
-        converters = {'float': float, 'int': int,
-                      'bool': lambda x: x.lower() in {'1', 'true', 'y'}}
-        for k, v in dict(info).items():
-            for t, s in self._convert_items:
-                if k in s:
-                    del info[k]
-                    info[k] = converters[t](v)
-        p = info.get('parameter')
-        if isinstance(p, list) and len(p) == 1 and isinstance(p[0], dict):
-            info.update(info.pop('parameter')[0])
-
-        if 'modification_info' in info:
-            # this is a list with one element
-            info.update(info.pop('modification_info')[0])
-
-        if 'unique_stripped_peptides' in info:
-            info['unique_stripped_peptides'] = info['unique_stripped_peptides'].split('+')
-        return info
-
-
-def read(source, read_schema=False, iterative=True, **kwargs):
-    """Parse `source` and iterate through protein groups.
-
-    Parameters
-    ----------
-    source : str or file
-        A path to a target protXML file or the file object itself.
-
-    read_schema : bool, optional
-        If :py:const:`True`, attempt to extract information from the XML schema
-        mentioned in the protXML header. Otherwise, use default parameters.
-        Not recommended without Internet connection or
-        if you don't like to get the related warnings.
-
-    iterative : bool, optional
-        Defines whether iterative parsing should be used. It helps reduce
-        memory usage at almost the same parsing speed. Default is
-        :py:const:`True`.
-
-    Returns
-    -------
-    out : ProtXML
-        An iterator over dicts with protein group properties.
-    """
-
-    return ProtXML(source, read_schema=read_schema, iterative=iterative)
-
-
-# chain = aux._make_chain(read, 'read')
-chain = aux.ChainBase._make_chain(ProtXML)
-
-
-def _is_decoy_prefix(pg, prefix='DECOY_'):
-    """Determine if a protein group should be considered decoy.
-
-    This function checks that all protein names in a group start with `prefix`.
-    You may need to provide your own function for correct filtering and FDR estimation.
-
-    Parameters
-    ----------
-
-    pg : dict
-        A protein group dict produced by the :py:class:`ProtXML` parser.
-    prefix : str, optional
-        A prefix used to mark decoy proteins. Default is `'DECOY_'`.
-
-    Returns
-    -------
-
-    out : bool
-    """
-    return all(p['protein_name'].startswith(prefix) for p in pg['protein'])
-
-
-def _is_decoy_suffix(pg, suffix='_DECOY'):
-    """Determine if a protein group should be considered decoy.
-
-    This function checks that all protein names in a group end with `suffix`.
-    You may need to provide your own function for correct filtering and FDR estimation.
-
-    Parameters
-    ----------
-
-    pg : dict
-        A protein group dict produced by the :py:class:`ProtXML` parser.
-    suffix : str, optional
-        A suffix used to mark decoy proteins. Default is `'_DECOY'`.
-
-    Returns
-    -------
-
-    out : bool
-    """
-    return all(p['protein_name'].endswith(suffix) for p in pg['protein'])
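Both predicates above assume one uniform naming convention; when a database marks decoys differently, the TDA helpers defined below accept a replacement through their `is_decoy` keyword. A hedged sketch — `my_is_decoy` and the 'REV_' marker are illustrative, not part of the API:

    def my_is_decoy(pg, tag='REV_'):
        # Same contract as _is_decoy_prefix: the group counts as decoy
        # only if every member protein carries the marker.
        return all(tag in p['protein_name'] for p in pg['protein'])

    # e.g. protxml.filter('interact.prot.xml', fdr=0.01, is_decoy=my_is_decoy)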
-
-is_decoy = _is_decoy_prefix
-
-fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix)
-_key = op.itemgetter('probability')
-qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, _key)
-filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, _key, qvalues)
-filter.chain = aux._make_chain(filter, 'filter', True)
-
-
-def DataFrame(*args, **kwargs):
-    """Read protXML output files into a :py:class:`pandas.DataFrame`.
-
-    .. note :: Rows in the DataFrame correspond to individual proteins, not protein groups.
-
-    Requires :py:mod:`pandas`.
-
-    Parameters
-    ----------
-    sep : str or None, keyword only, optional
-        Some values related to protein groups are variable-length lists.
-        If `sep` is a :py:class:`str`, they will be packed into a single string using
-        this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is
-        :py:const:`None`.
-
-    pd_kwargs : dict, optional
-        Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor.
-
-    *args
-        Passed to :py:func:`chain`.
-
-    **kwargs
-        Passed to :py:func:`chain`.
-
-    Returns
-    -------
-    out : pandas.DataFrame
-    """
-    import pandas as pd
-    kwargs = kwargs.copy()
-    sep = kwargs.pop('sep', None)
-    pd_kwargs = kwargs.pop('pd_kwargs', {})
-
-    def gen_items():
-        with chain(*args, **kwargs) as f:
-            for item in f:
-                info = {}
-                for k, v in item.items():
-                    if isinstance(v, (str, int, float)):
-                        info[k] = v
-                if 'protein' in item:
-                    for prot in item['protein']:
-                        out = dict(info)
-                        out.update(prot)
-                        if 'unique_stripped_peptides' in out:
-                            if sep is not None:
-                                out['unique_stripped_peptides'] = sep.join(out['unique_stripped_peptides'])
-                        if 'indistinguishable_protein' in out:
-                            if sep is None:
-                                out['indistinguishable_protein'] = [p['protein_name'] for p in out['indistinguishable_protein']]
-                            else:
-                                out['indistinguishable_protein'] = sep.join(p['protein_name'] for p in out['indistinguishable_protein'])
-                        yield out
-    return pd.DataFrame(gen_items(), **pd_kwargs)
-
-
-def filter_df(*args, **kwargs):
-    """Read protXML files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs.
-    Positional arguments can be protXML files or DataFrames.
-
-    .. note :: Rows in the DataFrame correspond to individual proteins, not protein groups.
-
-    Requires :py:mod:`pandas`.
-
-    Parameters
-    ----------
-    key : str / iterable / callable, keyword only, optional
-        Default is 'probability'.
-    is_decoy : str / iterable / callable, keyword only, optional
-        Default is to check that "protein_name" starts with `'DECOY_'`.
-    reverse : bool, keyword only, optional
-        Should be :py:const:`True` if higher score is better.
-        Default is :py:const:`True` (because the default key is 'probability').
-    *args
-        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
-    **kwargs
-        Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`.
-
-    Returns
-    -------
-    out : pandas.DataFrame
-    """
-    import pandas as pd
-    kwargs.setdefault('key', 'probability')
-    kwargs.setdefault('reverse', True)
-    if all(isinstance(arg, pd.DataFrame) for arg in args):
-        if len(args) > 1:
-            df = pd.concat(args)
-        else:
-            df = args[0]
-    else:
-        read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs}
-        df = DataFrame(*args, **read_kw)
-    if 'is_decoy' not in kwargs:
-        if 'decoy_suffix' in kwargs:
-            kwargs['is_decoy'] = df['protein_name'].str.endswith(kwargs['decoy_suffix'])
-        else:
-            kwargs['is_decoy'] = df['protein_name'].str.startswith(kwargs.get('decoy_prefix', 'DECOY_'))
-    return aux.filter(df, **kwargs)
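Taken together, the module's entry points are used roughly like this ('interact.prot.xml' is a placeholder path; the 'probability' and 'group_number' keys are among those converted by ProtXML above):

    from pyteomics import protxml

    # Stream protein groups one dict at a time.
    with protxml.read('interact.prot.xml') as groups:
        for group in groups:
            print(group['group_number'], group['probability'])

    # Or load everything as a table, filtered to 1% FDR on 'probability'.
    df = protxml.filter_df('interact.prot.xml', fdr=0.01)
    print(df.head())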
diff --git a/pyteomics/pylab_aux.py b/pyteomics/pylab_aux.py
deleted file mode 100644
index c52e17e904ee3827d44b9e6ca2436818b6b122f2..0000000000000000000000000000000000000000
--- a/pyteomics/pylab_aux.py
+++ /dev/null
@@ -1,831 +0,0 @@
-"""
-pylab_aux - auxiliary functions for plotting with pylab
-=======================================================
-
-This module serves as a collection of useful routines for data plotting with
-matplotlib.
-
-Generic plotting
-----------------
-
-  :py:func:`plot_line` - plot a line.
-
-  :py:func:`scatter_trend` - plot a scatter plot with a regression line.
-
-  :py:func:`plot_function_3d` - plot a 3D graph of a function of two variables.
-
-  :py:func:`plot_function_contour` - plot a contour graph of a function of
-  two variables.
-
-Spectrum visualization
-----------------------
-
-  :py:func:`plot_spectrum` - plot a single spectrum (m/z vs intensity).
-
-  :py:func:`annotate_spectrum` - plot and annotate peaks in MS/MS spectrum.
-
-  :py:func:`mirror` - create a mirror plot of two spectra (using :py:mod:`spectrum_utils`).
-
-FDR control
------------
-
-  :py:func:`plot_qvalue_curve` - plot the dependence of q-value on the amount of PSMs
-  (similar to a ROC curve).
-
-See also
---------
-
-  - `Matplotlib cookbook <http://www.scipy.org/Cookbook/Matplotlib/>`_
-  - `Matplotlib tutorial
-    <http://matplotlib.sourceforge.net/mpl_toolkits/mplot3d/tutorial.html>`_
-
-Dependencies
-------------
-
-This module requires :py:mod:`matplotlib`. Optional dependencies: :py:mod:`adjustText`, :py:mod:`spectrum_utils`.
-
--------------------------------------------------------------------------------
-
-"""
-
-# Copyright 2012 Anton Goloborodko, Lev Levitsky
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pylab
-import numpy as np
-from .auxiliary import linear_regression, PyteomicsError
-from .version import VersionInfo
-from . import parser, mass, mgf, proforma
-
-try:
-    import spectrum_utils
-    if VersionInfo(spectrum_utils.__version__) < VersionInfo('0.4'):
-        raise ImportError("Supported spectrum_utils version is 0.4.0 or newer.")
-    import spectrum_utils.spectrum as sus
-    import spectrum_utils.plot as sup
-except ImportError:
-    sus = sup = None
-
-
-def plot_line(a, b, xlim=None, *args, **kwargs):
-    """Plot a line y = a * x + b.
-
-    Parameters
-    ----------
-    a : float
-        The slope of the line.
-    b : float
-        The intercept of the line.
-    xlim : tuple, optional
-        Minimal and maximal values of `x`. If not given, :py:func:`pylab.xlim` will be called.
-    *args
-        Passed to :py:func:`pylab.plot` after `x` and `y` values.
-    **kwargs
-        Passed to :py:func:`pylab.plot`.
-
-    Returns
-    -------
-    out : matplotlib.lines.Line2D
-        The line object.
-    """
-    if xlim is None:
-        xlim = pylab.xlim()
-    return pylab.plot([xlim[0], xlim[1]], [a * xlim[0] + b, a * xlim[1] + b], *args, **kwargs)
-
-
-def scatter_trend(x, y=None, **kwargs):
-    """Make a scatter plot with a linear regression.
-
-    Parameters
-    ----------
-    x : array_like of float
-        1-D array of floats. If `y` is omitted, `x` must be a 2-D array of shape (N, 2).
-    y : array_like of float, optional
-        1-D array of floats. If `y` is omitted or :py:const:`None`, `x` must be a 2-D array of shape (N, 2).
-    plot_trend : bool, optional
-        If :py:const:`True` then plot a trendline (default).
-    plot_sigmas : bool, optional
-        If :py:const:`True` then plot confidence intervals of the linear fit.
-        :py:const:`False` by default.
-    show_legend : bool, optional
-        If :py:const:`True`, a legend will be shown with linear fit equation,
-        correlation coefficient, and standard deviation from the fit. Default is
-        :py:const:`True`.
-    title : str, optional
-        The title. Empty by default.
-    xlabel, ylabel : str, optional
-        The axes labels. Empty by default.
-    alpha_legend : float, optional
-        Legend box transparency. 1.0 by default.
-    scatter_kwargs : dict, optional
-        Keyword arguments for :py:func:`pylab.scatter`.
-        Empty by default.
-    plot_kwargs : dict, optional
-        Keyword arguments for :py:func:`plot_line`.
-        By default, sets `xlim` and `label`.
-    legend_kwargs : dict, optional
-        Keyword arguments for :py:func:`pylab.legend`.
-        Default is :py:const:`{'loc': 'upper left'}`.
-    sigma_kwargs : dict, optional
-        Keyword arguments for :py:func:`pylab.plot` used for sigma lines.
-        Default is :py:const:`{'color': 'red', 'linestyle': 'dashed'}`.
-    sigma_values : iterable, optional
-        Each value will be multiplied with standard error of the fit, and the line
-        shifted by the resulting value will be plotted. Default is :py:const:`range(-3, 4)`.
-    regression : callable, optional
-        Function to perform linear regression. Will be given ``x`` and ``y`` as arguments.
-        Must return a 4-tuple: (a, b, r, stderr).
-        Default is :py:func:`pyteomics.auxiliary.linear_regression`.
-
-    Returns
-    -------
-    out : tuple
-        A (scatter_plot, trend_line, sigma_lines, legend) tuple.
-    """
-    regression = kwargs.get('regression', linear_regression)
-    a, b, r, stderr = regression(x, y)
-    pylab.title(kwargs.get('title', ''))
-    pylab.xlabel(kwargs.get('xlabel', ''))
-    pylab.ylabel(kwargs.get('ylabel', ''))
-
-    equation = (
-        '$y\,=\,{:.3f}x\,{}\,{:.3f}$, '
-        '$R^2=\,{:.3f}$ \n$\sigma\,=\,{:.3f}$'.format(
-            a, '-' if b < 0 else '+', abs(b), r*r, stderr))
-
-    if y is None:
-        x = np.array(x, copy=False)
-        y = x[:, 1]
-        x = x[:, 0]
-    else:
-        x = np.array(x)
-        y = np.array(y)
-    sc = pylab.scatter(x, y, **kwargs.get('scatter_kwargs', {}))
-    xlim = (x.min(), x.max())
-    plkw = kwargs.get('plot_kwargs', {}).copy()
-    plkw.setdefault('xlim', xlim)
-    plkw.setdefault('label', equation)
-    if kwargs.get('plot_trend', True):
-        line = plot_line(a, b, **plkw)
-    else:
-        line = None
-
-    if kwargs.get('plot_sigmas', False):
-        s_lines = []
-        sigma_kwargs = kwargs.get('sigma_kwargs', {'color': 'red', 'linestyle': 'dashed'})
-        for i in kwargs.get('sigma_values', range(-3, 4)):
-            s_lines.append(plot_line(a, b + i * stderr, xlim, **sigma_kwargs))
-    else:
-        s_lines = None
-
-    if kwargs.get('show_legend', True):
-        legend = pylab.legend(**kwargs.get('legend_kwargs', {'loc': 'upper left'}))
-        legend_frame = legend.get_frame()
-        legend_frame.set_alpha(kwargs.get('alpha_legend', 1.0))
-    else:
-        legend = None
-    return sc, line, s_lines, legend
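A quick sketch of `scatter_trend` on synthetic data (all values arbitrary):

    import numpy as np
    import pylab
    from pyteomics import pylab_aux

    rng = np.random.default_rng(0)
    x = np.linspace(0, 10, 100)
    y = 2.0 * x + 1.0 + rng.normal(scale=1.0, size=x.size)

    # Scatter plus fitted trendline; the legend reports the fit equation,
    # R^2 and the standard error, as described above.
    pylab_aux.scatter_trend(x, y, plot_sigmas=True, xlabel='x', ylabel='y')
    pylab.show()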
- """ - regression = kwargs.get('regression', linear_regression) - a, b, r, stderr = regression(x, y) - pylab.title(kwargs.get('title', '')) - pylab.xlabel(kwargs.get('xlabel', '')) - pylab.ylabel(kwargs.get('ylabel', '')) - - equation = ( - '$y\,=\,{:.3f}x\,{}\,{:.3f}$, ' - '$R^2=\,{:.3f}$ \n$\sigma\,=\,{:.3f}$'.format( - a, '-' if b < 0 else '+', abs(b), r*r, stderr)) - - if y is None: - x = np.array(x, copy=False) - y = x[:, 1] - x = x[:, 0] - else: - x = np.array(x) - y = np.array(y) - sc = pylab.scatter(x, y, **kwargs.get('scatter_kwargs', {})) - xlim = (x.min(), x.max()) - plkw = kwargs.get('plot_kwargs', {}).copy() - plkw.setdefault('xlim', xlim) - plkw.setdefault('label', equation) - if kwargs.get('plot_trend', True): - line = plot_line(a, b, **plkw) - else: - line = None - - if kwargs.get('plot_sigmas', False): - s_lines = [] - sigma_kwargs = kwargs.get('sigma_kwargs', {'color': 'red', 'linestyle': 'dashed'}) - for i in kwargs.get('sigma_values', range(-3, 4)): - s_lines.append(plot_line(a, b + i * stderr, xlim, **sigma_kwargs)) - else: - s_lines = None - - if kwargs.get('show_legend', True): - legend = pylab.legend(**kwargs.get('legend_kwargs', {'loc': 'upper left'})) - legend_frame = legend.get_frame() - legend_frame.set_alpha(kwargs.get('alpha_legend', 1.0)) - else: - legend = None - return sc, line, s_lines, legend - - -def plot_function_3d(x, y, function, **kwargs): - """Plot values of a function of two variables in 3D. - - More on 3D plotting in pylab: - - http://www.scipy.org/Cookbook/Matplotlib/mplot3D - - Parameters - ---------- - x : array_like of float - The plotting range on X axis. - y : array_like of float - The plotting range on Y axis. - function : function - The function to plot. - plot_type : {'surface', 'wireframe', 'scatter', 'contour', 'contourf'}, keyword only, optional - The type of a plot, see - `scipy cookbook <http://www.scipy.org/Cookbook/Matplotlib/mplot3D>`_ - for examples. The default value is 'surface'. - num_contours : int - The number of contours to plot, 50 by default. - xlabel : str, keyword only, optional - The X axis label. Empty by default. - ylabel : str, keyword only, optional - The Y axis label. Empty by default. - zlabel : str, keyword only, optional - The Z axis label. Empty by default. - title : str, keyword only, optional - The title. Empty by default. - **kwargs - Passed to the respective plotting function. 
- """ - import mpl_toolkits.mplot3d.axes3d as pylab3d - ax = pylab3d.Axes3D(pylab.gcf()) - ax.set_xlabel(kwargs.pop('xlabel', '')) - ax.set_ylabel(kwargs.pop('ylabel', '')) - ax.set_zlabel(kwargs.pop('zlabel', '')) - ax.set_title(kwargs.pop('title', '')) - X, Y = np.meshgrid(x, y) - Z = [] - for y_value in y: - Z.append([]) - for x_value in x: - Z[-1].append(function(x_value, y_value)) - Z = np.array(Z) - plot_type = kwargs.pop('plot_type', 'surface') - if plot_type == 'surface': - ax.plot_surface(X, Y, Z, - rstride=kwargs.pop('rstride', 1), - cstride=kwargs.pop('cstride', 1), - cmap=kwargs.pop('cmap', pylab.cm.jet), - **kwargs) - elif plot_type == 'wireframe': - ax.plot_wireframe(X, Y, Z, - cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) - elif plot_type == 'scatter': - ax.scatter3D(np.ravel(X), np.ravel(Y), np.ravel(Z), **kwargs) - elif plot_type == 'contour': - num_contours = kwargs.pop('num_contours', 50) - ax.contour3D(X, Y, Z, num_contours, - cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) - elif plot_type == 'contourf': - num_contours = kwargs.pop('num_contours', 50) - ax.contourf3D(X, Y, Z, num_contours, - cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) - else: - raise PyteomicsError('Unknown plot type: {}'.format(plot_type)) - - -def plot_function_contour(x, y, function, **kwargs): - """Make a contour plot of a function of two variables. - - Parameters - ---------- - x, y : array_like of float - The positions of the nodes of a plotting grid. - function : function - The function to plot. - filling : bool - Fill contours if True (default). - num_contours : int - The number of contours to plot, 50 by default. - xlabel, ylabel : str, optional - The axes labels. Empty by default. - title : str, optional - The title. Empty by default. - **kwargs - Passed to :py:func:`pylab.contour` or :py:func:`pylab.contourf`. - """ - pylab.xlabel(kwargs.pop('xlabel', '')) - pylab.ylabel(kwargs.pop('ylabel', '')) - pylab.title(kwargs.pop('title', '')) - X, Y = np.meshgrid(x, y) - Z = [] - for y_value in y: - Z.append([]) - for x_value in x: - Z[-1].append(function(x_value, y_value)) - Z = np.array(Z) - num_contours = kwargs.pop('num_contours', 50) - if kwargs.pop('filling', True): - pylab.contourf(X, Y, Z, num_contours, - cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) - else: - pylab.contour(X, Y, Z, num_contours, - cmap=kwargs.pop('cmap', pylab.cm.jet), **kwargs) - - -def plot_qvalue_curve(qvalues, *args, **kwargs): - """ - Plot a curve with q-values on the X axis and corresponding PSM number - (starting with ``1``) on the Y axis. - - Parameters - ---------- - qvalues : array-like - An array of q-values for sorted PSMs. - xlabel : str, keyword only, optional - Label for the X axis. Default is "q-value". - ylabel : str, keyword only, optional - Label for the Y axis. Default is "# of PSMs". - title : str, keyword only, optional - The title. Empty by default. - *args - Given to :py:func:`pylab.plot` after `x` and `y`. - **kwargs - Given to :py:func:`pylab.plot`. 
-
-
-def _default_plot_spectrum(spectrum, *args, **kwargs):
-    ax = kwargs.pop('ax', None) or pylab.gca()
-    if kwargs.pop('centroided', True):
-        kwargs.setdefault('align', 'center')
-        kwargs.setdefault('width', 0)
-        kwargs.setdefault('linewidth', 1)
-        kwargs.setdefault('edgecolor', 'k')
-        ax.bar(spectrum['m/z array'], spectrum['intensity array'], *args, **kwargs)
-    else:
-        ax.plot(spectrum['m/z array'], spectrum['intensity array'], *args, **kwargs)
-    return ax
-
-
-def _spectrum_utils_plot(spectrum, *args, **kwargs):
-    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
-        spectrum = _spectrum_utils_create_spectrum(spectrum, None, *args, **kwargs)
-        return sup.spectrum(spectrum)
-
-
-def _spectrum_utils_iplot(spectrum, *args, **kwargs):
-    import spectrum_utils.iplot as supi
-    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
-        spectrum = _spectrum_utils_create_spectrum(spectrum, None, *args, **kwargs)
-        return supi.spectrum(spectrum)
-
-
-_plot_backends = {
-    'default': _default_plot_spectrum,
-    'spectrum_utils': _spectrum_utils_plot,
-    'spectrum_utils.iplot': _spectrum_utils_iplot,
-}
-
-
-def plot_spectrum(spectrum, *args, **kwargs):
-    """
-    Plot a spectrum, assuming it is a dictionary containing "m/z array" and "intensity array".
-
-    Parameters
-    ----------
-    spectrum : dict
-        A dictionary, as returned by pyteomics MS data parsers.
-        Must contain "m/z array" and "intensity array" keys with decoded arrays.
-    backend : str, keyword only, optional
-        One of `{'default', 'spectrum_utils', 'spectrum_utils.iplot'}`.
-        The `spectrum_utils` backend requires installing :py:mod:`spectrum_utils`.
-        The `spectrum_utils.iplot` backend requires installing :py:mod:`spectrum_utils[iplot]`.
-    xlabel : str, keyword only, optional
-        Label for the X axis. Default is "m/z".
-    ylabel : str, keyword only, optional
-        Label for the Y axis. Default is "intensity".
-    title : str, keyword only, optional
-        The title. Empty by default.
-
-    centroided : bool, keyword only, optional
-        Works only for the `default` backend.
-        If :py:const:`True` (default), peaks of the spectrum are plotted using :py:func:`pylab.bar`.
-        If :py:const:`False`, the arrays are simply plotted using :py:func:`pylab.plot`.
-    *args
-        When using `default` backend: given to :py:func:`pylab.plot` or :py:func:`pylab.bar` (depending on `centroided`).
-    **kwargs
-        When using `default` backend: given to :py:func:`pylab.plot` or :py:func:`pylab.bar` (depending on `centroided`).
-
-    min_intensity : float, keyword only, optional
-        Remove low-intensity peaks; this is a factor of maximum peak intensity. Default is 0 (no filtering).
-        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    max_num_peaks : int or None, keyword only, optional
-        Remove low-intensity peaks; this is the number of peaks to keep. Default is :py:const:`None` (no filtering).
-        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    scaling : one of `{'root', 'log', 'rank'}` or None, keyword only, optional
-        Scaling to apply to peak intensities. Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    max_intensity : float or None, keyword only, optional
-        Intensity of the most intense peak relative to which the peaks will be scaled
-        (the default is :py:const:`None`, which means that no scaling
-        relative to the most intense peak will be performed).
-        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-
-    Returns
-    -------
-    out : matplotlib.pyplot.Axes
-    """
-    bname = kwargs.pop('backend', 'default')
-    backend = _plot_backends.get(bname)
-    if backend is None:
-        raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format(
-            bname, '; '.join(_plot_backends)))
-
-    pylab.xlabel(kwargs.pop('xlabel', 'm/z'))
-    pylab.ylabel(kwargs.pop('ylabel', 'intensity'))
-    if 'title' in kwargs:
-        pylab.title(kwargs.pop('title'))
-    return backend(spectrum, *args, **kwargs)
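A sketch of the default backend fed by `pyteomics.mgf` ('example.mgf' is a placeholder file name):

    import pylab
    from pyteomics import mgf, pylab_aux

    with mgf.read('example.mgf') as reader:
        spectrum = next(reader)

    # Default backend: one bar per centroided peak.
    pylab_aux.plot_spectrum(spectrum, title=spectrum['params'].get('title', ''))
    pylab.show()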
-
-
-def _default_annotate_spectrum(spectrum, peptide, *args, **kwargs):
-    # common kwargs
-    types = kwargs.pop('ion_types', ('b', 'y'))
-    aa_mass = kwargs.pop('aa_mass', mass.std_aa_mass)
-    mass_data = kwargs.pop('mass_data', mass.nist_mass)
-    ion_comp = kwargs.pop('ion_comp', mass.std_ion_comp)
-    colors = {
-        'a': '#388E3C',
-        'b': '#1976D2',
-        'c': '#00796B',
-        'x': '#7B1FA2',
-        'y': '#D32F2F',
-        'z': '#F57C00',
-    }
-    colors.update(kwargs.pop('colors', {}))
-    ftol = kwargs.pop('ftol', None)
-    if ftol is None:
-        rtol = kwargs.pop('rtol', 1e-5)
-    text_kw = kwargs.pop('text_kw', dict(ha='center', clip_on=True, backgroundcolor='#ffffff99'))
-    precursor_charge = kwargs.pop('precursor_charge', None)
-    if precursor_charge is None:
-        precursor_charge = _get_precursor_charge(spectrum)
-    if precursor_charge is None:
-        raise PyteomicsError('Could not extract precursor charge from spectrum. Please specify `precursor_charge` kwarg.')
-    maxcharge = kwargs.pop('maxcharge', max(1, precursor_charge - 1))
-    ax = kwargs.get('ax', None)
-    # end of common kwargs
-
-    # backend-specific kwargs
-    centroided = kwargs.pop('centroided', True)
-    adjust = kwargs.pop('adjust_text', None)
-    if adjust or adjust is None:
-        try:
-            from adjustText import adjust_text
-            adjust_kw = kwargs.pop('adjust_kw', dict(
-                only_move={'text': 'y', 'points': 'y', 'objects': 'y'}, autoalign=False, force_text=(1, 1)))
-        except ImportError:
-            if adjust:
-                raise PyteomicsError('Install adjustText for text adjustment')
-            adjust = False
-        else:
-            if adjust is None:
-                adjust = True
-    # end of backend-specific kwargs
-
-    parsed = parser.parse(peptide, True, labels=list(aa_mass) + [parser.std_cterm, parser.std_nterm])
-    n = len(parsed)
-    maxpeak = spectrum['intensity array'].max()
-    mz, names = {}, {}
-    for ion in types:
-        for charge in range(1, maxcharge + 1):
-            if ion[0] in 'abc':
-                for i in range(2, n):
-                    mz.setdefault(ion, []).append(mass.fast_mass2(parsed[:i] + [parser.std_cterm],
-                        aa_mass=aa_mass, charge=charge, ion_type=ion, mass_data=mass_data, ion_comp=ion_comp))
-                    names.setdefault(ion, []).append(ion[0] + str(i - 1) + ion[1:])
-            else:
-                for i in range(1, n - 1):
-                    mz.setdefault(ion, []).append(mass.fast_mass2([parser.std_nterm] + parsed[n - (i + 1):],
-                        aa_mass=aa_mass, charge=charge, ion_type=ion, mass_data=mass_data, ion_comp=ion_comp))
-                    names.setdefault(ion, []).append(ion[0] + str(i) + ion[1:])
-    texts = []
-    for ion in types:
-        c = colors.get(ion, colors.get(ion[0], 'blue'))
-        matrix = np.abs(spectrum['m/z array'] - np.array(mz[ion]).reshape(-1, 1))
-        if ftol is not None:
-            match = np.where(matrix < ftol)
-        else:
-            match = np.where(matrix / spectrum['m/z array'] < rtol)
-        pseudo_spec = {'m/z array': spectrum['m/z array'][match[1]],
-                       'intensity array': spectrum['intensity array'][match[1]]}
-        plot_spectrum(pseudo_spec, centroided=True, edgecolor=c, ax=ax)
-        for j, i in zip(*match):
-            x = spectrum['m/z array'][i]
-            y = spectrum['intensity array'][i] + maxpeak * 0.02
-            name = names[ion][j]
-            texts.append(pylab.text(x, y, name, color=c, **text_kw))
-    if adjust:
-        adjust_text(texts, **adjust_kw)
-    kwargs.setdefault('zorder', -1)
-    return plot_spectrum(spectrum, *args, centroided=centroided, **kwargs)
-
-
-def _get_precursor_charge(spectrum):
-    try:
-        return mgf.MGFBase.parse_precursor_charge(spectrum['params']['charge'], list_only=True)[0]
-    except (PyteomicsError, KeyError):
-        pass
-    try:
-        return int(spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['charge state'])
-    except KeyError:
-        pass
-    return None
-
-
-def _get_precursor_mz(spectrum):
-    try:
-        return spectrum['params']['pepmass'][0]
-    except KeyError:
-        pass
-    try:
-        return spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0]['selected ion m/z']
-    except KeyError:
-        pass
-    if 'attributes' in spectrum:
-        for attr in spectrum['attributes']:
-            if attr in {"MS:1000827", "MS:1000744", "MS:1002234"}:
-                return spectrum['attributes'][attr]
-    return None
-
-
-def _spectrum_utils_create_spectrum(spectrum, *args, **kwargs):
-    if sus is None:
-        raise PyteomicsError('This backend requires `spectrum_utils>=0.4`.')
-
-    # backend-specific parameters
-    mz_range = kwargs.pop('mz_range', None)
-
-    min_intensity = kwargs.pop('min_intensity', 0.0)
-    max_num_peaks = kwargs.pop('max_num_peaks', None)
-    scaling = kwargs.pop('scaling', None)
-    max_intensity = kwargs.pop('max_intensity', None)
-    spectrum = sus.MsmsSpectrum(
-        'None', kwargs.pop('precursor_mz', None), kwargs.pop('precursor_charge', None),
-        spectrum['m/z array'], spectrum['intensity array'])
-    if mz_range:
-        spectrum = spectrum.set_mz_range(*mz_range)
-
-    spectrum = spectrum.filter_intensity(min_intensity=min_intensity, max_num_peaks=max_num_peaks
-        ).scale_intensity(scaling, max_intensity)
-    return spectrum
-
-
-def _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs):
-    # common kwargs
-    aa_mass = kwargs.pop('aa_mass', mass.std_aa_mass)
-    types = kwargs.pop('ion_types', ('b', 'y'))
-    tol = kwargs.pop('ftol', None)
-    if tol is None:
-        tol = kwargs.pop('rtol', 1e-5) * 1e6
-        tol_mode = 'ppm'
-    else:
-        tol_mode = 'Da'
-
-    # kwargs.pop('text_kw', None)  # not used
-
-    precursor_charge = kwargs.pop('precursor_charge', None)
-    if precursor_charge is None:
-        precursor_charge = _get_precursor_charge(spectrum)
-    if precursor_charge is None:
-        raise PyteomicsError('Could not extract precursor charge from spectrum. '
-                             'Please specify `precursor_charge` keyword argument.')
-
-    maxcharge = kwargs.pop('maxcharge', max(1, precursor_charge - 1))
-    # end of common kwargs
-
-    # backend-specific parameters
-    remove_precursor_peak = kwargs.pop('remove_precursor_peak', False)
-
-    # peptide can be modX or proforma. spectrum_utils supports proforma only
-    aa_comp = kwargs.get('aa_comp')
-    mod_names = kwargs.get('mod_names')
-    prefix = kwargs.get('prefix')
-
-    try:
-        parsed_proforma = proforma.ProForma.parse(peptide)
-        peptide_pro = peptide
-    except Exception:
-        parsed_proforma = None
-        try:
-            peptide_pro = parser.to_proforma(peptide, aa_mass=aa_mass, aa_comp=aa_comp, mod_names=mod_names, prefix=prefix)
-        except Exception:
-            raise PyteomicsError("Cannot parse {} as ProForma or convert from modX".format(peptide))
-
-    precursor_mz = kwargs.pop('precursor_mz', None)
-    if precursor_mz is None:
-        precursor_mz = _get_precursor_mz(spectrum)
-    if precursor_mz is None:
-        try:
-            if aa_comp:
-                precursor_mz = mass.calculate_mass(peptide, aa_comp=aa_comp, charge=precursor_charge)
-            elif not parsed_proforma:
-                precursor_mz = mass.fast_mass2(peptide, aa_mass=aa_mass, charge=precursor_charge)
-            else:
-                precursor_mz = mass.mass_charge_ratio(parsed_proforma.mass, precursor_charge)
-        except PyteomicsError:
-            raise PyteomicsError('Cannot obtain precursor m/z, please specify `precursor_mz` argument.')
-
-    spectrum = _spectrum_utils_create_spectrum(spectrum, *args,
-        precursor_mz=precursor_mz, precursor_charge=precursor_charge, **kwargs)
-    if remove_precursor_peak:
-        spectrum = spectrum.remove_precursor_peak(tol, tol_mode)
-    spectrum = spectrum.annotate_proforma(peptide_pro, tol, tol_mode, types, maxcharge)
-
-    return spectrum
-
-
-class SpectrumUtilsColorScheme:
-    """Context manager that temporarily changes `spectrum_utils.plot.colors`."""
-    def __init__(self, colors):
-        self.colors = colors
-        self.previous_colors = sup.colors.copy()
-
-    def __enter__(self):
-        if self.colors:
-            sup.colors.update(self.colors)
-
-    def __exit__(self, *args, **kwargs):
-        sup.colors = self.previous_colors
-
-
-def _spectrum_utils_annotate_plot(spectrum, peptide, *args, **kwargs):
-    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
-        spectrum = _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs)
-        return sup.spectrum(spectrum, annot_kws=kwargs.pop('text_kw', None), ax=kwargs.pop('ax', None))
-
-
-def _spectrum_utils_annotate_iplot(spectrum, peptide, *args, **kwargs):
-    import spectrum_utils.iplot as supi
-    with SpectrumUtilsColorScheme(kwargs.pop('colors', None)):
-        spectrum = _spectrum_utils_annotate_spectrum(spectrum, peptide, *args, **kwargs)
-        return supi.spectrum(spectrum)
-
-
-_annotation_backends = {
-    'default': _default_annotate_spectrum,
-    'spectrum_utils': _spectrum_utils_annotate_plot,
-    'spectrum_utils.iplot': _spectrum_utils_annotate_iplot,
-}
-
-
-def annotate_spectrum(spectrum, peptide, *args, **kwargs):
-    """Plot a spectrum and annotate matching fragment peaks.
-
-    Parameters
-    ----------
-    spectrum : dict
-        A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys.
-    peptide : str
-        A modX or ProForma sequence.
-    backend : str, keyword only, optional
-        One of `{'default', 'spectrum_utils', 'spectrum_utils.iplot'}`.
-        The `spectrum_utils` backend requires installing :py:mod:`spectrum_utils`.
-        The `spectrum_utils.iplot` backend requires installing :py:mod:`spectrum_utils[iplot]`.
-    ion_types : Container, keyword only, optional
-        Ion types to be considered for annotation. Default is `('b', 'y')`.
-    precursor_charge : int, keyword only, optional
-        If not specified, an attempt is made to extract it from `spectrum`.
-    maxcharge : int, keyword only, optional
-        Maximum charge state for fragment ions to be considered. Default is `max(1, precursor_charge - 1)`.
-    colors : dict, keyword only, optional
-        Keys are ion types, values are colors to plot the annotated peaks with. Default depends on backend.
-    ftol : float, keyword only, optional
-        A fixed m/z tolerance value for peak matching. Alternative to `rtol`.
-    rtol : float, keyword only, optional
-        A relative m/z error for peak matching. Default is 10 ppm.
-    aa_mass : dict, keyword only, optional
-        A dictionary of amino acid residue masses.
-    text_kw : dict, keyword only, optional
-        Keyword arguments for :py:func:`pylab.text`.
-    xlabel : str, keyword only, optional
-        Label for the X axis. Default is "m/z". Does not work with `spectrum_utils.iplot` backend.
-    ylabel : str, keyword only, optional
-        Label for the Y axis. Default is "intensity". Does not work with `spectrum_utils.iplot` backend.
-    title : str, keyword only, optional
-        The title. Empty by default. Does not work with `spectrum_utils.iplot` backend.
-    ax : matplotlib.pyplot.Axes, keyword only, optional
-        Axes to draw the spectrum. Does not work with `spectrum_utils.iplot` backend.
-
-    *args
-        Passed to the plotting backend.
-    **kwargs
-        Passed to the plotting backend.
-
-    centroided : bool, keyword only, optional
-        Passed to :py:func:`plot_spectrum`. Only works with `default` backend.
-    ion_comp : dict, keyword only, optional
-        A dictionary defining ion compositions to override :py:const:`pyteomics.mass.std_ion_comp`.
-        Only works with `default` backend.
-    mass_data : dict, keyword only, optional
-        A dictionary of element masses to override :py:const:`pyteomics.mass.nist_mass`.
-        Only works with `default` backend.
-
-    adjust_text : bool, keyword only, optional
-        Adjust the overlapping text annotations using :py:mod:`adjustText`. Only works with `default` backend.
-    adjust_kw : dict, keyword only, optional
-        Keyword arguments for :py:func:`adjust_text`. Only works with `default` backend.
-
-    remove_precursor_peak : bool, keyword only, optional
-        Remove precursor peak from spectrum before annotation. Default is :py:const:`False`.
-        Only works with `spectrum_utils` backend.
-    min_intensity : float, keyword only, optional
-        Remove low-intensity peaks; this is a factor of maximum peak intensity. Default is 0 (no filtering).
-        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    max_num_peaks : int or None, keyword only, optional
-        Remove low-intensity peaks; this is the number of peaks to keep. Default is :py:const:`None` (no filtering).
-        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    scaling : one of `{'root', 'log', 'rank'}` or None, keyword only, optional
-        Scaling to apply to peak intensities. Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    max_intensity : float or None, keyword only, optional
-        Intensity of the most intense peak relative to which the peaks will be scaled
-        (the default is :py:const:`None`, which means that no scaling
-        relative to the most intense peak will be performed).
-        Only works with `spectrum_utils` and `spectrum_utils.iplot` backends.
-    aa_comp : dict, keyword only, optional
-        Amino acid compositions, including modified ones. If given, will be used for conversion from *modX* to ProForma.
-    mod_names : dict or callable, keyword only, optional
-        If given, will be used for conversion from *modX* to ProForma.
-    prefix : str, keyword only, optional
-        If given, will be used for conversion from *modX* to ProForma.
-
-    Returns
-    -------
-    out : matplotlib.pyplot.Axes
-    """
-    bname = kwargs.pop('backend', 'default')
-    backend = _annotation_backends.get(bname)
-    if backend is None:
-        raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format(
-            bname, '; '.join(_annotation_backends)))
-
-    pylab.xlabel(kwargs.pop('xlabel', 'm/z'))
-    pylab.ylabel(kwargs.pop('ylabel', 'intensity'))
-    pylab.title(kwargs.pop('title', ''))
-    return backend(spectrum, peptide, *args, **kwargs)
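Annotation ties the pieces above together; a sketch assuming the MGF entry carries its precursor charge, with 'PEPTIDE' standing in for the identified sequence:

    import pylab
    from pyteomics import mgf, pylab_aux

    with mgf.read('example.mgf') as reader:
        spectrum = next(reader)

    # Match b/y ions within the default 10 ppm tolerance and label the peaks.
    pylab_aux.annotate_spectrum(spectrum, 'PEPTIDE', ion_types=('b', 'y'),
                                title='annotated PSM')
    pylab.show()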
- - Returns - ------- - out : matplotlib.pyplot.Axes - """ - bname = kwargs.pop('backend', 'default') - backend = _annotation_backends.get(bname) - if backend is None: - raise PyteomicsError('Unknown backend name: {}. Should be one of: {}.'.format( - bname, '; '.join(_annotation_backends))) - - pylab.xlabel(kwargs.pop('xlabel', 'm/z')) - pylab.ylabel(kwargs.pop('ylabel', 'intensity')) - pylab.title(kwargs.pop('title', '')) - return backend(spectrum, peptide, *args, **kwargs) - - -def _spectrum_utils_mirror(spec_top, spec_bottom, spectrum_kws=None, ax=None, **kwargs): - with SpectrumUtilsColorScheme(kwargs.pop('colors', None)): - ax = sup.mirror(spec_top, spec_bottom, spectrum_kws=spectrum_kws, ax=ax) - ax.set_xlabel(kwargs.pop('xlabel', 'm/z')) - ax.set_ylabel(kwargs.pop('ylabel', 'intensity')) - ax.set_title(kwargs.pop('title', '')) - return ax - - -def _spectrum_utils_iplot_mirror(spec_top, spec_bottom, spectrum_kws=None, **kwargs): - import spectrum_utils.iplot as supi - with SpectrumUtilsColorScheme(kwargs.pop('colors', None)): - return supi.mirror(spec_top, spec_bottom, spectrum_kws=spectrum_kws) - - -_mirror_backends = { - 'spectrum_utils': _spectrum_utils_mirror, - 'spectrum_utils.iplot': _spectrum_utils_iplot_mirror, -} - - -def mirror(spec_top, spec_bottom, peptide=None, spectrum_kws=None, ax=None, **kwargs): - """Create a mirror plot of two (possible annotated) spectra using `spectrum_utils`. - - Parameters - ---------- - spec_top : dict - A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys. - spec_bottom : dict - A spectrum as returned by Pyteomics parsers. Needs to have 'm/z array' and 'intensity array' keys. - peptide : str or None, optional - A modX sequence or ProForma. If provided, the peaks will be annotated as peptide fragments. - spectrum_kws : dict or None, optional - Passed to :py:func:`spectrum_utils.plot.mirror`. - backend : str, keyword only, optional - One of {'spectrum_utils', 'spectrum_utils.iplot'}. Default is 'spectrum_utils'. - - .. note :: - Requires :py:mod:`spectrum_utils` or :py:mod:`spectrun_utils[iplot]`, respectively. - - ax : matplotlib.pyplot.Axes or None, optional - Passed to :py:func:`spectrum_utils.plot.mirror`. Works only for the 'spectrum_utils' backend. - xlabel : str, keyword only, optional - Label for the X axis. Default is "m/z". Works only for the 'spectrum_utils' backend. - ylabel : str, keyword only, optional - Label for the Y axis. Default is "intensity". Works only for the 'spectrum_utils' backend. - title : str, keyword only, optional - The title. Empty by default. Works only for the 'spectrum_utils' backend. - - **kwargs : same as for :py:func:`annotate_spectrum` for `spectrum_utils` backends. - - Returns - ------- - out : matplotlib.pyplot.Axes - """ - - spec_gen = _spectrum_utils_create_spectrum if peptide is None else _spectrum_utils_annotate_spectrum - spec_top = spec_gen(spec_top, peptide, **kwargs) - spec_bottom = spec_gen(spec_bottom, peptide, **kwargs) - - bname = kwargs.pop('backend', 'spectrum_utils') - backend = _mirror_backends.get(bname) - if backend is None: - raise PyteomicsError('Unknown backend name: {}. 
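And a mirror-plot sketch (requires `spectrum_utils`; the file name is again a placeholder, and depending on the spectrum_utils version, precursor information may be required):

    import pylab
    from pyteomics import mgf, pylab_aux

    with mgf.read('example.mgf') as reader:
        spec_top = next(reader)
        spec_bottom = next(reader)

    # peptide=None mirrors the raw peak lists; pass a sequence to annotate both.
    pylab_aux.mirror(spec_top, spec_bottom, peptide=None)
    pylab.show()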
Should be one of: {}.'.format( - bname, '; '.join(_mirror_backends))) - backend_kw = {'spectrum_kws': spectrum_kws} - if bname == 'spectrum_utils': - backend_kw['ax'] = ax - backend_kw.update(kwargs) - return backend(spec_top, spec_bottom, **backend_kw) diff --git a/pyteomics/tandem.py b/pyteomics/tandem.py deleted file mode 100644 index ba08b43f84799fe4ef04735966e26246c0cd34d7..0000000000000000000000000000000000000000 --- a/pyteomics/tandem.py +++ /dev/null @@ -1,384 +0,0 @@ -""" -tandem - X!Tandem output file reader -==================================== - -Summary -------- - -`X!Tandem <http://thegpm.org/tandem/>`_ is an open-source proteomic search -engine with a very simple, sophisticated application programming interface -(API): it simply takes an XML file of instructions on its command line, -and outputs the results into an XML file, which has been specified in the input -XML file. The output format is described -`here (PDF) <http://www.thegpm.org/docs/X_series_output_form.pdf>`_. - -This module provides a minimalistic way to extract information from X!Tandem -output files. You can use the old functional interface (:py:func:`read`) or the -new object-oriented interface (:py:class:`TandemXML`) to iterate over entries in -`<group>` elements, i.e. identifications for a certain spectrum. - -Data access ------------ - - :py:class:`TandemXML` - a class representing a single X!Tandem output file. - Other data access functions use this class internally. - - :py:func:`read` - iterate through peptide-spectrum matches in an X!Tandem - output file. Data from a single PSM are converted to a human-readable dict. - - :py:func:`chain` - read multiple files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - - :py:func:`DataFrame` - read X!Tandem output files into a :py:class:`pandas.DataFrame`. - -Target-decoy approach ---------------------- - - :py:func:`filter` - iterate through peptide-spectrum matches in a chain of - X!Tandem output files, yielding only top PSMs and keeping false discovery rate - (FDR) at the desired level. The FDR is estimated using the target-decoy - approach (TDA). - - :py:func:`filter.chain` - chain a series of filters applied independently to - several files. - - :py:func:`filter.chain.from_iterable` - chain a series of filters applied - independently to an iterable of files. - - :py:func:`filter_df` - filter X!Tandem output files and return a :py:class:`pandas.DataFrame`. - - - :py:func:`is_decoy` - determine if a PSM is from the decoy database. - - :py:func:`fdr` - estimate the FDR in a data set using TDA. - - :py:func:`qvalues` - get an array of scores and local FDR values for a PSM - set using the target-decoy approach. - -Deprecated functions --------------------- - - :py:func:`iterfind` - iterate over elements in an X!Tandem file. - You can just call the corresponding method of the :py:class:`TandemXML` - object. - -Dependencies ------------- - -This module requires :py:mod:`lxml` and :py:mod:`numpy`. - -------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
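For orientation, a minimal usage sketch of the two plotting entry points removed above; the input file name and peptide are hypothetical placeholders, and the mirror call assumes spectrum_utils is installed:

    # Hypothetical usage of the removed pylab_aux plotting helpers.
    import matplotlib.pyplot as plt
    from pyteomics import mgf, pylab_aux

    spec1, spec2 = list(mgf.read('example.mgf'))[:2]   # placeholder input file
    pylab_aux.annotate_spectrum(spec1, 'PEPTIDE', precursor_charge=2, title='PSM')
    pylab_aux.mirror(spec1, spec2, peptide='PEPTIDE', precursor_charge=2,
                     backend='spectrum_utils')
    plt.show()
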
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import operator -from . import xml, auxiliary as aux, _schema_defaults - - -class TandemXML(xml.XML): - """Parser class for TandemXML files.""" - file_format = "TandemXML" - _root_element = "bioml" - _default_schema = _schema_defaults._tandem_schema_defaults - _default_iter_path = 'group[@type="model"]' - _structures_to_flatten = {'domain'} - - def __init__(self, *args, **kwargs): - if 'recursive' not in kwargs: - super(TandemXML, self).__init__(*args, recursive=True, **kwargs) - else: - super(TandemXML, self).__init__(*args, **kwargs) - - __init__.__doc__ = xml.XML.__init__.__doc__ - - def _get_info_smart(self, element, **kw): - info = self._get_info(element, **kw) - # handy simplifications below - if isinstance(info.get('note'), list) and len(info['note']) == 1 and set(info['note'][0]) == {'label', 'note'}: - info['note'] = info['note'][0]['note'] - if 'protein' in info and 'label' in info: - del info['label'] - if 'group' in info: - for g in info['group']: - label = g.pop('label') - type_ = g.pop('type') - info.setdefault(type_, {})[label] = g - del info['group'] - if 'trace' in info: - for t in info['trace']: - info[t.pop('type')] = t - del info['trace'] - if isinstance(info.get('values'), dict): - info['values'] = info['values']['values'] - if isinstance(info.get('attribute'), list): - for a in info.pop('attribute'): - info[a['type']] = float(a['attribute']) - if 'support' in info: - for d in info['support'].get('supporting data', {}).values(): - for label in ['Xdata', 'Ydata']: - d[label]['values'] = d[label]['values'].astype(int) - del d[label]['label'] - if 'fragment ion mass spectrum' in info['support']: - fims = info['support']['fragment ion mass spectrum'] - fims.update(fims.pop('tandem mass spectrum')) - for label in ['Xdata', 'Ydata']: - del info['support']['fragment ion mass spectrum'][label]['label'] - if 'charge' in info: - info['charge'] = int(info['charge']) - if info.get('rt') == '': - info['rt'] = None - - return info - - def _get_schema_info(self, read_schema): - return self._default_schema - - def __next__(self): - n = super(TandemXML, self).__next__() - del n['type'] - return n - - next = __next__ - - -def read(source, iterative=True, **kwargs): - """Parse `source` and iterate through peptide-spectrum matches. - - Parameters - ---------- - source : str or file - A path to a target X!Tandem output file or the file object itself. - - iterative : bool, optional - Defines whether iterative parsing should be used. It helps reduce - memory usage at almost the same parsing speed. Default is - :py:const:`True`. - - Returns - ------- - out : iterator - An iterator over dicts with PSM properties. - """ - return TandemXML(source, read_schema=False, recursive=True, iterative=iterative) - - -def iterfind(source, path, **kwargs): - """Parse `source` and yield info on elements with specified local - name or by specified "XPath". - - .. note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`iterfind` calls on one file, you should - create a :py:class:`TandemXML` object and use its - :py:meth:`!iterfind` method. 
- - Parameters - ---------- - source : str or file - File name or file-like object. - - path : str - Element name or XPath-like expression. Only local names separated - with slashes are accepted. An asterisk (`*`) means any element. - You can specify a single condition in the end, such as: - ``"/path/to/element[some_value>1.5]"`` - Note: you can do much more powerful filtering using plain Python. - The path can be absolute or "free". Please don't specify - namespaces. - - recursive : bool, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - iterative : bool, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. - - Returns - ------- - out : iterator - """ - return TandemXML(source, **kwargs).iterfind(path, **kwargs) - - -# chain = aux._make_chain(read, 'read') -chain = aux.ChainBase._make_chain(TandemXML) - - -def _is_decoy_prefix(psm, prefix='DECOY_'): - """Given a PSM dict, return :py:const:`True` if all protein names for - the PSM start with `prefix`, and :py:const:`False` otherwise. - - Parameters - ---------- - psm : dict - A dict, as yielded by :py:func:`read`. - prefix : str, optional - A prefix used to mark decoy proteins. Default is `'DECOY_'`. - - Returns - ------- - out : bool - """ - return all(prot['label'].startswith(prefix) for prot in psm['protein']) - - -def _is_decoy_suffix(psm, suffix='_DECOY'): - """Given a PSM dict, return :py:const:`True` if all protein names for - the PSM end with `suffix`, and :py:const:`False` otherwise. - - Parameters - ---------- - psm : dict - A dict, as yielded by :py:func:`read`. - suffix : str, optional - A suffix used to mark decoy proteins. Default is `'_DECOY'`. - - Returns - ------- - out : bool - """ - return all(prot['label'].endswith(suffix) for prot in psm['protein']) - - -is_decoy = _is_decoy_prefix -qvalues = aux._make_qvalues(chain, _is_decoy_prefix, _is_decoy_suffix, operator.itemgetter('expect')) -filter = aux._make_filter(chain, _is_decoy_prefix, _is_decoy_suffix, operator.itemgetter('expect'), qvalues) -fdr = aux._make_fdr(_is_decoy_prefix, _is_decoy_suffix) -filter.chain = aux._make_chain(filter, 'filter', True) - - -def DataFrame(*args, **kwargs): - """Read X!Tandem output files into a :py:class:`pandas.DataFrame`. - - Requires :py:mod:`pandas`. - - Parameters - ---------- - - sep : str or None, optional - Some values related to PSMs (such as protein information) are variable-length - lists. If `sep` is a :py:class:`str`, they will be packed into single string using - this delimiter. If `sep` is :py:const:`None`, they are kept as lists. Default is - :py:const:`None`. - - pd_kwargs : dict, optional - Keyword arguments passed to the :py:class:`pandas.DataFrame` constructor. - - *args - Passed to :py:func:`chain`. - - **kwargs - Passed to :py:func:`chain`. 
- - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - data = [] - prot_keys = ['id', 'uid', 'label', 'expect'] - pep_keys = ['id', 'pre', 'post', 'start', 'end'] - sep = kwargs.pop('sep', None) - pd_kwargs = kwargs.pop('pd_kwargs', {}) - with chain(*args, **kwargs) as f: - for item in f: - info = {} - for k, v in item.items(): - if isinstance(v, (str, int, float)): - info[k] = v - protein = item['protein'][0] - - for key in prot_keys: - vals = [prot.get(key) for prot in item['protein']] - if sep is not None: - vals = sep.join(str(val) if val is not None else '' for val in vals) - info['protein_' + key] = vals - for key in pep_keys: - vals = [prot['peptide'].get(key) for prot in item['protein']] - if sep is not None: - vals = sep.join(str(val) if val is not None else '' for val in vals) - info['peptide_' + key] = vals - aa = protein['peptide'].pop('aa', []) - info['modifications'] = ','.join('{0[modified]:.3f}@{0[type]}'.format(x) for x in aa) - for k in prot_keys: - protein.pop(k, None) - for k in pep_keys: - protein['peptide'].pop(k, None) - info.update(protein['peptide']) - fims = item['support']['fragment ion mass spectrum'] - try: - info['scan'] = fims['note'] - except KeyError: - info['scan'] = fims['id'] - data.append(info) - return pd.DataFrame(data, **pd_kwargs) - - -def filter_df(*args, **kwargs): - """Read X!Tandem output files or DataFrames and return a :py:class:`DataFrame` with filtered PSMs. - Positional arguments can be X!Tandem output files or DataFrames. - - Requires :py:mod:`pandas`. - - Parameters - ---------- - key : str / iterable / callable, optional - Default is 'expect'. - is_decoy : str / iterable / callable, optional - Default is to check if all strings in the "protein" column start with `'DECOY_'` - *args - Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. - **kwargs - Passed to :py:func:`auxiliary.filter` and/or :py:func:`DataFrame`. - - Returns - ------- - out : pandas.DataFrame - """ - import pandas as pd - sep = kwargs.get('sep') - kwargs.setdefault('key', 'expect') - if all(isinstance(arg, pd.DataFrame) for arg in args): - if len(args) > 1: - df = pd.concat(args) - else: - df = args[0] - else: - read_kw = {k: kwargs.pop(k) for k in ['iterative', 'read_schema', 'sep', 'pd_kwargs'] if k in kwargs} - df = DataFrame(*args, **read_kw) - - if 'is_decoy' not in kwargs: - if sep is not None: - if 'decoy_suffix' in kwargs: - kwargs['is_decoy'] = df['protein_label'].str.split(sep).apply( - lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) - else: - kwargs['is_decoy'] = df['protein_label'].str.split(sep).apply( - lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) - else: - if 'decoy_suffix' in kwargs: - kwargs['is_decoy'] = df['protein_label'].apply( - lambda s: all(x.endswith(kwargs['decoy_suffix']) for x in s)) - else: - kwargs['is_decoy'] = df['protein_label'].apply( - lambda s: all(x.startswith(kwargs.get('decoy_prefix', 'DECOY_')) for x in s)) - - return aux.filter(df, **kwargs) diff --git a/pyteomics/traml.py b/pyteomics/traml.py deleted file mode 100644 index 66ed27e675eb0cdc8afe873eebc35b9c1efb2ad5..0000000000000000000000000000000000000000 --- a/pyteomics/traml.py +++ /dev/null @@ -1,235 +0,0 @@ -""" -traml - targeted MS transition data in TraML format -=================================================== - -Summary -------- - -TraML is a standard rich XML-format for targeted mass spectrometry method definitions. 
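A brief usage sketch of the X!Tandem reader and target-decoy filter deleted above; the file name is a placeholder:

    # Hypothetical usage of the removed tandem.py interface.
    from pyteomics import tandem

    with tandem.read('output.t.xml') as psms:
        for psm in psms:                            # one dict per PSM
            print(psm['expect'], psm['protein'][0]['label'])

    # Keep only PSMs passing 1% FDR via the built-in TDA machinery.
    for psm in tandem.filter('output.t.xml', fdr=0.01):
        print(psm['expect'])
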
-Please refer to `psidev.info <http://www.psidev.info/traml>`_ -for the detailed specification of the format and structure of TraML files. - -This module provides a minimalistic way to extract information from TraML -files. You can use the object-oriented interface (:class:`TraML` instances) to -access target definitions and transitions. :class:`TraML` objects also support -indexing with entity IDs directly. - -Data access ------------ - - :py:class:`TraML` - a class representing a single TraML file. - Other data access functions use this class internally. - - :py:func:`read` - iterate through transitions in TraML format. - - :py:func:`chain` - read multiple TraML files at once. - - :py:func:`chain.from_iterable` - read multiple files at once, using an - iterable of files. - -Controlled Vocabularies -~~~~~~~~~~~~~~~~~~~~~~~ -TraML relies on controlled vocabularies to describe its contents extensibly. See -`Controlled Vocabulary Terms <../data.html#controlled-vocabulary-terms-in-structured-data>`_ -for more details on how they are used. - -Handling Time Units and Other Qualified Quantities -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -TraML contains information which may be described using a variety of different time units. -See `Unit Handling <../data.html#unit-handling>`_ for more information. - -Deprecated functions --------------------- - - :py:func:`version_info` - get version information about the TraML file. - You can just read the corresponding attribute of the :py:class:`TraML` object. - - :py:func:`iterfind` - iterate over elements in a TraML file. - You can just call the corresponding method of the :py:class:`TraML` object. - -Dependencies ------------- - -This module requires :py:mod:`lxml`. - -------------------------------------------------------------------------------- -""" - -# Copyright 2018 Joshua Klein, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings -from . 
import xml, _schema_defaults, auxiliary as aux - - -class TraML(xml.MultiProcessingXML, xml.IndexSavingXML): - """Parser class for TraML files.""" - file_format = 'TraML' - _root_element = 'TraML' - _default_schema = _schema_defaults._traml_schema_defaults - _default_version = '1.0.0' - - _default_iter_tag = 'Transition' - _indexed_tags = { - 'Transition', - 'Peptide', - 'Compound', - 'Target', - 'Protein', - 'Compound', - } - - _element_handlers = xml.XML._element_handlers.copy() - _element_handlers.update({ - 'Modification': xml.XML._promote_empty_parameter_to_name, - 'Interpretation': xml.XML._promote_empty_parameter_to_name, - 'Software': xml.XML._promote_empty_parameter_to_name, - }) - - def __init__(self, *args, **kwargs): - kwargs.setdefault('retrieve_refs', True) - super(TraML, self).__init__(*args, **kwargs) - - def _get_info_smart(self, element, **kw): - kwargs = dict(kw) - rec = kwargs.pop('recursive', None) - info = self._get_info( - element, - recursive=(rec if rec is not None else True), - **kwargs) - return info - - def _retrieve_refs(self, info, **kwargs): - """Retrieves and embeds the data for each attribute in `info` that - ends in `Ref`. Removes the id attribute from `info`""" - for k, v in dict(info).items(): - if k[-3:] in {'Ref', 'ref'}: - if isinstance(v, str): - key = v - elif isinstance(v, dict): - key = v['ref'] - else: - if k != 'ref': - info[k[:-3]] = info.pop(k) - continue - try: - by_id = self.get_by_id(key, retrieve_refs=True) - except KeyError: - warnings.warn('Ignoring unresolved reference: ' + key) - else: - if k == 'ref': - info.update(by_id) - else: - # by_id.pop('id', None) - info[k[:-3]] = by_id - del info[k] - - - -def read(source, retrieve_refs=True, read_schema=False, iterative=True, use_index=False, huge_tree=False): - """Parse `source` and iterate through transitions. - - Parameters - ---------- - source : str or file - A path to a target TraML file or the file object itself. - - retrieve_refs : bool, optional - If :py:const:`True`, additional information from references will be - automatically added to the results. The file processing time will - increase. Default is :py:const:`True`. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the TraML header. Otherwise, use default parameters. - Not recommended without Internet connection or - if you don't like to get the related warnings. - - iterative : bool, optional - Defines whether iterative parsing should be used. It helps reduce - memory usage at almost the same parsing speed. Default is - :py:const:`True`. - - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - spectrum elements. Default is :py:const:`False`. - - huge_tree : bool, optional - This option is passed to the `lxml` parser and defines whether - security checks for XML tree depth and node size should be disabled. - Default is :py:const:`False`. - Enable this option for trusted files to avoid XMLSyntaxError exceptions - (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`). - - Returns - ------- - out : TraML - A :py:class:`TraML` object, suitable for iteration and possibly random access. - """ - - return TraML(source, retrieve_refs=retrieve_refs, read_schema=read_schema, iterative=iterative, - use_index=use_index, huge_tree=huge_tree) - - -def iterfind(source, path, **kwargs): - """Parse `source` and yield info on elements with specified local - name or by specified "XPath". - - .. 
note:: This function is provided for backward compatibility only. - If you do multiple :py:func:`iterfind` calls on one file, you should - create a :py:class:`TraML` object and use its - :py:meth:`!iterfind` method. - - Parameters - ---------- - source : str or file - File name or file-like object. - - path : str - Element name or XPath-like expression. Only local names separated - with slashes are accepted. An asterisk (`*`) means any element. - You can specify a single condition in the end, such as: - ``"/path/to/element[some_value>1.5]"`` - Note: you can do much more powerful filtering using plain Python. - The path can be absolute or "free". Please don't specify - namespaces. - - recursive : bool, optional - If :py:const:`False`, subelements will not be processed when - extracting info from elements. Default is :py:const:`True`. - - iterative : bool, optional - Specifies whether iterative XML parsing should be used. Iterative - parsing significantly reduces memory usage and may be just a little - slower. When `retrieve_refs` is :py:const:`True`, however, it is - highly recommended to disable iterative parsing if possible. - Default value is :py:const:`True`. - - read_schema : bool, optional - If :py:const:`True`, attempt to extract information from the XML schema - mentioned in the TraML header. Otherwise, use default - parameters. Not recommended without Internet connection or - if you don't like to get the related warnings. - - Returns - ------- - out : iterator - """ - return TraML(source, **kwargs).iterfind(path, **kwargs) - - -version_info = xml._make_version_info(TraML) - -chain = aux.ChainBase._make_chain(TraML) diff --git a/pyteomics/usi.py b/pyteomics/usi.py deleted file mode 100644 index 57a265b09f1d665f063d1144c979d5c38004995e..0000000000000000000000000000000000000000 --- a/pyteomics/usi.py +++ /dev/null @@ -1,527 +0,0 @@ -""" -usi - Universal Spectrum Identifier (USI) parser and minimal PROXI client -========================================================================= - -Summary ------- -`USI <http://www.psidev.info/usi>`_ is a standardized method of referencing a specific -spectrum in a dataset, possibly attached to an interpretation. This module includes a -:class:`USI` type which can represent these constructs, :meth:`~USI.parse` them and -reconstruct them. - -One use-case for USI is to request spectrum information from a `PROXI <http://www.psidev.info/proxi>`_ -service host. PROXI services are available from several of the major national proteomics data hosts, -including MassIVE, PeptideAtlas, PRIDE, and jPOST. - -.. seealso:: - LeDuc, Richard D., Eric W. Deutsch, Pierre-Alain Binz, Ryan T. Fellers, Anthony J. Cesnik, - Joshua A. Klein, Tim Van Den Bossche, et al. - "Proteomics Standards Initiative's ProForma 2.0: Unifying the Encoding of Proteoforms and Peptidoforms." - ArXiv:2109.11352 [q-Bio], September 23, 2021. http://arxiv.org/abs/2109.11352. - - - -Data access ------------ - - :py:class:`USI` for representing Universal Spectrum Identifiers. Call :meth:`USI.parse` to parse a USI - string. - - :py:func:`proxi` to request a USI from a remote service. Provides access to the PeptideAtlas, MassIVE, - PRIDE and jPOST hosts. 
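A usage sketch for the TraML class deleted above; the file name and the entity ID are placeholders:

    # Hypothetical usage of the removed traml.py interface.
    from pyteomics import traml

    with traml.TraML('transitions.traML') as reader:
        for transition in reader:               # default iteration: <Transition>
            print(transition['id'])
        target = reader['Pep1_Transition1']     # direct indexing by entity ID
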
- -""" -import json -import warnings -import threading -import multiprocessing - -from collections import namedtuple, defaultdict - -try: - from multiprocessing.dummy import Pool as ThreadPool -except ImportError: - ThreadPool = None - -try: - from urllib2 import Request, urlopen -except ImportError: - from urllib.request import Request, urlopen - -try: - import numpy as np - - def coerce_array(array_data): - return np.array([float(v) for v in array_data]) - -except ImportError: - - def coerce_array(array_data): - return [float(v) for v in array_data] - -from .auxiliary import PyteomicsError - - -class USI(namedtuple("USI", ['protocol', 'dataset', 'datafile', 'scan_identifier_type', 'scan_identifier', 'interpretation'])): - '''Represent a Universal Spectrum Identifier (USI). - - .. note:: - This implementation will capture the interpretation component but will not interpret it at this time. - - Attributes - ---------- - protocol: str - The protocol to use to access the data (usually mzspec) - dataset: str - The name or accession number for the dataset the spectrum residues in - datafile: str - The basename of the data file from :attr:`dataset` to retrieve the spectrum from - scan_identifier_type: str - The format of the scan identifier, one of (scan, index, nativeId, trace) - scan_identifier: str - A usually numerical but potentially comma separated value encoded as a string to uniquely - identify the spectrum to be recovered from :attr:`datafile` in :attr:`dataset`. - interpretation: str - The trailing material of the USI, such as the ProForma peptide sequence and charge - ''' - def __str__(self): - return ':'.join(filter(lambda x: x is not None, self)) - - @classmethod - def parse(cls, usi): - '''Parse a USI string into a :class:`USI` object. - - Parameters - ---------- - usi: str - The USI string to parse - - Returns - ------- - USI - ''' - return cls(*_usi_parser(str(usi))) - - -def cast_numeric(value): - try: - return int(value) - except ValueError: - pass - try: - return float(value) - except ValueError: - return value - - -def _usi_parser(usi): - tokens = usi.split(":", 5) - protocol = tokens[0] - dataset = tokens[1] - datafile = tokens[2] - scan_identifier_type = tokens[3] - scan_identifier = tokens[4] - try: - interpretation = tokens[5] - except IndexError: - interpretation = None - return (protocol, dataset, datafile, scan_identifier_type, scan_identifier, interpretation) - - -class _PROXIBackend(object): - '''A base class for all PROXI backends to implement the gory details of HTTP requests - and protocol parsing. - - If special processing needs to be done to interpret the spectrum returned from the service - provider, override the :meth:`_coerce` method. - - If extra information needs to be provided to the service provider for them to fulfill the - request not passed through the URL, override the :meth:`_request` method. - - Attributes - ---------- - name: str - The name of the backend service - url_template: str - The URL with {} fields to populate with the USI and any other relevant options, like protocol version - or the like. - options: dict - Additional options to be used when preparing the request URL. 
- ''' - def __init__(self, name, url_template, **kwargs): - kwargs.setdefault('version', '0.1') - self.name = name - self.url_template = url_template - self.options = kwargs - - def __repr__(self): - return "{self.__class__.__name__}({self.options})".format(self=self) - - def _request(self, usi): - url = self.url_template.format(usi=usi, **self.options) - req = Request(url) - response = urlopen(req) - if response.getcode() != 200: - raise ValueError("PROXI Service Response Code %r" % (response.getcode())) - data = response.read().decode("utf-8") - data = json.loads(data) - return data - - def get(self, usi): - '''Retrieve a ``USI`` from the host PROXI service over the network. - - Parameters - ---------- - usi : str or :class:`USI` - The universal spectrum identifier to retrieve. - - Returns - ------- - dict: - The spectrum as represented by the requested PROXI host. - ''' - data = self._request(usi) - result = self._coerce(data) - return result - - def _coerce(self, data): - '''Override and extend this method to change how the spectrum information is refined. - - This implementation just deals with properly formatting the peak arrays and doing minor - cosmetic name normalization. - - Parameters - ---------- - data: dict - The raw mzSpecML representation parsed from JSON - - Returns - ------- - dict: - The coerced spectrum data of appropriate types - ''' - if isinstance(data, list): - data_collection = data - data = data_collection[0] - result = {} - result['attributes'] = data.pop('attributes', []) - for attrib in result['attributes']: - if 'value' in attrib and isinstance(attrib['value'], str) and attrib['value'][0].isdigit(): - try: - attrib['value'] = cast_numeric(attrib['value']) - except TypeError: - continue - result['m/z array'] = coerce_array(data.pop('mzs', [])) - result['intensity array'] = coerce_array(data.pop('intensities', [])) - for key, value in data.items(): - if key in result: - raise ValueError( - "Attempting to set explicit value for {key!r}".format(key=key)) - result[key] = value - return result - - def __call__(self, usi): - return self.get(usi) - - -class PeptideAtlasBackend(_PROXIBackend): - _url_template = "http://www.peptideatlas.org/api/proxi/v{version}/spectra?resultType=full&usi={usi!s}" - - def __init__(self, **kwargs): - - super(PeptideAtlasBackend, self).__init__( - 'PeptideAtlas', self._url_template, **kwargs) - - -class MassIVEBackend(_PROXIBackend): - - _url_template = "http://massive.ucsd.edu/ProteoSAFe/proxi/v{version}/spectra?resultType=full&usi={usi}" - - def __init__(self, **kwargs): - super(MassIVEBackend, self).__init__( - 'MassIVE', self._url_template, **kwargs) - - -class PRIDEBackend(_PROXIBackend): - _url_template = "http://wwwdev.ebi.ac.uk/pride/proxi/archive/v{version}/spectra?resultType=full&usi={usi}" - - def __init__(self, **kwargs): - super(PRIDEBackend, self).__init__( - 'PRIDE', self._url_template, **kwargs) - - -class JPOSTBackend(_PROXIBackend): - _url_template = 'https://repository.jpostdb.org/proxi/spectra?resultType=full&usi={usi}' - - def __init__(self, **kwargs): - super(JPOSTBackend, self).__init__('jPOST', self._url_template, **kwargs) - kwargs.pop("version", None) - - -class ProteomeExchangeBackend(_PROXIBackend): - _url_template = 'http://proteomecentral.proteomexchange.org/api/proxi/v{version}/spectra?resultType=full&usi={usi!s}' - - def __init__(self, **kwargs): - - super(ProteomeExchangeBackend, self).__init__( - 'ProteomeExchange', self._url_template, **kwargs) - - -class PROXIAggregator(object): - '''Aggregate across 
multiple PROXI servers. - - Will attempt to coalesce responses from responding servers into a single spectrum - representation. - - Attributes - ---------- - backends : :class:`dict` mapping :class:`str` to :class:`_PROXIBackend` - The backend servers to query. Defaults to the set of all available backends. - n_threads : int - The number of threads to run concurrently while making requests. Defaults - to the number of servers to query. - timeout : float - The number of seconds to wait for a response. - ephemeral_pool : bool - Whether or not to tear down the thread pool between requests. - ''' - - _coalesce_resolution_methods = ("first", ) - - def __init__(self, backends=None, n_threads=None, timeout=15, merge=True, ephemeral_pool=True, **kwargs): - if backends is None: - backends = {k: v() for k, v in _proxies.items()} - if n_threads is None: - n_threads = len(backends) - - self.lock = threading.RLock() - - self.timeout = timeout - self.backends = backends - self.n_threads = n_threads - self.ephemeral_pool = ephemeral_pool - self.pool = None - self.merge = merge - - def _init_pool(self): - if ThreadPool is None: - return False - if self.pool is not None: - return True - with self.lock: - if self.pool is None: - self.pool = ThreadPool(self.n_threads) - return True - - def _clean_up_pool(self): - if self.pool: - self.pool.close() - self.pool.terminate() - self.pool = None - - def _fetch_usi(self, usi): - use_pool = self._init_pool() - agg = [] - if use_pool: - with self.lock: - for backend in self.backends.values(): - result = self.pool.apply_async(backend.get, (usi, )) - agg.append((backend, result)) - tmp = [] - for backend, res in agg: - try: - res = res.get(self.timeout) - tmp.append((backend, res)) - except (multiprocessing.TimeoutError, Exception) as err: - tmp.append((backend, err)) - agg = tmp - if self.ephemeral_pool: - self._clean_up_pool() - else: - for backend in self.backends.values(): - try: - agg.append((backend, backend.get(usi))) - except Exception as err: - agg.append((backend, err)) - continue - return agg - - def coalesce(self, responses, method='first'): - '''Merge responses from disparate servers into a single spectrum representation. - - The merging process will use the first of every array encountered, and all unique - attributes. - - Parameters - ---------- - responses : list - A list of response values, pairs (:class:`_PROXIBackend` and either - :class:`dict` or :class:`Exception`). - method : str - The name of the coalescence technique to use. Currently only "first" is - supported. 
- - Returns - ------- - result : :class:`dict` - The coalesced spectrum - ''' - if method not in self._coalesce_resolution_methods: - raise ValueError("Coalescence method %r not recognized" % (method, )) - - def collapse_attribute(values): - try: - acc = list(set(v['value'] for v in values)) - except TypeError: - acc = [] - for v in values: - if v['value'] not in acc: - acc.append(v['value']) - - result = [] - template = values[0].copy() - for v in acc: - t = template.copy() - t['value'] = v - result.append(t) - return result - - arrays = {} - attributes = defaultdict(list) - - found = [] - error = [] - - for backend, response in responses: - if isinstance(response, Exception): - error.append((backend.name, (response))) - continue - else: - found.append(backend.name) - for array_name in ('m/z array', 'intensity array'): - if array_name not in arrays: - arrays[array_name] = response[array_name] - else: - array = response[array_name] - if len(array) != len(arrays[array_name]): - warnings.warn("Length mismatch from %s for %s" % - (backend.name, array_name)) - arrays[array_name] = max((array, arrays[array_name]), key=len) - elif not np.allclose(array, arrays[array_name]): - warnings.warn("Value mismatch from %s for %s" % - (backend.name, array_name)) - for attr in response['attributes']: - attributes[attr.get('accession', attr.get('name'))].append(attr) - - finalized_attributes = [] - for k, v in attributes.items(): - finalized_attributes.extend(collapse_attribute(v)) - - result = {"responders": found, 'errors': error, 'attributes': finalized_attributes} - result.update(arrays) - if 'm/z array' not in result: - raise ValueError("No valid responses found") - return result - - def tag_with_source(self, responses): - '''Mark each response with its source. - - Parameters - ---------- - responses : list - A list of response values, pairs (:class:`_PROXIBackend` and either - :class:`dict` or :class:`Exception`). - - Returns - ------- - result : list[dict] - The tagged :class:`dict` for each response. - ''' - output = [] - for backend, response in responses: - if isinstance(response, dict): - response['source'] = backend - else: - response = { - "source": backend, - "error": response - } - output.append(response) - return output - - def get(self, usi): - '''Retrieve a ``USI`` from each PROXI service over the network. - - Parameters - ---------- - usi : str or :class:`USI` - The universal spectrum identifier to retrieve. - - Returns - ------- - result : dict or list[dict] - The spectrum coalesced from all responding PROXI hosts if :attr:`merge` is :const:`True`, - or a list of responses marked by host. - ''' - agg = self._fetch_usi(usi) - if self.merge: - return self.coalesce(agg) - else: - return self.tag_with_source(agg) - - def __call__(self, usi): - return self.get(usi) - - def __del__(self): - self._clean_up_pool() - -_proxies = { - "peptide_atlas": PeptideAtlasBackend, - "massive": MassIVEBackend, - "pride": PRIDEBackend, - "jpost": JPOSTBackend, - 'proteome_exchange': ProteomeExchangeBackend, -} - -default_backend = 'peptide_atlas' - -AGGREGATOR_KEY = "aggregator" -AGGREGATOR = PROXIAggregator() - - -def proxi(usi, backend=default_backend, **kwargs): - '''Retrieve a ``USI`` from a `PROXI <http://www.psidev.info/proxi>`_. - - Parameters - ---------- - usi : str or :class:`USI` - The universal spectrum identifier to request. 
- backend : str or :class:`Callable` - Either the name of a PROXI host (peptide_atlas, massive, pride, jpost, or aggregator), - or a callable object (which :class:`_PROXIBackend` instances are) which will be used - to resolve the USI. The "aggregator" backend will use a :class:`PROXIAggregator` instance - which will request the same USI from all the registered servers and attempt to merge their - responses into a single whole. See :meth:`PROXIAggregator.coalesce` for more details on the - merging process. - **kwargs: - extra arguments passed when constructing the backend by name. - - Returns - ------- - dict : - The spectrum as represented by the requested PROXI host. - ''' - if isinstance(backend, str): - if backend == AGGREGATOR_KEY: - backend = AGGREGATOR - elif backend in _proxies: - backend = _proxies[backend](**kwargs) - else: - raise PyteomicsError("Unknown PROXI backend name: {}.".format(backend)) - elif isinstance(backend, type) and issubclass(backend, (_PROXIBackend, PROXIAggregator)): - backend = backend(**kwargs) - elif callable(backend): - backend = backend - else: - raise TypeError("Unrecognized backend type: {0.__name__}".format(type(backend))) - return backend(usi) diff --git a/pyteomics/version.py b/pyteomics/version.py deleted file mode 100644 index 66aca50842f95c0f102e86e70cfab06152ca0d2d..0000000000000000000000000000000000000000 --- a/pyteomics/version.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -version - Pyteomics version information -======================================= - -This module is provided for convenience and captures information about the current version number of Pyteomics. - -Classes -------- - - :py:class:`VersionInfo` - a namedtuple for version numbers that supports comparisons and can be initialized - from a version string. - -Constants ---------- - - :py:const:`version` - a string with the current version. - - :py:const:`version_info` - a tuple with structured information about the current version. 
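A sketch of calling the proxi() entry point deleted above; it assumes network access and a live PROXI host:

    # Hypothetical request against a PROXI host (requires network access).
    from pyteomics import usi

    spectrum = usi.proxi(
        'mzspec:PXD000561:Adult_Frontalcortex_bRP_Elite_85_f09:scan:17555:VLHPLEGAVVIIFK/2',
        backend='massive')        # or backend='aggregator' to merge all hosts
    print(spectrum['m/z array'][:5], spectrum['intensity array'][:5])
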
- -""" - -__version__ = '4.6.4b3' - -from collections import namedtuple -import re - - -class VersionInfo(namedtuple('VersionInfo', ('major', 'minor', 'micro', 'releaselevel', 'serial'))): - """Tuple mimicking :py:const:`sys.version_info`""" - def __new__(cls, version_str): - if isinstance(version_str, str): - groups = re.match(r'(\d+)\.(\d+)(?:\.)?(\d+)?([a-zA-Z]+)?(\d+)?', version_str).groups() - inst = super(VersionInfo, cls).__new__(cls, *groups) - else: - inst = super(VersionInfo, cls).__new__(cls, *(str(x) if x is not None else x for x in version_str)) - inst._version_str = version_str - inst._version_ints = tuple(int(x) if isinstance(x, str) and x.isdigit() else 0 for x in inst) - return inst - - def __str__(self): - return 'Version {}'.format(self._version_str) - - def __lt__(self, other): - if not isinstance(other, VersionInfo): - other = VersionInfo(other) - return self._version_ints < other._version_ints - - def __gt__(self, other): - if not isinstance(other, VersionInfo): - other = VersionInfo(other) - return self._version_ints > other._version_ints - - def __le__(self, other): - return self == other or self < other - - def __ge__(self, other): - return self == other or self > other - - def __eq__(self, other): - if not isinstance(other, VersionInfo): - other = VersionInfo(other) - return super(VersionInfo, self).__eq__(other) - - -version_info = VersionInfo(__version__) -version = __version__ diff --git a/pyteomics/xml.py b/pyteomics/xml.py deleted file mode 100644 index db960c7eb3b5fe4375971408bc6b2f23e65e21b2..0000000000000000000000000000000000000000 --- a/pyteomics/xml.py +++ /dev/null @@ -1,1335 +0,0 @@ -""" -xml - utilities for XML parsing -=============================== - -This module is not intended for end users. It implements the abstract classes -for all XML parsers, :py:class:`XML` and :py:class:`IndexedXML`, and some utility functions. - -Dependencies ------------- - -This module requres :py:mod:`lxml` and :py:mod:`numpy`. - --------------------------------------------------------------------------------- -""" - -# Copyright 2012 Anton Goloborodko, Lev Levitsky -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import re -import socket -from traceback import format_exc -import warnings -from collections import OrderedDict, namedtuple -from itertools import islice -from lxml import etree -import numpy as np - -from .auxiliary import FileReader, PyteomicsError, basestring, _file_obj, HierarchicalOffsetIndex -from .auxiliary import unitint, unitfloat, unitstr, cvstr -from .auxiliary import _keepstate_method as _keepstate -from .auxiliary import BinaryDataArrayTransformer -from .auxiliary import TaskMappingMixin, IndexedReaderMixin, IndexSavingMixin - -try: # Python 2.7 - from urllib2 import urlopen, URLError -except ImportError: # Python 3.x - from urllib.request import urlopen, URLError - - -def _local_name(element): - """Strip namespace from the XML element's name""" - tag = element.tag - if tag and tag[0] == '{': - return tag.rpartition('}')[2] - return tag - - -def xsd_parser(schema_url): - """Parse an XSD file from the specified URL into a schema dictionary - that can be used by :class:`XML` parsers to automatically cast data to - the appropriate type. - - Parameters - ---------- - schema_url : str - The URL to retrieve the schema from - - Returns - ------- - dict - """ - ret = {} - if not (schema_url.startswith('http://') or - schema_url.startswith('https://') or - schema_url.startswith('file://')): - schema_url = 'file://' + schema_url - schema_file = urlopen(schema_url) - p = etree.XMLParser(remove_comments=True) - schema_tree = etree.parse(schema_file, parser=p) - types = {'ints': {'int', 'long', 'nonNegativeInteger', 'positiveInt', - 'integer', 'unsignedInt'}, - 'floats': {'float', 'double'}, - 'bools': {'boolean'}, - 'intlists': {'listOfIntegers'}, - 'floatlists': {'listOfFloats'}, - 'charlists': {'listOfChars', 'listOfCharsOrAny'}} - for k, val in types.items(): - tuples = set() - for elem in schema_tree.iter(): - if _local_name(elem) == 'attribute' and elem.attrib.get( - 'type', '').split(':')[-1] in val: - anc = elem.getparent() - anc_name = _local_name(anc) - while not ( - (anc_name == 'complexType' and 'name' in anc.attrib) or anc_name == 'element'): - anc = anc.getparent() - anc_name = _local_name(anc) - if anc is None: - break - else: - if anc_name == 'complexType': - elnames = [x.attrib['name'] for x in - schema_tree.iter() - if x.attrib.get('type', '').split(':')[-1] == anc.attrib['name']] - else: - elnames = (anc.attrib['name'],) - for elname in elnames: - tuples.add( - (elname, elem.attrib['name'])) - ret[k] = tuples - ret['lists'] = set(elem.attrib['name'] for elem in schema_tree.xpath( - '//*[local-name()="element"]') if 'name' in elem.attrib and - elem.attrib.get('maxOccurs', '1') != '1') - return ret - - -class XMLValueConverter(object): - # Adapted from http://stackoverflow.com/questions/2764269/parsing-an-xsduration-datatype-into-a-python-datetime-timedelta-object - _duration_parser = re.compile( - (r'(?P<sign>-?)P(?:(?P<years>\d+\.?\d*)Y)?(?:(?P<months>\d+\.?\d*)M)?(?:(?P<days>\d+\.?\d*)D)?(?:T(?:(?P<hours>\d+\.?\d*)H)?(?:(?P<minutes>\d+\.?\d*)M)?(?:(?P<seconds>\d+\.?\d*)S)?)?')) - - @classmethod - def duration_str_to_float(cls, s): - # Not a duration, so pass along - if not s.startswith('P'): - try: - return unitfloat(s, 'duration') - except ValueError: - return unitstr(s, 'duration') - match = cls._duration_parser.search(s) - if match: - matchdict = match.groupdict() - hours = float(matchdict.get('hours', 0) or 0) - minutes = float(matchdict.get('minutes', 0) or 0) - seconds = float(matchdict.get('seconds', 0) or 0) - minutes += hours * 60. 
- minutes += (seconds / 60.) - return unitfloat(minutes, 'minute') - else: - return unitstr(s, 'duration') - - @classmethod - def str_to_bool(cls, s): - if s.lower() in {'true', '1', 'y'}: - return True - if s.lower() in {'false', '0', 'n'}: - return False - raise PyteomicsError('Cannot convert string to bool: ' + s) - - @classmethod - def str_to_num(cls, s, numtype): - return numtype(s) if s else None - - @classmethod - def to(cls, t): - def convert_from(s): - return cls.str_to_num(s, t) - return convert_from - - @classmethod - def converters(cls): - return { - 'ints': cls.to(unitint), 'floats': cls.to(unitfloat), 'bools': cls.str_to_bool, - 'intlists': lambda x: np.fromstring(x.replace('\n', ' '), dtype=int, sep=' '), - 'floatlists': lambda x: np.fromstring(x.replace('\n', ' '), sep=' '), - 'charlists': list, - 'duration': cls.duration_str_to_float - } - - -class _XMLParam(namedtuple("XMLParam", ("name", "value", "type"))): - '''A holder for semantic parameters used in several common XML formats - - Attributes - ---------- - name: :class:`~.cvstr` - The name of the attribute, carrying the accession and unit information - value: :class:`~.unitfloat`, :class:`~.unitint` or :class:`~.unitstr` - The value of the parameter - type: :class:`str` - The parameter's local XML tag name. - ''' - __slots__ = () - - def is_empty(self): - value = self.value - return value == "" or value is None - - -class XML(FileReader): - """Base class for all format-specific XML parsers. The instances can be used - as context managers and as iterators. - """ - # Configurable data - file_format = 'XML' - _root_element = None - _default_schema = {} - _read_schema = False - _default_version = 0 - _default_iter_tag = None - _default_iter_path = None - _structures_to_flatten = [] - _schema_location_param = 'schemaLocation' - _default_id_attr = 'id' - _huge_tree = False - _retrieve_refs_enabled = None # only some subclasses implement this - _iterative = True - - # Configurable plugin logic - _converters = XMLValueConverter.converters() - _element_handlers = {} - - # Must be implemented by subclasses - def _get_info_smart(self, element, **kwargs): - raise NotImplementedError - - def __init__(self, source, read_schema=None, iterative=None, build_id_cache=False, **kwargs): - """Create an XML parser object. - - Parameters - ---------- - source : str or file - File name or file-like object corresponding to an XML file. - read_schema : bool, optional - Defines whether schema file referenced in the file header - should be used to extract information about value conversion. - Default is :py:const:`False`. - iterative : bool, optional - Defines whether an :py:class:`ElementTree` object should be - constructed and stored on the instance or if iterative parsing - should be used instead. Iterative parsing keeps the memory usage - low for large XML files. Default is :py:const:`True`. - build_id_cache : bool, optional - Defines whether a dictionary mapping IDs to XML tree elements - should be built and stored on the instance. It is used in - :py:meth:`XML.get_by_id`, e.g. when using - :py:class:`pyteomics.mzid.MzIdentML` with ``retrieve_refs=True``. - huge_tree : bool, optional - This option is passed to the `lxml` parser and defines whether - security checks for XML tree depth and node size should be disabled. - Default is :py:const:`False`. - Enable this option for trusted files to avoid XMLSyntaxError exceptions - (e.g. `XMLSyntaxError: xmlSAX2Characters: huge text node`). 
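The duration handling above normalizes xsd:duration strings to minutes; a quick sketch of the observable behaviour:

    # Sketch: xsd:duration strings become unitfloat minutes; plain numbers
    # fall through with the generic 'duration' unit.
    from pyteomics.xml import XMLValueConverter

    print(XMLValueConverter.duration_str_to_float('PT1H30M'))  # 90.0 (minute)
    print(XMLValueConverter.duration_str_to_float('42.5'))     # 42.5 (duration)
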
- """ - - super(XML, self).__init__(source, mode='rb', parser_func=self.iterfind, pass_file=False, - args=(self._default_iter_path or self._default_iter_tag,), kwargs=kwargs) - if iterative is None: - iterative = self._iterative - if iterative: - self._tree = None - else: - self.build_tree() - if build_id_cache: - self.build_id_cache() - else: - self._id_dict = None - - self.version_info = self._get_version_info() - if read_schema is not None: - self._read_schema = read_schema - self.schema_info = self._get_schema_info(read_schema) - - self._converters_items = self._converters.items() - self._huge_tree = kwargs.get('huge_tree', self._huge_tree) - self._retrieve_refs_enabled = kwargs.get('retrieve_refs') - - def __reduce_ex__(self, protocol): - return self.__class__, ( - self._source_init, self._read_schema, self._tree is None, - False, - ), self.__getstate__() - - def __getstate__(self): - state = super(XML, self).__getstate__() - state['_huge_tree'] = self._huge_tree - state['_retrieve_refs_enabled'] = self._retrieve_refs_enabled - state['_id_dict'] = self._id_dict - return state - - def __setstate__(self, state): - super(XML, self).__setstate__(state) - self._huge_tree = state['_huge_tree'] - self._retrieve_refs_enabled = state['_retrieve_refs_enabled'] - self._id_dict = state['_id_dict'] - - @_keepstate - def _get_version_info(self): - """ - Provide version information about the XML file. - - Returns - ------- - out : tuple - A (version, schema URL) tuple, both elements are strings or None. - """ - for _, elem in etree.iterparse( - self._source, events=('start',), remove_comments=True, huge_tree=self._huge_tree): - if _local_name(elem) == self._root_element: - return (elem.attrib.get('version'), - elem.attrib.get(('{{{}}}'.format(elem.nsmap['xsi']) - if 'xsi' in elem.nsmap else '') + self._schema_location_param)) - - @_keepstate - def _get_schema_info(self, read_schema=True): - """Stores defaults for the schema, tries to retrieve the schema for - other versions. 
Keys are: 'floats', 'ints', 'bools', 'lists', - 'intlists', 'floatlists', 'charlists'.""" - if not read_schema: - return self._default_schema - - version, schema = self.version_info - if version == self._default_version: - return self._default_schema - - ret = {} - try: - if not schema: - schema_url = '' - raise PyteomicsError( - 'Schema information not found in {}.'.format(self.name)) - schema_url = schema.split()[-1] - ret = xsd_parser(schema_url) - except Exception as e: - if isinstance(e, (URLError, socket.error, socket.timeout)): - warnings.warn("Can't get the {0.file_format} schema for version " - "`{1}` from <{2}> at the moment.\n" - "Using defaults for {0._default_version}.\n" - "You can disable reading the schema by specifying " - "`read_schema=False`.".format(self, version, schema_url)) - else: - warnings.warn("Unknown {0.file_format} version `{1}`.\n" - "Attempt to use schema " - "information from <{2}> failed.\n" - "Exception information:\n{3}\n" - "Falling back to defaults for {0._default_version}\n" - "NOTE: This is just a warning, probably from a badly-" - "generated XML file.\nYou will still most probably get " - "decent results.\nLook here for suppressing warnings:\n" - "http://docs.python.org/library/warnings.html#" - "temporarily-suppressing-warnings\n" - "You can also disable reading the schema by specifying " - "`read_schema=False`.\n" - "If you think this shouldn't have happened, please " - "report this to\n" - "http://github.com/levitsky/pyteomics/issues\n" - "".format(self, version, schema_url, format_exc())) - ret = self._default_schema - return ret - - def _handle_param(self, element, **kwargs): - """Unpacks cvParam and userParam tags into key-value pairs""" - types = {'int': unitint, 'float': unitfloat, 'string': unitstr} - attribs = element.attrib - unit_info = None - unit_accesssion = None - if 'unitCvRef' in attribs or 'unitName' in attribs: - unit_accesssion = attribs.get('unitAccession') - unit_name = attribs.get('unitName', unit_accesssion) - unit_info = unit_name - accession = attribs.get('accession') - value = attribs.get('value', '') - try: - if attribs.get('type') in types: - value = types[attribs['type']](value, unit_info) - else: - value = unitfloat(value, unit_info) - except ValueError: - value = unitstr(value, unit_info) - - # return {cvstr(attribs['name'], accession, unit_accesssion): value} - return _XMLParam(cvstr(attribs['name'], accession, unit_accesssion), value, _local_name(element)) - - def _handle_referenceable_param_group(self, param_group_ref, **kwargs): - raise NotImplementedError() - return [] - - def _find_immediate_params(self, element, **kwargs): - return element.xpath( - './*[local-name()="cvParam" or local-name()="userParam" or local-name()="UserParam" or local-name()="referenceableParamGroupRef"]') - - def _insert_param(self, info_dict, param): - key = param.name - if key in info_dict: - if isinstance(info_dict[key], list): - info_dict[key].append(param.value) - else: - info_dict[key] = [info_dict[key], param.value] - else: - info_dict[key] = param.value - - def _promote_empty_parameter_to_name(self, info, params): - empty_values = [] - not_empty_values = [] - for param in params: - if param.is_empty(): - empty_values.append(param) - else: - not_empty_values.append(param) - - if len(empty_values) == 1 and 'name' not in info: - info['name'] = empty_values[0].name - return info, not_empty_values - return info, params - - def _get_info(self, element, **kwargs): - """Extract info from element's attributes, possibly recursive. 
- <cvParam> and <userParam> elements are treated in a special way.""" - try: - name = kwargs.pop('ename') - except KeyError: - name = _local_name(element) - schema_info = self.schema_info - if name in {'cvParam', 'userParam', 'UserParam'}: - return self._handle_param(element, **kwargs) - elif name == "referenceableParamGroupRef": - return self._handle_referenceable_param_group(element, **kwargs) - - info = dict(element.attrib) - # process subelements - params = [] - if kwargs.get('recursive'): - for child in element.iterchildren(): - cname = _local_name(child) - if cname in {'cvParam', 'userParam', 'UserParam'}: - newinfo = self._handle_param(child, **kwargs) - params.append(newinfo) - elif cname == "referenceableParamGroupRef": - params.extend(self._handle_referenceable_param_group(child, **kwargs)) - else: - if cname not in schema_info['lists']: - info[cname] = self._get_info_smart(child, ename=cname, **kwargs) - else: - info.setdefault(cname, []).append( - self._get_info_smart(child, ename=cname, **kwargs)) - else: - # handle the case where we do not want to unpack all children, but - # *Param tags are considered part of the current entity, semantically - for child in self._find_immediate_params(element, **kwargs): - param_or_group = self._handle_param(child, **kwargs) - if isinstance(param_or_group, list): - params.extend(param_or_group) - else: - params.append(param_or_group) - - handler = self._element_handlers.get(name) - if handler is not None: - info, params = handler(self, info, params) - - for param in params: - self._insert_param(info, param) - - # process element text - if element.text: - stext = element.text.strip() - if stext: - if info: - info[name] = stext - else: - return stext - - # convert types - try: - for k, v in info.items(): - for t, a in self._converters_items: - if t in schema_info and (name, k) in schema_info[t]: - info[k] = a(v) - except ValueError as e: - message = 'Error when converting types: {}'.format(e.args) - if not self._read_schema: - message += '\nTry reading the file with read_schema=True' - raise PyteomicsError(message) - - # resolve refs - if kwargs.get('retrieve_refs', self._retrieve_refs_enabled): - self._retrieve_refs(info, **kwargs) - - # flatten the excessive nesting - for k, v in dict(info).items(): - if k in self._structures_to_flatten: - if isinstance(v, list): - for vi in v: - info.update(vi) - else: - info.update(v) - del info[k] - - # another simplification - for k, v in dict(info).items(): - if isinstance(v, dict) and 'name' in v and len(v) == 1: - info[k] = v['name'] - if len(info) == 2 and 'name' in info and ( - 'value' in info or 'values' in info): - name = info.pop('name') - info = {name: info.popitem()[1]} - return info - - @_keepstate - def build_tree(self): - """Build and store the :py:class:`ElementTree` instance - for the underlying file""" - p = etree.XMLParser(remove_comments=True, huge_tree=True) - self._tree = etree.parse(self._source, parser=p) - - def clear_tree(self): - """Remove the saved :py:class:`ElementTree`.""" - self._tree = None - - def _retrieve_refs(self, info, **kwargs): - """Retrieves and embeds the data for each attribute in `info` that - ends in _ref. Removes the id attribute from `info`. - - This implementation is a stub and must be implemented for each specific - subclass. It is only called if :attr:`retrieve_refs` """ - raise NotImplementedError( - ("_retrieve_refs is not implemented for {}. 
" - "Do not use `retrieve_refs=True`.").format( - self.__class__.__name__)) - - def iterfind(self, path, **kwargs): - """Parse the XML and yield info on elements with specified local - name or by specified "XPath". - - Parameters - ---------- - path : str - Element name or XPath-like expression. The path is very close to - full XPath syntax, but local names should be used for all elements in the path. - They will be substituted with local-name() checks, up to the (first) predicate. - The path can be absolute or "free". Please don't specify namespaces. - **kwargs : passed to :py:meth:`self._get_info_smart`. - - Returns - ------- - out : iterator - """ - return Iterfind(self, path, **kwargs) - - @_keepstate - def _iterfind_impl(self, path, **kwargs): - """Parse the XML and yield info on elements with specified local - name or by specified "XPath". - - Parameters - ---------- - path : str - Element name or XPath-like expression. The path is very close to - full XPath syntax, but local names should be used for all elements in the path. - They will be substituted with local-name() checks, up to the (first) predicate. - The path can be absolute or "free". Please don't specify namespaces. - **kwargs : passed to :py:meth:`self._get_info_smart`. - - Returns - ------- - out : iterator - """ - try: - path, tail = re.match(pattern_path, path).groups() - except AttributeError: - raise PyteomicsError('Invalid path: ' + path) - if path[:2] == '//' or path[0] != '/': - absolute = False - if path[:2] == '//': - path = path[2:] - if path[0] == '/' or '//' in path: - raise PyteomicsError("Too many /'s in a row.") - else: - absolute = True - path = path[1:] - nodes = path.rstrip('/').split('/') - if not nodes: - raise PyteomicsError('Invalid path: ' + path) - - if not self._tree: - if tail: - if tail[0] == '[': - tail = '(.)' + tail - else: - raise PyteomicsError('Cannot parse path tail: ' + tail) - xpath = etree.XPath(tail) - localname = nodes[0] - found = False - for ev, elem in etree.iterparse(self, events=('start', 'end'), remove_comments=True, huge_tree=self._huge_tree): - name_lc = _local_name(elem) - if ev == 'start': - if name_lc == localname or localname == '*': - found += 1 - else: - if name_lc == localname or localname == '*': - if (absolute and elem.getparent() is None) or not absolute: - for child in get_rel_path(elem, nodes[1:]): - if tail: - for elem in xpath(child): - info = self._get_info_smart(elem, **kwargs) - yield info - else: - info = self._get_info_smart(child, **kwargs) - yield info - if not localname == '*': - found -= 1 - if not found: - elem.clear() - else: - xpath = ('/' if absolute else '//') + '/'.join( - '*[local-name()="{}"]'.format(node) if node != '*' else '*' for node in nodes ) + tail - for elem in self._tree.xpath(xpath): - info = self._get_info_smart(elem, **kwargs) - yield info - - @_keepstate - def build_id_cache(self): - """Construct a cache for each element in the document, indexed by id - attribute""" - stack = 0 - id_dict = {} - for event, elem in etree.iterparse(self._source, events=('start', 'end'), - remove_comments=True, huge_tree=self._huge_tree): - if event == 'start': - if 'id' in elem.attrib: - stack += 1 - else: - if 'id' in elem.attrib: - stack -= 1 - id_dict[elem.attrib['id']] = elem - elif stack == 0: - elem.clear() - self._id_dict = id_dict - - def clear_id_cache(self): - """Clear the element ID cache""" - self._id_dict = {} - - def _find_by_id_no_reset(self, elem_id, id_key=None): - """ - An almost exact copy of :meth:`get_by_id` with the difference 
that it does - not reset the file reader's position before iterative parsing. - - Parameters - ---------- - elem_id : str - The element id to query for - - Returns - ------- - lxml.Element - """ - found = False - if id_key is None: - id_key = self._default_id_attr - for event, elem in etree.iterparse( - self._source, events=('start', 'end'), remove_comments=True, huge_tree=self._huge_tree): - if event == 'start': - if elem.attrib.get(id_key) == elem_id: - found = True - else: - if elem.attrib.get(id_key) == elem_id: - return elem - if not found: - elem.clear() - raise KeyError(elem_id) - - @_keepstate - def get_by_id(self, elem_id, **kwargs): - """Parse the file and return the element with `id` attribute equal - to `elem_id`. Returns :py:const:`None` if no such element is found. - - Parameters - ---------- - elem_id : str - The value of the `id` attribute to match. - - Returns - ------- - out : :py:class:`dict` or :py:const:`None` - """ - if not self._id_dict: - elem = self._find_by_id_no_reset(elem_id) - else: - elem = self._id_dict[elem_id] - return self._get_info_smart(elem, **kwargs) - - -# XPath emulator tools -pattern_path = re.compile(r'([\w/*]*)(.*)') - - -def get_rel_path(element, names): - if not names: - yield element - else: - for child in element.iterchildren(): - if names[0] == '*' or _local_name(child) == names[0]: - if len(names) == 1: - yield child - else: - for gchild in get_rel_path(child, names[1:]): - yield gchild - - -def xpath(tree, path, ns=None): - """Return the results of XPath query with added namespaces. - Assumes the ns declaration is on the root element or absent. - - Parameters - ---------- - - tree : ElementTree - path : str - ns : str or None, optional - """ - if hasattr(tree, 'getroot'): - root = tree.getroot() - else: - root = tree - while root.getparent() is not None: - root = root.getparent() - ns = root.nsmap.get(ns) - - def repl(m): - s = m.group(1) - if not ns: return s - if not s: return 'd:' - return '/d:' - new_path = re.sub(r'(\/|^)(?![\*\/])', repl, path) - n_s = ({'d': ns} if ns else None) - return tree.xpath(new_path, namespaces=n_s) - - -def _make_version_info(cls): - def version_info(source): - return cls(source).version_info - version_info.__doc__ = """ - Provide version information about the {0.file_format} file. - - .. note:: This function is provided for backward compatibility only. - It simply creates an :py:class:`{0.__name__}` instance - and returns its :py:data:`!version_info` attribute. - - Parameters - ---------- - source : str or file - File name or file-like object. - - Returns - ------- - out : tuple - A (version, schema URL) tuple, both elements are strings or None. - """.format(cls) - return version_info - - -class ByteCountingXMLScanner(_file_obj): - """ - Carry out the construction of a byte offset index for `source` XML file - for each type of tag in :attr:`indexed_tags`. - - Inherits from :py:class:`pyteomics.auxiliary._file_obj` to support the object-oriented - :py:func:`_keep_state` interface. - """ - entities = { - 'quot': '"', - 'amp': '&', - 'apos': "'", - 'lt': '<', - 'gt': '>', - } - - xml_entity_pattern = re.compile(r"&({});".format('|'.join(entities.keys()))) - - def __init__(self, source, indexed_tags, block_size=1000000): - """ - Parameters - ---------- - indexed_tags : iterable of bytes - The XML tags (without namespaces) to build indices for. - block_size : int, optional - The size of each chunk or "block" of the file to hold in memory as a - partitioned string at any given time. Defaults to `1000000`.
- """ - super(ByteCountingXMLScanner, self).__init__(source, 'rb') - self.indexed_tags = ensure_bytes(indexed_tags) - self.block_size = block_size - - def _chunk_iterator(self): - """ - Read a file in large blocks and chunk up each block into parts - resembling XML tags, yielding each chunk. - - Assumes the file is opened in binary mode. - """ - f = self.file - read_size = self.block_size - delim = b'<' - buff = f.read(read_size) - started_with_delim = buff.startswith(delim) - parts = buff.split(delim) - tail = parts[-1] - front = parts[:-1] - i = 0 - for part in front: - i += 1 - if part == b"": - continue - if i == 1: - if started_with_delim: - yield delim + part - else: - yield part - else: - yield delim + part - running = True - while running: - buff = f.read(read_size) - if not buff: - running = False - buff = tail - else: - buff = tail + buff - parts = buff.split(delim) - tail = parts[-1] - front = parts[:-1] - for part in front: - yield delim + part - - def _generate_offsets(self): - """ - Iterate over the tag-delimited chunks of an XML file produced by - :meth:`_chunk_iterator`, tracking the byte count for each chunk. When a chunk starts - a tag whose name matches a name in :attr:`indexed_tags`, yield the byte offset, the tag type, and its attributes. - - Yields - ------ - offset : int - The byte offset at which the matched tag begins - tag_type : bytes - The type of tag matched - attr_dict : dict - The attributes on the matched tag - """ - i = 0 - packed = b"|".join(self.indexed_tags) - pattern = re.compile((r"^\s*<(%s)\s" % packed.decode()).encode()) - attrs = re.compile(br"(\S+)=[\"']([^\"']*)[\"']") - for line in self._chunk_iterator(): - match = pattern.match(line) - if match: - yield i, match.group(1), dict(attrs.findall(line)) - i += len(line) - - def _entity_sub_cb(self, match): - ent = match.group(1) - return self.entities[ent] - - def replace_entities(self, key): - '''Replace XML entities in a string with their character representation - - Uses the minimal mapping of XML entities pre-defined for all XML documents and - does not attempt to deal with external DTD defined entities. This mapping is found - in :attr:`entities`. - - Parameters - ---------- - key : str - The string to substitute - - Returns - ------- - str - ''' - return self.xml_entity_pattern.sub(self._entity_sub_cb, key) - - @_keepstate - def build_byte_index(self, lookup_id_key_mapping=None): - """ - Builds a byte offset index for one or more types of tags. - - Parameters - ---------- - lookup_id_key_mapping : Mapping, optional - A mapping from tag name to the attribute to look up the identity - for each entity of that type to be extracted. Defaults to 'id' for - each type of tag.
- - Returns - ------- - defaultdict(dict) - Mapping from tag type to dict from identifier to byte offset - """ - if lookup_id_key_mapping is None: - lookup_id_key_mapping = {} - lookup_id_key_mapping = {ensure_bytes_single(key): ensure_bytes_single(value) - for key, value in lookup_id_key_mapping.items()} - - for name in self.indexed_tags: - bname = ensure_bytes_single(name) - lookup_id_key_mapping.setdefault(bname, 'id') - lookup_id_key_mapping[bname] = ensure_bytes_single(lookup_id_key_mapping[bname]) - - indices = HierarchicalOffsetIndex() - g = self._generate_offsets() - for offset, offset_type, attrs in g: - k = attrs[lookup_id_key_mapping[offset_type]].decode('utf-8') - if '&' in k: - k = self.replace_entities(k) - indices[offset_type.decode('utf-8')][k] = offset - return indices - - @classmethod - def scan(cls, source, indexed_tags): - inst = cls(source, indexed_tags) - return inst.build_byte_index() - - -class TagSpecificXMLByteIndex(object): - """ - Encapsulates the construction and querying of a byte offset index - for a set of XML tags. - - This type mimics an immutable Mapping. - - Attributes - ---------- - indexed_tags : iterable of bytes - The tag names to index, not including a namespace - offsets : defaultdict(OrderedDict(str, int)) - The hierarchy of byte offsets organized ``{"tag_type": {"id": byte_offset}}`` - indexed_tag_keys: dict(str, str) - A mapping from tag name to unique identifier attribute - - Parameters - ---------- - indexed_tags : iterable of bytes - The tag names to include in the index - - """ - _default_indexed_tags = [] - _default_keys = {} - _scanner_class = ByteCountingXMLScanner - - def __init__(self, source, indexed_tags=None, keys=None): - if keys is None: - keys = self._default_keys.copy() - if indexed_tags is None: - indexed_tags = self._default_indexed_tags - self.indexed_tags = indexed_tags - self.indexed_tag_keys = keys - self.source = source - self.offsets = HierarchicalOffsetIndex() - self.build_index() - - def __getstate__(self): - state = {} - state['indexed_tags'] = self.indexed_tags - state['indexed_tag_keys'] = self.indexed_tag_keys - state['offsets'] = self.offsets - return state - - def __setstate__(self, state): - self.indexed_tags = state['indexed_tags'] - self.indexed_tag_keys = state['indexed_tag_keys'] - self.offsets = state['offsets'] - - def __getitem__(self, key): - return self.offsets[key] - - def build_index(self): - """ - Perform the byte offset index building for :py:attr:`source`.
- - Returns - ------- - offsets: defaultdict - The hierarchical offset, stored in offsets - """ - scanner = self._scanner_class(self.source, self.indexed_tags) - self.offsets = scanner.build_byte_index(self.indexed_tag_keys) - return self.offsets - - def items(self): - return self.offsets.items() - - def keys(self): - return self.offsets.keys() - - def __iter__(self): - return iter(self.keys()) - - def __len__(self): - return sum(len(group) for key, group in self.items()) - - @classmethod - def build(cls, source, indexed_tags=None, keys=None): - indexer = cls(source, indexed_tags, keys) - return indexer.offsets - - -def ensure_bytes_single(string): - if isinstance(string, bytes): - return string - try: - return string.encode('utf-8') - except (AttributeError, UnicodeEncodeError): - raise PyteomicsError('{!r} could not be encoded'.format(string)) - - -def ensure_bytes(strings): - if isinstance(strings, basestring): - strings = [strings] - return [ensure_bytes_single(string) for string in strings] - - -def _flatten_map(hierarchical_map): - all_records = [] - for key, records in hierarchical_map.items(): - all_records.extend(records.items()) - - all_records.sort(key=lambda x: x[1]) - return OrderedDict(all_records) - - -class IndexedXML(IndexedReaderMixin, XML): - """Subclass of :py:class:`XML` which uses an index of byte offsets for some - elements for quick random access. - """ - _indexed_tags = set() - _indexed_tag_keys = {} - _use_index = True - - def __init__(self, source, read_schema=False, iterative=True, build_id_cache=False, - use_index=None, *args, **kwargs): - """Create an indexed XML parser object. - - Parameters - ---------- - source : str or file - File name or file-like object corresponding to an XML file. - read_schema : bool, optional - Defines whether schema file referenced in the file header - should be used to extract information about value conversion. - Default is :py:const:`False`. - iterative : bool, optional - Defines whether an :py:class:`ElementTree` object should be - constructed and stored on the instance or if iterative parsing - should be used instead. Iterative parsing keeps the memory usage - low for large XML files. Default is :py:const:`True`. - use_index : bool, optional - Defines whether an index of byte offsets needs to be created for - elements listed in `indexed_tags`. - This is useful for random access to spectra in mzML or elements of mzIdentML files, - or for iterative parsing of mzIdentML with ``retrieve_refs=True``. - If :py:const:`True`, `build_id_cache` is ignored. - If :py:const:`False`, the object acts exactly like :py:class:`XML`. - Default is :py:const:`True`. - indexed_tags : container of bytes, optional - If `use_index` is :py:const:`True`, elements listed in this parameter - will be indexed. Empty set by default. - """ - tags = kwargs.get('indexed_tags') - tag_index_keys = kwargs.get('indexed_tag_keys') - - if tags is not None: - self._indexed_tags = tags - if tag_index_keys is not None: - self._indexed_tag_keys = tag_index_keys - - if use_index is not None: - self._use_index = use_index - - if use_index: - build_id_cache = False - if self._default_iter_path and self._default_iter_path != self._default_iter_tag: - warnings.warn('_default_iter_path differs from _default_iter_tag and index is enabled. 
' - '_default_iter_tag will be used in the index, mind the consequences.') - super(IndexedXML, self).__init__(source, read_schema, iterative, build_id_cache, *args, **kwargs) - - self._offset_index = None - self._build_index() - - @property - def default_index(self): - return self._offset_index[self._default_iter_tag] - - def __reduce_ex__(self, protocol): - reconstructor, args, state = XML.__reduce_ex__(self, protocol) - args = args + (False, ) - return reconstructor, args, state - - def __getstate__(self): - state = super(IndexedXML, self).__getstate__() - state['_indexed_tags'] = self._indexed_tags - state['_indexed_tag_keys'] = self._indexed_tag_keys - state['_use_index'] = self._use_index - state['_offset_index'] = self._offset_index - return state - - def __setstate__(self, state): - super(IndexedXML, self).__setstate__(state) - self._indexed_tags = state['_indexed_tags'] - self._indexed_tag_keys = state['_indexed_tag_keys'] - self._use_index = state['_use_index'] - self._offset_index = state['_offset_index'] - - @_keepstate - def _build_index(self): - """ - Build up a `dict` of `dict` of offsets for elements. Calls :func:`find_index_list` - on :attr:`_source` and assigns the return value to :attr:`_offset_index` - """ - if not self._indexed_tags or not self._use_index: - return - self._offset_index = TagSpecificXMLByteIndex.build( - self._source, self._indexed_tags, self._indexed_tag_keys) - - @_keepstate - def _find_by_id_reset(self, elem_id, id_key=None): - return self._find_by_id_no_reset(elem_id, id_key=id_key) - - @_keepstate - def get_by_id(self, elem_id, id_key=None, element_type=None, **kwargs): - """ - Retrieve the requested entity by its id. If the entity - is a spectrum described in the offset index, it will be retrieved - by immediately seeking to the starting position of the entry, otherwise - falling back to parsing from the start of the file. - - Parameters - ---------- - elem_id : str - The id value of the entity to retrieve. - id_key : str, optional - The name of the XML attribute to use for lookup. - Defaults to :py:attr:`self._default_id_attr`. - - Returns - ------- - dict - """ - try: - index = self._offset_index - if element_type is None: - offset, element_type = index.find_no_type(elem_id) - else: - offset = index.find(elem_id, element_type) - self._source.seek(offset) - if id_key is None: - id_key = self._indexed_tag_keys.get(element_type) - elem = self._find_by_id_no_reset(elem_id, id_key=id_key) - except (KeyError, AttributeError, etree.LxmlError): - elem = self._find_by_id_reset(elem_id, id_key=id_key) - data = self._get_info_smart(elem, **kwargs) - return data - - def __contains__(self, key): - return key in self._offset_index[self._default_iter_tag] - - def __len__(self): - return len(self._offset_index[self._default_iter_tag]) - - def iterfind(self, path, **kwargs): - """Parse the XML and yield info on elements with specified local - name or by specified "XPath". - - Parameters - ---------- - path : str - Element name or XPath-like expression. The path is very close to - full XPath syntax, but local names should be used for all elements in the path. - They will be substituted with local-name() checks, up to the (first) predicate. - The path can be absolute or "free". Please don't specify namespaces. - **kwargs : passed to :py:meth:`self._get_info_smart`. 
- - Returns - ------- - out : iterator - """ - if path in self._indexed_tags and self._use_index: - return IndexedIterfind(self, path, **kwargs) - return Iterfind(self, path, **kwargs) - - -class MultiProcessingXML(IndexedXML, TaskMappingMixin): - """XML reader that feeds indexes to external processes - for parallel parsing and analysis of XML entries.""" - - def _task_map_iterator(self): - """Returns the :class:`Iterable` to use when dealing out work items onto the input IPC - queue used by :meth:`map` - - Returns - ------- - :class:`Iterable` - """ - return iter(self._offset_index[self._default_iter_tag]) - - -class IndexSavingXML(IndexSavingMixin, IndexedXML): - """An extension to the IndexedXML type which - adds facilities to read and write the byte offset - index externally. - """ - _index_class = HierarchicalOffsetIndex - - def _read_byte_offsets(self): - """Read the byte offset index JSON file at :attr:`_byte_offset_filename` - and populate :attr:`_offset_index` - """ - with open(self._byte_offset_filename, 'r') as f: - index = self._index_class.load(f) - if index.schema_version is None: - raise TypeError("Legacy Offset Index!") - self._offset_index = index - - -class Iterfind(object): - def __init__(self, parser, tag_name, **kwargs): - self.parser = parser - self.tag_name = tag_name - self.config = kwargs - self._iterator = None - - def __repr__(self): - template = "{self.__class__.__name__}({self.tag_name!r}{config})" - if self.config: - config = ", " + repr(self.config) - else: - config = '' - return template.format(self=self, config=config) - - def __iter__(self): - return self - - def _make_iterator(self): - return self.parser._iterfind_impl(self.tag_name, **self.config) - - def __next__(self): - if self._iterator is None: - self._iterator = self._make_iterator() - return next(self._iterator) - - def next(self): - return self.__next__() - - @property - def is_indexed(self): - return False - - def reset(self): - self._iterator = None - self.parser.reset() - - def __enter__(self): - return self - - def __exit__(self, *args, **kwargs): - self.reset() - - def map(self, *args, **kwargs): - raise NotImplementedError("This query isn't indexed, it cannot be mapped with multiprocessing") - - def _get_by_index(self, idx): - self.reset() - value = next(islice(self, idx, idx + 1)) - return value - - def _get_by_slice(self, slc): - self.reset() - value = list(islice(self, slc.start, slc.stop, slc.step)) - return value - - def __getitem__(self, i): - if isinstance(i, slice): - return self._get_by_slice(i) - return self._get_by_index(i) - - -class IndexedIterfind(TaskMappingMixin, Iterfind): - - def __init__(self, parser, tag_name, **kwargs): - TaskMappingMixin.__init__(self, **kwargs) - Iterfind.__init__(self, parser, tag_name, **kwargs) - - def _task_map_iterator(self): - """Returns the :class:`Iterable` to use when dealing out work items onto the input IPC - queue used by :meth:`map` - - Returns - ------- - :class:`Iterable` - """ - return iter(self._index) - - @property - def _offset_index(self): - return self._index - - @property - def _index(self): - return self.parser.index[self.tag_name] - - def _get_reader_for_worker_spec(self): - return self.parser - - def _yield_from_index(self): - for key in self._task_map_iterator(): - yield self.parser.get_by_id(key, **self.config) - - def _make_iterator(self): - if self.is_indexed: - return self._yield_from_index() - warnings.warn("Non-indexed iterator created from %r" % (self, )) - return super(IndexedIterfind, self)._make_iterator() - -
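# A minimal, self-contained sketch (illustrative names, not pyteomics code) of
# the access pattern Iterfind implements above: a non-indexed query supports
# integer and slice indexing by re-running the underlying iterator and consuming
# it up to the requested position with itertools.islice -- linear-time access
# that needs no byte offset index.
from itertools import islice

class LazyQuery:
    def __init__(self, make_iterator):
        self._make_iterator = make_iterator  # zero-argument iterator factory

    def __iter__(self):
        return self._make_iterator()

    def __getitem__(self, i):
        if isinstance(i, slice):
            return list(islice(iter(self), i.start, i.stop, i.step))
        return next(islice(iter(self), i, i + 1))

q = LazyQuery(lambda: (x * x for x in range(10)))
assert q[3] == 9 and q[2:5] == [4, 9, 16]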
@property - def is_indexed(self): - if hasattr(self.parser, 'index'): - if self.parser.index is not None: - index = self.parser.index - if isinstance(index, HierarchicalOffsetIndex): - return bool(self.tag_name in index and index[self.tag_name]) - return False - - def _get_by_index(self, idx): - index = self._index - key = index.from_index(idx) - return self.parser.get_by_id(key) - - def _get_by_slice(self, slc): - index = self._index - keys = index.from_slice(slc) - return self.parser.get_by_ids(keys) - - def __len__(self): - index = self._index - return len(index) diff --git a/seaborn/__init__.py b/seaborn/__init__.py deleted file mode 100644 index d1ca9754d31183f7bbad2dad059edf8e3a46aa4a..0000000000000000000000000000000000000000 --- a/seaborn/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Import seaborn objects -from .rcmod import * # noqa: F401,F403 -from .utils import * # noqa: F401,F403 -from .palettes import * # noqa: F401,F403 -from .relational import * # noqa: F401,F403 -from .regression import * # noqa: F401,F403 -from .categorical import * # noqa: F401,F403 -from .distributions import * # noqa: F401,F403 -from .matrix import * # noqa: F401,F403 -from .miscplot import * # noqa: F401,F403 -from .axisgrid import * # noqa: F401,F403 -from .widgets import * # noqa: F401,F403 -from .colors import xkcd_rgb, crayons # noqa: F401 -from . import cm # noqa: F401 - -# Capture the original matplotlib rcParams -import matplotlib as mpl -_orig_rc_params = mpl.rcParams.copy() - -# Define the seaborn version -__version__ = "0.14.0.dev0" diff --git a/seaborn/_base.py b/seaborn/_base.py deleted file mode 100644 index 0b43523193782ec89ca1f2f573da01683fdaae6f..0000000000000000000000000000000000000000 --- a/seaborn/_base.py +++ /dev/null @@ -1,1777 +0,0 @@ -from __future__ import annotations -import warnings -import itertools -from copy import copy -from collections import UserString -from collections.abc import Iterable, Sequence, Mapping -from numbers import Number -from datetime import datetime - -import numpy as np -import pandas as pd -import matplotlib as mpl - -from seaborn._core.data import PlotData -from seaborn.palettes import ( - QUAL_PALETTES, - color_palette, -) -from seaborn.utils import ( - _check_argument, - _version_predates, - desaturate, - locator_to_legend_entries, - get_color_cycle, - remove_na, -) - - -class SemanticMapping: - """Base class for mapping data values to plot attributes.""" - - # -- Default attributes that all SemanticMapping subclasses must set - - # Whether the mapping is numeric, categorical, or datetime - map_type: str | None = None - - # Ordered list of unique values in the input data - levels = None - - # A mapping from the data values to corresponding plot attributes - lookup_table = None - - def __init__(self, plotter): - - # TODO Putting this here so we can continue to use a lot of the - # logic that's built into the library, but the idea of this class - # is to move towards semantic mappings that are agnostic about the - # kind of plot they're going to be used to draw. - # Fully achieving that is going to take some thinking. - self.plotter = plotter - - def _check_list_length(self, levels, values, variable): - """Input check when values are provided as a list.""" - # Copied from _core/properties; eventually will be replaced for that. - message = "" - if len(levels) > len(values): - message = " ".join([ - f"\nThe {variable} list has fewer values ({len(values)})", - f"than needed ({len(levels)}) and will cycle, which may", - "produce an uninterpretable plot." 
- ]) - values = [x for _, x in zip(levels, itertools.cycle(values))] - - elif len(values) > len(levels): - message = " ".join([ - f"The {variable} list has more values ({len(values)})", - f"than needed ({len(levels)}), which may not be intended.", - ]) - values = values[:len(levels)] - - if message: - warnings.warn(message, UserWarning, stacklevel=6) - - return values - - def _lookup_single(self, key): - """Apply the mapping to a single data value.""" - return self.lookup_table[key] - - def __call__(self, key, *args, **kwargs): - """Get the attribute(s) values for the data key.""" - if isinstance(key, (list, np.ndarray, pd.Series)): - return [self._lookup_single(k, *args, **kwargs) for k in key] - else: - return self._lookup_single(key, *args, **kwargs) - - -class HueMapping(SemanticMapping): - """Mapping that sets artist colors according to data values.""" - # A specification of the colors that should appear in the plot - palette = None - - # An object that normalizes data values to [0, 1] range for color mapping - norm = None - - # A continuous colormap object for interpolating in a numeric context - cmap = None - - def __init__( - self, plotter, palette=None, order=None, norm=None, saturation=1, - ): - """Map the levels of the `hue` variable to distinct colors. - - Parameters - ---------- - # TODO add generic parameters - - """ - super().__init__(plotter) - - data = plotter.plot_data.get("hue", pd.Series(dtype=float)) - - if isinstance(palette, np.ndarray): - msg = ( - "Numpy array is not a supported type for `palette`. " - "Please convert your palette to a list. " - "This will become an error in v0.14" - ) - warnings.warn(msg, stacklevel=4) - palette = palette.tolist() - - if data.isna().all(): - if palette is not None: - msg = "Ignoring `palette` because no `hue` variable has been assigned." - warnings.warn(msg, stacklevel=4) - else: - - map_type = self.infer_map_type( - palette, norm, plotter.input_format, plotter.var_types["hue"] - ) - - # Our goal is to end up with a dictionary mapping every unique - # value in `data` to a color. 
We will also keep track of the - # metadata about this mapping we will need for, e.g., a legend - - # --- Option 1: numeric mapping with a matplotlib colormap - - if map_type == "numeric": - - data = pd.to_numeric(data) - levels, lookup_table, norm, cmap = self.numeric_mapping( - data, palette, norm, - ) - - # --- Option 2: categorical mapping using seaborn palette - - elif map_type == "categorical": - - cmap = norm = None - levels, lookup_table = self.categorical_mapping( - data, palette, order, - ) - - # --- Option 3: datetime mapping - - else: - # TODO this needs actual implementation - cmap = norm = None - levels, lookup_table = self.categorical_mapping( - # Casting data to list to handle differences in the way - # pandas and numpy represent datetime64 data - list(data), palette, order, - ) - - self.saturation = saturation - self.map_type = map_type - self.lookup_table = lookup_table - self.palette = palette - self.levels = levels - self.norm = norm - self.cmap = cmap - - def _lookup_single(self, key): - """Get the color for a single value, using colormap to interpolate.""" - try: - # Use a value that's in the original data vector - value = self.lookup_table[key] - except KeyError: - - if self.norm is None: - # Currently we only get here in scatterplot with hue_order, - # because scatterplot does not consider hue a grouping variable - # So unused hue levels are in the data, but not the lookup table - return (0, 0, 0, 0) - - # Use the colormap to interpolate between existing datapoints - # (e.g. in the context of making a continuous legend) - try: - normed = self.norm(key) - except TypeError as err: - if np.isnan(key): - value = (0, 0, 0, 0) - else: - raise err - else: - if np.ma.is_masked(normed): - normed = np.nan - value = self.cmap(normed) - - if self.saturation < 1: - value = desaturate(value, self.saturation) - - return value - - def infer_map_type(self, palette, norm, input_format, var_type): - """Determine how to implement the mapping.""" - if palette in QUAL_PALETTES: - map_type = "categorical" - elif norm is not None: - map_type = "numeric" - elif isinstance(palette, (dict, list)): - map_type = "categorical" - elif input_format == "wide": - map_type = "categorical" - else: - map_type = var_type - - return map_type - - def categorical_mapping(self, data, palette, order): - """Determine colors when the hue mapping is categorical.""" - # -- Identify the order and name of the levels - - levels = categorical_order(data, order) - n_colors = len(levels) - - # -- Identify the set of colors to use - - if isinstance(palette, dict): - - missing = set(levels) - set(palette) - if any(missing): - err = "The palette dictionary is missing keys: {}" - raise ValueError(err.format(missing)) - - lookup_table = palette - - else: - - if palette is None: - if n_colors <= len(get_color_cycle()): - colors = color_palette(None, n_colors) - else: - colors = color_palette("husl", n_colors) - elif isinstance(palette, list): - colors = self._check_list_length(levels, palette, "palette") - else: - colors = color_palette(palette, n_colors) - - lookup_table = dict(zip(levels, colors)) - - return levels, lookup_table - - def numeric_mapping(self, data, palette, norm): - """Determine colors when the hue variable is quantitative.""" - if isinstance(palette, dict): - - # The presence of a norm object overrides a dictionary of hues - # in specifying a numeric mapping, so we need to process it here. 
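# A minimal sketch (illustrative values, not seaborn code) of the numeric hue
# machinery described above: data values are normalized into [0, 1] and passed
# through a colormap, which is also how values absent from the data (e.g. ticks
# on a continuous legend) receive interpolated colors.
import numpy as np
import matplotlib as mpl

levels = np.array([1.0, 2.0, 5.0, 10.0])
norm = mpl.colors.Normalize()
norm(levels)  # first call autoscales vmin/vmax to the observed range
cmap = mpl.colormaps["viridis"]
lookup_table = dict(zip(levels.tolist(), map(tuple, cmap(norm(levels)))))
in_between = cmap(norm(3.5))  # interpolated RGBA for a value not in the data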
- levels = list(sorted(palette)) - colors = [palette[k] for k in sorted(palette)] - cmap = mpl.colors.ListedColormap(colors) - lookup_table = palette.copy() - - else: - - # The levels are the sorted unique values in the data - levels = list(np.sort(remove_na(data.unique()))) - - # --- Sort out the colormap to use from the palette argument - - # Default numeric palette is our default cubehelix palette - # TODO do we want to do something complicated to ensure contrast? - palette = "ch:" if palette is None else palette - - if isinstance(palette, mpl.colors.Colormap): - cmap = palette - else: - cmap = color_palette(palette, as_cmap=True) - - # Now sort out the data normalization - if norm is None: - norm = mpl.colors.Normalize() - elif isinstance(norm, tuple): - norm = mpl.colors.Normalize(*norm) - elif not isinstance(norm, mpl.colors.Normalize): - err = "``hue_norm`` must be None, tuple, or Normalize object." - raise ValueError(err) - - if not norm.scaled(): - norm(np.asarray(data.dropna())) - - lookup_table = dict(zip(levels, cmap(norm(levels)))) - - return levels, lookup_table, norm, cmap - - -class SizeMapping(SemanticMapping): - """Mapping that sets artist sizes according to data values.""" - # An object that normalizes data values to [0, 1] range - norm = None - - def __init__( - self, plotter, sizes=None, order=None, norm=None, - ): - """Map the levels of the `size` variable to distinct values. - - Parameters - ---------- - # TODO add generic parameters - - """ - super().__init__(plotter) - - data = plotter.plot_data.get("size", pd.Series(dtype=float)) - - if data.notna().any(): - - map_type = self.infer_map_type( - norm, sizes, plotter.var_types["size"] - ) - - # --- Option 1: numeric mapping - - if map_type == "numeric": - - levels, lookup_table, norm, size_range = self.numeric_mapping( - data, sizes, norm, - ) - - # --- Option 2: categorical mapping - - elif map_type == "categorical": - - levels, lookup_table = self.categorical_mapping( - data, sizes, order, - ) - size_range = None - - # --- Option 3: datetime mapping - - # TODO this needs an actual implementation - else: - - levels, lookup_table = self.categorical_mapping( - # Casting data to list to handle differences in the way - # pandas and numpy represent datetime64 data - list(data), sizes, order, - ) - size_range = None - - self.map_type = map_type - self.levels = levels - self.norm = norm - self.sizes = sizes - self.size_range = size_range - self.lookup_table = lookup_table - - def infer_map_type(self, norm, sizes, var_type): - - if norm is not None: - map_type = "numeric" - elif isinstance(sizes, (dict, list)): - map_type = "categorical" - else: - map_type = var_type - - return map_type - - def _lookup_single(self, key): - - try: - value = self.lookup_table[key] - except KeyError: - normed = self.norm(key) - if np.ma.is_masked(normed): - normed = np.nan - value = self.size_range[0] + normed * np.ptp(self.size_range) - return value - - def categorical_mapping(self, data, sizes, order): - - levels = categorical_order(data, order) - - if isinstance(sizes, dict): - - # Dict inputs map existing data values to the size attribute - missing = set(levels) - set(sizes) - if any(missing): - err = f"Missing sizes for the following levels: {missing}" - raise ValueError(err) - lookup_table = sizes.copy() - - elif isinstance(sizes, list): - - # List inputs give size values in the same order as the levels - sizes = self._check_list_length(levels, sizes, "sizes") - lookup_table = dict(zip(levels, sizes)) - - else: - - if isinstance(sizes, 
tuple): - - # Tuple input sets the min, max size values - if len(sizes) != 2: - err = "A `sizes` tuple must have only 2 values" - raise ValueError(err) - - elif sizes is not None: - - err = f"Value for `sizes` not understood: {sizes}" - raise ValueError(err) - - else: - - # Otherwise, we need to get the min, max size values from - # the plotter object we are attached to. - - # TODO this is going to cause us trouble later, because we - # want to restructure things so that the plotter is generic - # across the visual representation of the data. But at this - # point, we don't know the visual representation. Likely we - # want to change the logic of this Mapping so that it gives - # points on a normalized range that then gets un-normalized - # when we know what we're drawing. But given the way the - # package works now, this way is cleanest. - sizes = self.plotter._default_size_range - - # For categorical sizes, use regularly-spaced linear steps - # between the minimum and maximum sizes. Then reverse the - # ramp so that the largest value is used for the first entry - # in size_order, etc. This is because "ordered" categories - # are often thought to go in decreasing priority. - sizes = np.linspace(*sizes, len(levels))[::-1] - lookup_table = dict(zip(levels, sizes)) - - return levels, lookup_table - - def numeric_mapping(self, data, sizes, norm): - - if isinstance(sizes, dict): - # The presence of a norm object overrides a dictionary of sizes - # in specifying a numeric mapping, so we need to process the - # dictionary here - levels = list(np.sort(list(sizes))) - size_values = sizes.values() - size_range = min(size_values), max(size_values) - - else: - - # The levels here will be the unique values in the data - levels = list(np.sort(remove_na(data.unique()))) - - if isinstance(sizes, tuple): - - # For numeric inputs, the size can be parametrized by - # the minimum and maximum artist values to map to. The - # norm object that gets set up next specifies how to - # do the mapping. - - if len(sizes) != 2: - err = "A `sizes` tuple must have only 2 values" - raise ValueError(err) - - size_range = sizes - - elif sizes is not None: - - err = f"Value for `sizes` not understood: {sizes}" - raise ValueError(err) - - else: - - # When not provided, we get the size range from the plotter - # object we are attached to. See the note in the categorical - # method about how this is suboptimal for future development. - size_range = self.plotter._default_size_range - - # Now that we know the minimum and maximum sizes that will get drawn, - # we need to map the data values that we have into that range. We will - # use a matplotlib Normalize class, which is typically used for numeric - # color mapping but works fine here too. It takes data values and maps - # them into a [0, 1] interval, potentially nonlinearly.
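# A standalone sketch (illustrative names and values, not seaborn's) of the
# arithmetic the following block performs: clip-normalize the data into [0, 1],
# then linearly interpolate into the artist size range.
import numpy as np
import matplotlib as mpl

data = np.array([3.0, 8.0, 15.0, 40.0])
size_range = (20.0, 200.0)  # e.g. min/max scatter marker area

norm = mpl.colors.Normalize()
norm.clip = True             # out-of-range values saturate instead of escaping
scaled = norm(data)          # autoscales to the data min/max on first call
lo, hi = size_range
sizes = lo + scaled * (hi - lo)  # smallest datum -> 20.0, largest -> 200.0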
- - if norm is None: - # Default is a linear function between the min and max data values - norm = mpl.colors.Normalize() - elif isinstance(norm, tuple): - # It is also possible to give different limits in data space - norm = mpl.colors.Normalize(*norm) - elif not isinstance(norm, mpl.colors.Normalize): - err = f"Value for size `norm` parameter not understood: {norm}" - raise ValueError(err) - else: - # If provided with Normalize object, copy it so we can modify - norm = copy(norm) - - # Set the mapping so all output values are in [0, 1] - norm.clip = True - - # If the input range is not set, use the full range of the data - if not norm.scaled(): - norm(levels) - - # Map from data values to [0, 1] range - sizes_scaled = norm(levels) - - # Now map from the scaled range into the artist units - if isinstance(sizes, dict): - lookup_table = sizes - else: - lo, hi = size_range - sizes = lo + sizes_scaled * (hi - lo) - lookup_table = dict(zip(levels, sizes)) - - return levels, lookup_table, norm, size_range - - -class StyleMapping(SemanticMapping): - """Mapping that sets artist style according to data values.""" - - # Style mapping is always treated as categorical - map_type = "categorical" - - def __init__(self, plotter, markers=None, dashes=None, order=None): - """Map the levels of the `style` variable to distinct values. - - Parameters - ---------- - # TODO add generic parameters - - """ - super().__init__(plotter) - - data = plotter.plot_data.get("style", pd.Series(dtype=float)) - - if data.notna().any(): - - # Cast to list to handle numpy/pandas datetime quirks - if variable_type(data) == "datetime": - data = list(data) - - # Find ordered unique values - levels = categorical_order(data, order) - - markers = self._map_attributes( - markers, levels, unique_markers(len(levels)), "markers", - ) - dashes = self._map_attributes( - dashes, levels, unique_dashes(len(levels)), "dashes", - ) - - # Build the paths matplotlib will use to draw the markers - paths = {} - filled_markers = [] - for k, m in markers.items(): - if not isinstance(m, mpl.markers.MarkerStyle): - m = mpl.markers.MarkerStyle(m) - paths[k] = m.get_path().transformed(m.get_transform()) - filled_markers.append(m.is_filled()) - - # Mixture of filled and unfilled markers will show line art markers - # in the edge color, which defaults to white. This can be handled, - # but there would be additional complexity with specifying the - # weight of the line art markers without overwhelming the filled - # ones with the edges. So for now, we will disallow mixtures. 
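# A brief illustration (not seaborn code) of the marker machinery used above:
# matplotlib's MarkerStyle resolves any marker spec to a vector Path and reports
# whether it is filled, which is what enables both the path-based drawing and
# the filled/line-art mixture check that follows.
import matplotlib as mpl

m = mpl.markers.MarkerStyle("o")
path = m.get_path().transformed(m.get_transform())   # Path in marker coordinates
assert m.is_filled()                                 # "o" is a filled marker
assert not mpl.markers.MarkerStyle("+").is_filled()  # "+" is line art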
- if any(filled_markers) and not all(filled_markers): - err = "Filled and line art markers cannot be mixed" - raise ValueError(err) - - lookup_table = {} - for key in levels: - lookup_table[key] = {} - if markers: - lookup_table[key]["marker"] = markers[key] - lookup_table[key]["path"] = paths[key] - if dashes: - lookup_table[key]["dashes"] = dashes[key] - - self.levels = levels - self.lookup_table = lookup_table - - def _lookup_single(self, key, attr=None): - """Get attribute(s) for a given data point.""" - if attr is None: - value = self.lookup_table[key] - else: - value = self.lookup_table[key][attr] - return value - - def _map_attributes(self, arg, levels, defaults, attr): - """Handle the specification for a given style attribute.""" - if arg is True: - lookup_table = dict(zip(levels, defaults)) - elif isinstance(arg, dict): - missing = set(levels) - set(arg) - if missing: - err = f"These `{attr}` levels are missing values: {missing}" - raise ValueError(err) - lookup_table = arg - elif isinstance(arg, Sequence): - arg = self._check_list_length(levels, arg, attr) - lookup_table = dict(zip(levels, arg)) - elif arg: - err = f"This `{attr}` argument was not understood: {arg}" - raise ValueError(err) - else: - lookup_table = {} - - return lookup_table - - -# =========================================================================== # - - -class VectorPlotter: - """Base class for objects underlying *plot functions.""" - - wide_structure = { - "x": "@index", "y": "@values", "hue": "@columns", "style": "@columns", - } - flat_structure = {"x": "@index", "y": "@values"} - - _default_size_range = 1, 2 # Unused but needed in tests, ugh - - def __init__(self, data=None, variables={}): - - self._var_levels = {} - # var_ordered is relevant only for categorical axis variables, and may - # be better handled by an internal axis information object that tracks - # such information and is set up by the scale_* methods. The analogous - # information for numeric axes would be information about log scales. - self._var_ordered = {"x": False, "y": False} # alt., used DefaultDict - self.assign_variables(data, variables) - - # TODO Lots of tests assume that these are called to initialize the - # mappings to default values on class initialization. I'd prefer to - # move away from that and only have a mapping when explicitly called. - for var in ["hue", "size", "style"]: - if var in variables: - getattr(self, f"map_{var}")() - - @property - def has_xy_data(self): - """Return True if at least one of x or y is defined.""" - return bool({"x", "y"} & set(self.variables)) - - @property - def var_levels(self): - """Property interface to ordered list of variable levels. - - Each time it's accessed, it updates the var_levels dictionary with the - list of levels in the current semantic mappers. But it also allows the - dictionary to persist, so it can be used to set levels by a key. This is - used to track the list of col/row levels using an attached FacetGrid - object, but it's kind of messy and ideally fixed by improving the - faceting logic so it interfaces better with the modern approach to - tracking plot variables.
- - """ - for var in self.variables: - if (map_obj := getattr(self, f"_{var}_map", None)) is not None: - self._var_levels[var] = map_obj.levels - return self._var_levels - - def assign_variables(self, data=None, variables={}): - """Define plot variables, optionally using lookup from `data`.""" - x = variables.get("x", None) - y = variables.get("y", None) - - if x is None and y is None: - self.input_format = "wide" - frame, names = self._assign_variables_wideform(data, **variables) - else: - # When dealing with long-form input, use the newer PlotData - # object (internal but introduced for the objects interface) - # to centralize / standardize data consumption logic. - self.input_format = "long" - plot_data = PlotData(data, variables) - frame = plot_data.frame - names = plot_data.names - - self.plot_data = frame - self.variables = names - self.var_types = { - v: variable_type( - frame[v], - boolean_type="numeric" if v in "xy" else "categorical" - ) - for v in names - } - - return self - - def _assign_variables_wideform(self, data=None, **kwargs): - """Define plot variables given wide-form data. - - Parameters - ---------- - data : flat vector or collection of vectors - Data can be a vector or mapping that is coerceable to a Series - or a sequence- or mapping-based collection of such vectors, or a - rectangular numpy array, or a Pandas DataFrame. - kwargs : variable -> data mappings - Behavior with keyword arguments is currently undefined. - - Returns - ------- - plot_data : :class:`pandas.DataFrame` - Long-form data object mapping seaborn variables (x, y, hue, ...) - to data vectors. - variables : dict - Keys are defined seaborn variables; values are names inferred from - the inputs (or None when no name can be determined). - - """ - # Raise if semantic or other variables are assigned in wide-form mode - assigned = [k for k, v in kwargs.items() if v is not None] - if any(assigned): - s = "s" if len(assigned) > 1 else "" - err = f"The following variable{s} cannot be assigned with wide-form data: " - err += ", ".join(f"`{v}`" for v in assigned) - raise ValueError(err) - - # Determine if the data object actually has any data in it - empty = data is None or not len(data) - - # Then, determine if we have "flat" data (a single vector) - if isinstance(data, dict): - values = data.values() - else: - values = np.atleast_1d(np.asarray(data, dtype=object)) - flat = not any( - isinstance(v, Iterable) and not isinstance(v, (str, bytes)) - for v in values - ) - - if empty: - - # Make an object with the structure of plot_data, but empty - plot_data = pd.DataFrame() - variables = {} - - elif flat: - - # Handle flat data by converting to pandas Series and using the - # index and/or values to define x and/or y - # (Could be accomplished with a more general to_series() interface) - flat_data = pd.Series(data).copy() - names = { - "@values": flat_data.name, - "@index": flat_data.index.name - } - - plot_data = {} - variables = {} - - for var in ["x", "y"]: - if var in self.flat_structure: - attr = self.flat_structure[var] - plot_data[var] = getattr(flat_data, attr[1:]) - variables[var] = names[self.flat_structure[var]] - - plot_data = pd.DataFrame(plot_data) - - else: - - # Otherwise assume we have some collection of vectors. - - # Handle Python sequences such that entries end up in the columns, - # not in the rows, of the intermediate wide DataFrame. - # One way to accomplish this is to convert to a dict of Series. 
- if isinstance(data, Sequence): - data_dict = {} - for i, var in enumerate(data): - key = getattr(var, "name", i) - # TODO is there a safer/more generic way to ensure Series? - # sort of like np.asarray, but for pandas? - data_dict[key] = pd.Series(var) - - data = data_dict - - # Pandas requires that dict values either be Series objects - # or all have the same length, but we want to allow "ragged" inputs - if isinstance(data, Mapping): - data = {key: pd.Series(val) for key, val in data.items()} - - # Otherwise, delegate to the pandas DataFrame constructor - # This is where we'd prefer to use a general interface that says - # "give me this data as a pandas DataFrame", so we can accept - # DataFrame objects from other libraries - wide_data = pd.DataFrame(data, copy=True) - - # At this point we should reduce the dataframe to numeric cols - numeric_cols = [ - k for k, v in wide_data.items() if variable_type(v) == "numeric" - ] - wide_data = wide_data[numeric_cols] - - # Now melt the data to long form - melt_kws = {"var_name": "@columns", "value_name": "@values"} - use_index = "@index" in self.wide_structure.values() - if use_index: - melt_kws["id_vars"] = "@index" - try: - orig_categories = wide_data.columns.categories - orig_ordered = wide_data.columns.ordered - wide_data.columns = wide_data.columns.add_categories("@index") - except AttributeError: - category_columns = False - else: - category_columns = True - wide_data["@index"] = wide_data.index.to_series() - - plot_data = wide_data.melt(**melt_kws) - - if use_index and category_columns: - plot_data["@columns"] = pd.Categorical(plot_data["@columns"], - orig_categories, - orig_ordered) - - # Assign names corresponding to plot semantics - for var, attr in self.wide_structure.items(): - plot_data[var] = plot_data[attr] - - # Define the variable names - variables = {} - for var, attr in self.wide_structure.items(): - obj = getattr(wide_data, attr[1:]) - variables[var] = getattr(obj, "name", None) - - # Remove redundant columns from plot_data - plot_data = plot_data[list(variables)] - - return plot_data, variables - - def map_hue(self, palette=None, order=None, norm=None, saturation=1): - mapping = HueMapping(self, palette, order, norm, saturation) - self._hue_map = mapping - - def map_size(self, sizes=None, order=None, norm=None): - mapping = SizeMapping(self, sizes, order, norm) - self._size_map = mapping - - def map_style(self, markers=None, dashes=None, order=None): - mapping = StyleMapping(self, markers, dashes, order) - self._style_map = mapping - - def iter_data( - self, grouping_vars=None, *, - reverse=False, from_comp_data=False, - by_facet=True, allow_empty=False, dropna=True, - ): - """Generator for getting subsets of data defined by semantic variables. - - Also injects "col" and "row" into grouping semantics. - - Parameters - ---------- - grouping_vars : string or list of strings - Semantic variables that define the subsets of data. - reverse : bool - If True, reverse the order of iteration. - from_comp_data : bool - If True, use self.comp_data rather than self.plot_data - by_facet : bool - If True, add faceting variables to the set of grouping variables. - allow_empty : bool - If True, yield an empty dataframe when no observations exist for - combinations of grouping variables. - dropna : bool - If True, remove rows with missing data. - - Yields - ------ - sub_vars : dict - Keys are semantic names, values are the level of that semantic. 
- sub_data : :class:`pandas.DataFrame` - Subset of ``plot_data`` for this combination of semantic values. - - """ - # TODO should this default to using all (non x/y?) semantics? - # or define grouping vars somewhere? - if grouping_vars is None: - grouping_vars = [] - elif isinstance(grouping_vars, str): - grouping_vars = [grouping_vars] - elif isinstance(grouping_vars, tuple): - grouping_vars = list(grouping_vars) - - # Always insert faceting variables - if by_facet: - facet_vars = {"col", "row"} - grouping_vars.extend( - facet_vars & set(self.variables) - set(grouping_vars) - ) - - # Reduce to the semantics used in this plot - grouping_vars = [var for var in grouping_vars if var in self.variables] - - if from_comp_data: - data = self.comp_data - else: - data = self.plot_data - - if dropna: - data = data.dropna() - - levels = self.var_levels.copy() - if from_comp_data: - for axis in {"x", "y"} & set(grouping_vars): - converter = self.converters[axis].iloc[0] - if self.var_types[axis] == "categorical": - if self._var_ordered[axis]: - # If the axis is ordered, then the axes in a possible - # facet grid are by definition "shared", or there is a - # single axis with a unique cat -> idx mapping. - # So we can just take the first converter object. - levels[axis] = converter.convert_units(levels[axis]) - else: - # Otherwise, the mappings may not be unique, but we can - # use the unique set of index values in comp_data. - levels[axis] = np.sort(data[axis].unique()) - else: - transform = converter.get_transform().transform - levels[axis] = transform(converter.convert_units(levels[axis])) - - if grouping_vars: - - grouped_data = data.groupby( - grouping_vars, sort=False, as_index=False, observed=False, - ) - - grouping_keys = [] - for var in grouping_vars: - key = levels.get(var) - grouping_keys.append([] if key is None else key) - - iter_keys = itertools.product(*grouping_keys) - if reverse: - iter_keys = reversed(list(iter_keys)) - - for key in iter_keys: - - pd_key = ( - key[0] if len(key) == 1 and _version_predates(pd, "2.2.0") else key - ) - try: - data_subset = grouped_data.get_group(pd_key) - except KeyError: - # XXX we are adding this to allow backwards compatibility - # with the empty artists that old categorical plots would - # add (before 0.12), which we may decide to break, in which - # case this option could be removed - data_subset = data.loc[[]] - - if data_subset.empty and not allow_empty: - continue - - sub_vars = dict(zip(grouping_vars, key)) - - yield sub_vars, data_subset.copy() - - else: - - yield {}, data.copy() - - @property - def comp_data(self): - """Dataframe with numeric x and y, after unit conversion and log scaling.""" - if not hasattr(self, "ax"): - # Probably a good idea, but will need a bunch of tests updated - # Most of these tests should just use the external interface - # Then this can be re-enabled. 
- # raise AttributeError("No Axes attached to plotter") - return self.plot_data - - if not hasattr(self, "_comp_data"): - - comp_data = ( - self.plot_data - .copy(deep=False) - .drop(["x", "y"], axis=1, errors="ignore") - ) - - for var in "yx": - if var not in self.variables: - continue - - parts = [] - grouped = self.plot_data[var].groupby(self.converters[var], sort=False) - for converter, orig in grouped: - orig = orig.mask(orig.isin([np.inf, -np.inf]), np.nan) - orig = orig.dropna() - if var in self.var_levels: - # TODO this should happen in some centralized location - # it is similar to GH2419, but more complicated because - # supporting `order` in categorical plots is tricky - orig = orig[orig.isin(self.var_levels[var])] - comp = pd.to_numeric(converter.convert_units(orig)).astype(float) - transform = converter.get_transform().transform - parts.append(pd.Series(transform(comp), orig.index, name=orig.name)) - if parts: - comp_col = pd.concat(parts) - else: - comp_col = pd.Series(dtype=float, name=var) - comp_data.insert(0, var, comp_col) - - self._comp_data = comp_data - - return self._comp_data - - def _get_axes(self, sub_vars): - """Return an Axes object based on existence of row/col variables.""" - row = sub_vars.get("row", None) - col = sub_vars.get("col", None) - if row is not None and col is not None: - return self.facets.axes_dict[(row, col)] - elif row is not None: - return self.facets.axes_dict[row] - elif col is not None: - return self.facets.axes_dict[col] - elif self.ax is None: - return self.facets.ax - else: - return self.ax - - def _attach( - self, - obj, - allowed_types=None, - log_scale=None, - ): - """Associate the plotter with an Axes manager and initialize its units. - - Parameters - ---------- - obj : :class:`matplotlib.axes.Axes` or :class:`FacetGrid` - Structural object that we will eventually plot onto. - allowed_types : str or list of str - If provided, raise when either the x or y variable does not have - one of the declared seaborn types. - log_scale : bool, number, or pair of bools or numbers - If not False, set the axes to use log scaling, with the given - base or defaulting to 10. If a tuple, interpreted as separate - arguments for the x and y axes. - - """ - from .axisgrid import FacetGrid - if isinstance(obj, FacetGrid): - self.ax = None - self.facets = obj - ax_list = obj.axes.flatten() - if obj.col_names is not None: - self.var_levels["col"] = obj.col_names - if obj.row_names is not None: - self.var_levels["row"] = obj.row_names - else: - self.ax = obj - self.facets = None - ax_list = [obj] - - # Identify which "axis" variables we have defined - axis_variables = set("xy").intersection(self.variables) - - # -- Verify the types of our x and y variables here. - # This doesn't really make complete sense being here, but it's a fine - # place for it, given the current system. - # (Note that for some plots, there might be more complicated restrictions) - # e.g. the categorical plots have their own check that is specific to the - # non-categorical axis.
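# A small sketch (not seaborn code) of the two-step pipeline comp_data relies
# on, as described above: a matplotlib axis first converts raw values to numbers
# through its unit converter, then the axis scale's transform maps those numbers
# into plotting (e.g. log) space.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.set_yscale("log")
ax.xaxis.update_units(["a", "b", "c"])           # registers a category converter
nums = ax.xaxis.convert_units(["a", "b", "c"])   # -> array([0., 1., 2.])
tr = ax.yaxis.get_transform().transform
log_vals = tr([1.0, 10.0, 100.0])                # -> [0., 1., 2.] under log10
plt.close(fig)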
- if allowed_types is None: - allowed_types = ["numeric", "datetime", "categorical"] - elif isinstance(allowed_types, str): - allowed_types = [allowed_types] - - for var in axis_variables: - var_type = self.var_types[var] - if var_type not in allowed_types: - err = ( - f"The {var} variable is {var_type}, but one of " - f"{allowed_types} is required" - ) - raise TypeError(err) - - # -- Get axis objects for each row in plot_data for type conversions and scaling - - facet_dim = {"x": "col", "y": "row"} - - self.converters = {} - for var in axis_variables: - other_var = {"x": "y", "y": "x"}[var] - - converter = pd.Series(index=self.plot_data.index, name=var, dtype=object) - share_state = getattr(self.facets, f"_share{var}", True) - - # Simplest cases are that we have a single axes, all axes are shared, - # or sharing is only on the orthogonal facet dimension. In these cases, - # all datapoints get converted the same way, so use the first axis - if share_state is True or share_state == facet_dim[other_var]: - converter.loc[:] = getattr(ax_list[0], f"{var}axis") - - else: - - # Next simplest case is when no axes are shared, and we can - # use the axis objects within each facet - if share_state is False: - for axes_vars, axes_data in self.iter_data(): - ax = self._get_axes(axes_vars) - converter.loc[axes_data.index] = getattr(ax, f"{var}axis") - - # In the more complicated case, the axes are shared within each - # "file" of the facetgrid. In that case, we need to subset the data - # for that file and assign it the first axis in the slice of the grid - else: - - names = getattr(self.facets, f"{share_state}_names") - for i, level in enumerate(names): - idx = (i, 0) if share_state == "row" else (0, i) - axis = getattr(self.facets.axes[idx], f"{var}axis") - converter.loc[self.plot_data[share_state] == level] = axis - - # Store the converter vector, which we use elsewhere (e.g comp_data) - self.converters[var] = converter - - # Now actually update the matplotlib objects to do the conversion we want - grouped = self.plot_data[var].groupby(self.converters[var], sort=False) - for converter, seed_data in grouped: - if self.var_types[var] == "categorical": - if self._var_ordered[var]: - order = self.var_levels[var] - else: - order = None - seed_data = categorical_order(seed_data, order) - converter.update_units(seed_data) - - # -- Set numerical axis scales - - # First unpack the log_scale argument - if log_scale is None: - scalex = scaley = False - else: - # Allow single value or x, y tuple - try: - scalex, scaley = log_scale - except TypeError: - scalex = log_scale if self.var_types.get("x") == "numeric" else False - scaley = log_scale if self.var_types.get("y") == "numeric" else False - - # Now use it - for axis, scale in zip("xy", (scalex, scaley)): - if scale: - for ax in ax_list: - set_scale = getattr(ax, f"set_{axis}scale") - if scale is True: - set_scale("log", nonpositive="mask") - else: - set_scale("log", base=scale, nonpositive="mask") - - # For categorical y, we want the "first" level to be at the top of the axis - if self.var_types.get("y", None) == "categorical": - for ax in ax_list: - ax.yaxis.set_inverted(True) - - # TODO -- Add axes labels - - def _get_scale_transforms(self, axis): - """Return a function implementing the scale transform (or its inverse).""" - if self.ax is None: - axis_list = [getattr(ax, f"{axis}axis") for ax in self.facets.axes.flat] - scales = {axis.get_scale() for axis in axis_list} - if len(scales) > 1: - # It is a simplifying assumption that faceted axes will always 
have - # the same scale (even if they are unshared and have distinct limits). - # Nothing in the seaborn API allows you to create a FacetGrid with - # a mixture of scales, although it's possible via matplotlib. - # This is constraining, but no more so than previous behavior that - # only (properly) handled log scales, and there are some places where - # it would be much too complicated to use axes-specific transforms. - err = "Cannot determine transform with mixed scales on faceted axes." - raise RuntimeError(err) - transform_obj = axis_list[0].get_transform() - else: - # This case is more straightforward - transform_obj = getattr(self.ax, f"{axis}axis").get_transform() - - return transform_obj.transform, transform_obj.inverted().transform - - def _add_axis_labels(self, ax, default_x="", default_y=""): - """Add axis labels if not present, set visibility to match ticklabels.""" - # TODO ax could default to None and use attached axes if present - # but what to do about the case of facets? Currently using FacetGrid's - # set_axis_labels method, which doesn't add labels to the interior even - # when the axes are not shared. Maybe that makes sense? - if not ax.get_xlabel(): - x_visible = any(t.get_visible() for t in ax.get_xticklabels()) - ax.set_xlabel(self.variables.get("x", default_x), visible=x_visible) - if not ax.get_ylabel(): - y_visible = any(t.get_visible() for t in ax.get_yticklabels()) - ax.set_ylabel(self.variables.get("y", default_y), visible=y_visible) - - def add_legend_data( - self, ax, func, common_kws=None, attrs=None, semantic_kws=None, - ): - """Add labeled artists to represent the different plot semantics.""" - verbosity = self.legend - if isinstance(verbosity, str) and verbosity not in ["auto", "brief", "full"]: - err = "`legend` must be 'auto', 'brief', 'full', or a boolean." 
- raise ValueError(err) - elif verbosity is True: - verbosity = "auto" - - keys = [] - legend_kws = {} - common_kws = {} if common_kws is None else common_kws.copy() - semantic_kws = {} if semantic_kws is None else semantic_kws.copy() - - # Assign a legend title if there is only going to be one sub-legend, - # otherwise, subtitles will be inserted into the texts list with an - # invisible handle (which is a hack) - titles = { - title for title in - (self.variables.get(v, None) for v in ["hue", "size", "style"]) - if title is not None - } - title = "" if len(titles) != 1 else titles.pop() - title_kws = dict( - visible=False, color="w", s=0, linewidth=0, marker="", dashes="" - ) - - def update(var_name, val_name, **kws): - - key = var_name, val_name - if key in legend_kws: - legend_kws[key].update(**kws) - else: - keys.append(key) - legend_kws[key] = dict(**kws) - - if attrs is None: - attrs = {"hue": "color", "size": ["linewidth", "s"], "style": None} - for var, names in attrs.items(): - self._update_legend_data( - update, var, verbosity, title, title_kws, names, semantic_kws.get(var), - ) - - legend_data = {} - legend_order = [] - - # Don't allow color=None so we can set a neutral color for size/style legends - if common_kws.get("color", False) is None: - common_kws.pop("color") - - for key in keys: - - _, label = key - kws = legend_kws[key] - level_kws = {} - use_attrs = [ - *self._legend_attributes, - *common_kws, - *[attr for var_attrs in semantic_kws.values() for attr in var_attrs], - ] - for attr in use_attrs: - if attr in kws: - level_kws[attr] = kws[attr] - artist = func(label=label, **{"color": ".2", **common_kws, **level_kws}) - if _version_predates(mpl, "3.5.0"): - if isinstance(artist, mpl.lines.Line2D): - ax.add_line(artist) - elif isinstance(artist, mpl.patches.Patch): - ax.add_patch(artist) - elif isinstance(artist, mpl.collections.Collection): - ax.add_collection(artist) - else: - ax.add_artist(artist) - legend_data[key] = artist - legend_order.append(key) - - self.legend_title = title - self.legend_data = legend_data - self.legend_order = legend_order - - def _update_legend_data( - self, - update, - var, - verbosity, - title, - title_kws, - attr_names, - other_props, - ): - """Generate legend tick values and formatted labels.""" - brief_ticks = 6 - mapper = getattr(self, f"_{var}_map", None) - if mapper is None: - return - - brief = mapper.map_type == "numeric" and ( - verbosity == "brief" - or (verbosity == "auto" and len(mapper.levels) > brief_ticks) - ) - if brief: - if isinstance(mapper.norm, mpl.colors.LogNorm): - locator = mpl.ticker.LogLocator(numticks=brief_ticks) - else: - locator = mpl.ticker.MaxNLocator(nbins=brief_ticks) - limits = min(mapper.levels), max(mapper.levels) - levels, formatted_levels = locator_to_legend_entries( - locator, limits, self.plot_data[var].infer_objects().dtype - ) - elif mapper.levels is None: - levels = formatted_levels = [] - else: - levels = formatted_levels = mapper.levels - - if not title and self.variables.get(var, None) is not None: - update((self.variables[var], "title"), self.variables[var], **title_kws) - - other_props = {} if other_props is None else other_props - - for level, formatted_level in zip(levels, formatted_levels): - if level is not None: - attr = mapper(level) - if isinstance(attr_names, list): - attr = {name: attr for name in attr_names} - elif attr_names is not None: - attr = {attr_names: attr} - attr.update({k: v[level] for k, v in other_props.items() if level in v}) - update(self.variables[var], 
formatted_level, **attr) - - # XXX If the scale_* methods are going to modify the plot_data structure, they - # can't be called twice. That means that if they are called twice, they should - # raise. Alternatively, we could store an original version of plot_data and each - # time they are called they operate on the store, not the current state. - - def scale_native(self, axis, *args, **kwargs): - - # Default, defer to matplotlib - - raise NotImplementedError - - def scale_numeric(self, axis, *args, **kwargs): - - # Feels needed to completeness, what should it do? - # Perhaps handle log scaling? Set the ticker/formatter/limits? - - raise NotImplementedError - - def scale_datetime(self, axis, *args, **kwargs): - - # Use pd.to_datetime to convert strings or numbers to datetime objects - # Note, use day-resolution for numeric->datetime to match matplotlib - - raise NotImplementedError - - def scale_categorical(self, axis, order=None, formatter=None): - """ - Enforce categorical (fixed-scale) rules for the data on given axis. - - Parameters - ---------- - axis : "x" or "y" - Axis of the plot to operate on. - order : list - Order that unique values should appear in. - formatter : callable - Function mapping values to a string representation. - - Returns - ------- - self - - """ - # This method both modifies the internal representation of the data - # (converting it to string) and sets some attributes on self. It might be - # a good idea to have a separate object attached to self that contains the - # information in those attributes (i.e. whether to enforce variable order - # across facets, the order to use) similar to the SemanticMapping objects - # we have for semantic variables. That object could also hold the converter - # objects that get used, if we can decouple those from an existing axis - # (cf. https://github.com/matplotlib/matplotlib/issues/19229). - # There are some interactions with faceting information that would need - # to be thought through, since the converts to use depend on facets. - # If we go that route, these methods could become "borrowed" methods similar - # to what happens with the alternate semantic mapper constructors, although - # that approach is kind of fussy and confusing. - - # TODO this method could also set the grid state? Since we like to have no - # grid on the categorical axis by default. Again, a case where we'll need to - # store information until we use it, so best to have a way to collect the - # attributes that this method sets. - - # TODO if we are going to set visual properties of the axes with these methods, - # then we could do the steps currently in CategoricalPlotter._adjust_cat_axis - - # TODO another, and distinct idea, is to expose a cut= param here - - _check_argument("axis", ["x", "y"], axis) - - # Categorical plots can be "univariate" in which case they get an anonymous - # category label on the opposite axis. - if axis not in self.variables: - self.variables[axis] = None - self.var_types[axis] = "categorical" - self.plot_data[axis] = "" - - # If the "categorical" variable has a numeric type, sort the rows so that - # the default result from categorical_order has those values sorted after - # they have been coerced to strings. The reason for this is so that later - # we can get facet-wise orders that are correct. - # XXX Should this also sort datetimes? 
- # It feels more consistent, but technically will be a default change - # If so, should also change categorical_order to behave that way - if self.var_types[axis] == "numeric": - self.plot_data = self.plot_data.sort_values(axis, kind="mergesort") - - # Now get a reference to the categorical data vector and remove na values - cat_data = self.plot_data[axis].dropna() - - # Get the initial categorical order, which we do before string - # conversion to respect the original types of the order list. - # Track whether the order is given explicitly so that we can know - # whether or not to use the order constructed here downstream - self._var_ordered[axis] = order is not None or cat_data.dtype.name == "category" - order = pd.Index(categorical_order(cat_data, order), name=axis) - - # Then convert data to strings. This is because in matplotlib, - # "categorical" data really mean "string" data, so doing this artists - # will be drawn on the categorical axis with a fixed scale. - # TODO implement formatter here; check that it returns strings? - if formatter is not None: - cat_data = cat_data.map(formatter) - order = order.map(formatter) - else: - cat_data = cat_data.astype(str) - order = order.astype(str) - - # Update the levels list with the type-converted order variable - self.var_levels[axis] = order - - # Now ensure that seaborn will use categorical rules internally - self.var_types[axis] = "categorical" - - # Put the string-typed categorical vector back into the plot_data structure - self.plot_data[axis] = cat_data - - return self - - -class VariableType(UserString): - """ - Prevent comparisons elsewhere in the library from using the wrong name. - - Errors are simple assertions because users should not be able to trigger - them. If that changes, they should be more verbose. - - """ - # TODO we can replace this with typing.Literal on Python 3.8+ - allowed = "numeric", "datetime", "categorical" - - def __init__(self, data): - assert data in self.allowed, data - super().__init__(data) - - def __eq__(self, other): - assert other in self.allowed, other - return self.data == other - - -def variable_type(vector, boolean_type="numeric"): - """ - Determine whether a vector contains numeric, categorical, or datetime data. - - This function differs from the pandas typing API in two ways: - - - Python sequences or object-typed PyData objects are considered numeric if - all of their entries are numeric. - - String or mixed-type data are considered categorical even if not - explicitly represented as a :class:`pandas.api.types.CategoricalDtype`. - - Parameters - ---------- - vector : :func:`pandas.Series`, :func:`numpy.ndarray`, or Python sequence - Input data to test. - boolean_type : 'numeric' or 'categorical' - Type to use for vectors containing only 0s and 1s (and NAs). - - Returns - ------- - var_type : 'numeric', 'categorical', or 'datetime' - Name identifying the type of data in the vector. 
- """ - vector = pd.Series(vector) - - # If a categorical dtype is set, infer categorical - if isinstance(vector.dtype, pd.CategoricalDtype): - return VariableType("categorical") - - # Special-case all-na data, which is always "numeric" - if pd.isna(vector).all(): - return VariableType("numeric") - - # At this point, drop nans to simplify further type inference - vector = vector.dropna() - - # Special-case binary/boolean data, allow caller to determine - # This triggers a numpy warning when vector has strings/objects - # https://github.com/numpy/numpy/issues/6784 - # Because we reduce with .all(), we are agnostic about whether the - # comparison returns a scalar or vector, so we will ignore the warning. - # It triggers a separate DeprecationWarning when the vector has datetimes: - # https://github.com/numpy/numpy/issues/13548 - # This is considered a bug by numpy and will likely go away. - with warnings.catch_warnings(): - warnings.simplefilter( - action='ignore', category=(FutureWarning, DeprecationWarning) - ) - try: - if np.isin(vector, [0, 1]).all(): - return VariableType(boolean_type) - except TypeError: - # .isin comparison is not guaranteed to be possible under NumPy - # casting rules, depending on the (unknown) dtype of 'vector' - pass - - # Defer to positive pandas tests - if pd.api.types.is_numeric_dtype(vector): - return VariableType("numeric") - - if pd.api.types.is_datetime64_dtype(vector): - return VariableType("datetime") - - # --- If we get to here, we need to check the entries - - # Check for a collection where everything is a number - - def all_numeric(x): - for x_i in x: - if not isinstance(x_i, Number): - return False - return True - - if all_numeric(vector): - return VariableType("numeric") - - # Check for a collection where everything is a datetime - - def all_datetime(x): - for x_i in x: - if not isinstance(x_i, (datetime, np.datetime64)): - return False - return True - - if all_datetime(vector): - return VariableType("datetime") - - # Otherwise, our final fallback is to consider things categorical - - return VariableType("categorical") - - -def infer_orient(x=None, y=None, orient=None, require_numeric=True): - """Determine how the plot should be oriented based on the data. - - For historical reasons, the convention is to call a plot "horizontally" - or "vertically" oriented based on the axis representing its dependent - variable. Practically, this is used when determining the axis for - numerical aggregation. - - Parameters - ---------- - x, y : Vector data or None - Positional data vectors for the plot. - orient : string or None - Specified orientation. If not None, can be "x" or "y", or otherwise - must start with "v" or "h". - require_numeric : bool - If set, raise when the implied dependent variable is not numeric. - - Returns - ------- - orient : "x" or "y" - - Raises - ------ - ValueError: When `orient` is an unknown string. - TypeError: When dependent variable is not numeric, with `require_numeric` - - """ - - x_type = None if x is None else variable_type(x) - y_type = None if y is None else variable_type(y) - - nonnumeric_dv_error = "{} orientation requires numeric `{}` variable." - single_var_warning = "{} orientation ignored with only `{}` specified." 
- - if x is None: - if str(orient).startswith("h"): - warnings.warn(single_var_warning.format("Horizontal", "y")) - if require_numeric and y_type != "numeric": - raise TypeError(nonnumeric_dv_error.format("Vertical", "y")) - return "x" - - elif y is None: - if str(orient).startswith("v"): - warnings.warn(single_var_warning.format("Vertical", "x")) - if require_numeric and x_type != "numeric": - raise TypeError(nonnumeric_dv_error.format("Horizontal", "x")) - return "y" - - elif str(orient).startswith("v") or orient == "x": - if require_numeric and y_type != "numeric": - raise TypeError(nonnumeric_dv_error.format("Vertical", "y")) - return "x" - - elif str(orient).startswith("h") or orient == "y": - if require_numeric and x_type != "numeric": - raise TypeError(nonnumeric_dv_error.format("Horizontal", "x")) - return "y" - - elif orient is not None: - err = ( - "`orient` must start with 'v' or 'h' or be None, " - f"but `{repr(orient)}` was passed." - ) - raise ValueError(err) - - elif x_type != "categorical" and y_type == "categorical": - return "y" - - elif x_type != "numeric" and y_type == "numeric": - return "x" - - elif x_type == "numeric" and y_type != "numeric": - return "y" - - elif require_numeric and "numeric" not in (x_type, y_type): - err = "Neither the `x` nor `y` variable appears to be numeric." - raise TypeError(err) - - else: - return "x" - - -def unique_dashes(n): - """Build an arbitrarily long list of unique dash styles for lines. - - Parameters - ---------- - n : int - Number of unique dash specs to generate. - - Returns - ------- - dashes : list of strings or tuples - Valid arguments for the ``dashes`` parameter on - :class:`matplotlib.lines.Line2D`. The first spec is a solid - line (``""``), the remainder are sequences of long and short - dashes. - - """ - # Start with dash specs that are well distinguishable - dashes = [ - "", - (4, 1.5), - (1, 1), - (3, 1.25, 1.5, 1.25), - (5, 1, 1, 1), - ] - - # Now programmatically build as many as we need - p = 3 - while len(dashes) < n: - - # Take combinations of long and short dashes - a = itertools.combinations_with_replacement([3, 1.25], p) - b = itertools.combinations_with_replacement([4, 1], p) - - # Interleave the combinations, reversing one of the streams - segment_list = itertools.chain(*zip( - list(a)[1:-1][::-1], - list(b)[1:-1] - )) - - # Now insert the gaps - for segments in segment_list: - gap = min(segments) - spec = tuple(itertools.chain(*((seg, gap) for seg in segments))) - dashes.append(spec) - - p += 1 - - return dashes[:n] - - -def unique_markers(n): - """Build an arbitrarily long list of unique marker styles for points. - - Parameters - ---------- - n : int - Number of unique marker specs to generate. - - Returns - ------- - markers : list of string or tuples - Values for defining :class:`matplotlib.markers.MarkerStyle` objects. - All markers will be filled. - - """ - # Start with marker specs that are well distinguishable - markers = [ - "o", - "X", - (4, 0, 45), - "P", - (4, 0, 0), - (4, 1, 0), - "^", - (4, 1, 45), - "v", - ] - - # Now generate more from regular polygons of increasing order - s = 5 - while len(markers) < n: - a = 360 / (s + 1) / 2 - markers.extend([ - (s + 1, 1, a), - (s + 1, 0, a), - (s, 1, 0), - (s, 0, 0), - ]) - s += 1 - - # Convert to MarkerStyle object, using only exactly what we need - # markers = [mpl.markers.MarkerStyle(m) for m in markers[:n]] - - return markers[:n] - - -def categorical_order(vector, order=None): - """Return a list of unique data values. 
- - Determine an ordered list of levels in ``values``. - - Parameters - ---------- - vector : list, array, Categorical, or Series - Vector of "categorical" values - order : list-like, optional - Desired order of category levels to override the order determined - from the ``values`` object. - - Returns - ------- - order : list - Ordered list of category levels not including null values. - - """ - if order is None: - if hasattr(vector, "categories"): - order = vector.categories - else: - try: - order = vector.cat.categories - except (TypeError, AttributeError): - - order = pd.Series(vector).unique() - - if variable_type(vector) == "numeric": - order = np.sort(order) - - order = filter(pd.notnull, order) - return list(order) diff --git a/seaborn/_compat.py b/seaborn/_compat.py deleted file mode 100644 index bd2f0c12d3a254f09d2cf85a300d9ee177c75f00..0000000000000000000000000000000000000000 --- a/seaborn/_compat.py +++ /dev/null @@ -1,123 +0,0 @@ -from __future__ import annotations -from typing import Literal - -import numpy as np -import pandas as pd -import matplotlib as mpl -from matplotlib.figure import Figure -from seaborn.utils import _version_predates - - -def norm_from_scale(scale, norm): - """Produce a Normalize object given a Scale and min/max domain limits.""" - # This is an internal maplotlib function that simplifies things to access - # It is likely to become part of the matplotlib API at some point: - # https://github.com/matplotlib/matplotlib/issues/20329 - if isinstance(norm, mpl.colors.Normalize): - return norm - - if scale is None: - return None - - if norm is None: - vmin = vmax = None - else: - vmin, vmax = norm # TODO more helpful error if this fails? - - class ScaledNorm(mpl.colors.Normalize): - - def __call__(self, value, clip=None): - # From github.com/matplotlib/matplotlib/blob/v3.4.2/lib/matplotlib/colors.py - # See github.com/matplotlib/matplotlib/tree/v3.4.2/LICENSE - value, is_scalar = self.process_value(value) - self.autoscale_None(value) - if self.vmin > self.vmax: - raise ValueError("vmin must be less or equal to vmax") - if self.vmin == self.vmax: - return np.full_like(value, 0) - if clip is None: - clip = self.clip - if clip: - value = np.clip(value, self.vmin, self.vmax) - # ***** Seaborn changes start **** - t_value = self.transform(value).reshape(np.shape(value)) - t_vmin, t_vmax = self.transform([self.vmin, self.vmax]) - # ***** Seaborn changes end ***** - if not np.isfinite([t_vmin, t_vmax]).all(): - raise ValueError("Invalid vmin or vmax") - t_value -= t_vmin - t_value /= (t_vmax - t_vmin) - t_value = np.ma.masked_invalid(t_value, copy=False) - return t_value[0] if is_scalar else t_value - - new_norm = ScaledNorm(vmin, vmax) - new_norm.transform = scale.get_transform().transform - - return new_norm - - -def get_colormap(name): - """Handle changes to matplotlib colormap interface in 3.6.""" - try: - return mpl.colormaps[name] - except AttributeError: - return mpl.cm.get_cmap(name) - - -def register_colormap(name, cmap): - """Handle changes to matplotlib colormap interface in 3.6.""" - try: - if name not in mpl.colormaps: - mpl.colormaps.register(cmap, name=name) - except AttributeError: - mpl.cm.register_cmap(name, cmap) - - -def set_layout_engine( - fig: Figure, - engine: Literal["constrained", "compressed", "tight", "none"], -) -> None: - """Handle changes to auto layout engine interface in 3.6""" - if hasattr(fig, "set_layout_engine"): - fig.set_layout_engine(engine) - else: - # _version_predates(mpl, 3.6) - if engine == "tight": - 
fig.set_tight_layout(True) # type: ignore # predates typing - elif engine == "constrained": - fig.set_constrained_layout(True) # type: ignore - elif engine == "none": - fig.set_tight_layout(False) # type: ignore - fig.set_constrained_layout(False) # type: ignore - - -def get_layout_engine(fig: Figure) -> mpl.layout_engine.LayoutEngine | None: - """Handle changes to auto layout engine interface in 3.6""" - if hasattr(fig, "get_layout_engine"): - return fig.get_layout_engine() - else: - # _version_predates(mpl, 3.6) - return None - - -def share_axis(ax0, ax1, which): - """Handle changes to post-hoc axis sharing.""" - if _version_predates(mpl, "3.5"): - group = getattr(ax0, f"get_shared_{which}_axes")() - group.join(ax1, ax0) - else: - getattr(ax1, f"share{which}")(ax0) - - -def get_legend_handles(legend): - """Handle legendHandles attribute rename.""" - if _version_predates(mpl, "3.7"): - return legend.legendHandles - else: - return legend.legend_handles - - -def groupby_apply_include_groups(val): - if _version_predates(pd, "2.2.0"): - return {} - return {"include_groups": val} diff --git a/seaborn/_core/__init__.py b/seaborn/_core/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/seaborn/_core/data.py b/seaborn/_core/data.py deleted file mode 100644 index c17bfe95c55d738b4d4ef69bfb945b0647343163..0000000000000000000000000000000000000000 --- a/seaborn/_core/data.py +++ /dev/null @@ -1,319 +0,0 @@ -""" -Components for parsing variable assignments and internally representing plot data. -""" -from __future__ import annotations - -from collections.abc import Mapping, Sized -from typing import cast -import warnings - -import pandas as pd -from pandas import DataFrame - -from seaborn._core.typing import DataSource, VariableSpec, ColumnName -from seaborn.utils import _version_predates - - -class PlotData: - """ - Data table with plot variable schema and mapping to original names. - - Contains logic for parsing variable specification arguments and updating - the table with layer-specific data and/or mappings. - - Parameters - ---------- - data - Input data where variable names map to vector values. - variables - Keys are names of plot variables (x, y, ...) each value is one of: - - - name of a column (or index level, or dictionary entry) in `data` - - vector in any format that can construct a :class:`pandas.DataFrame` - - Attributes - ---------- - frame - Data table with column names having defined plot variables. - names - Dictionary mapping plot variable names to names in source data structure(s). - ids - Dictionary mapping plot variable names to unique data source identifiers. - - """ - frame: DataFrame - frames: dict[tuple, DataFrame] - names: dict[str, str | None] - ids: dict[str, str | int] - source_data: DataSource - source_vars: dict[str, VariableSpec] - - def __init__( - self, - data: DataSource, - variables: dict[str, VariableSpec], - ): - - data = handle_data_source(data) - frame, names, ids = self._assign_variables(data, variables) - - self.frame = frame - self.names = names - self.ids = ids - - # The reason we possibly have a dictionary of frames is to support the - # Plot.pair operation, post scaling, where each x/y variable needs its - # own frame. This feels pretty clumsy and there are a bunch of places in - # the client code with awkard if frame / elif frames constructions. - # It would be great to have a cleaner abstraction here. 
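A rough sketch of the schema this class maintains, using hypothetical column names; the commented results follow from the attribute descriptions in the docstring above rather than a recorded session:

import pandas as pd

source = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
data = PlotData(source, {"x": "a", "y": "b"})

list(data.frame.columns)   # ['x', 'y'] -- plot variable names, not source names
data.names                 # {'x': 'a', 'y': 'b'}
"x" in data                # True

# join() adds or replaces variables, inheriting the original source by default
layer = data.join(None, {"y": "a"})
layer.names                # {'x': 'a', 'y': 'a'}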
- self.frames = {} - - self.source_data = data - self.source_vars = variables - - def __contains__(self, key: str) -> bool: - """Boolean check on whether a variable is defined in this dataset.""" - if self.frame is None: - return any(key in df for df in self.frames.values()) - return key in self.frame - - def join( - self, - data: DataSource, - variables: dict[str, VariableSpec] | None, - ) -> PlotData: - """Add, replace, or drop variables and return as a new dataset.""" - # Inherit the original source of the upstream data by default - if data is None: - data = self.source_data - - # TODO allow `data` to be a function (that is called on the source data?) - - if not variables: - variables = self.source_vars - - # Passing var=None implies that we do not want that variable in this layer - disinherit = [k for k, v in variables.items() if v is None] - - # Create a new dataset with just the info passed here - new = PlotData(data, variables) - - # -- Update the inherited DataSource with this new information - - drop_cols = [k for k in self.frame if k in new.frame or k in disinherit] - parts = [self.frame.drop(columns=drop_cols), new.frame] - - # Because we are combining distinct columns, this is perhaps more - # naturally thought of as a "merge"/"join". But using concat because - # some simple testing suggests that it is marginally faster. - frame = pd.concat(parts, axis=1, sort=False, copy=False) - - names = {k: v for k, v in self.names.items() if k not in disinherit} - names.update(new.names) - - ids = {k: v for k, v in self.ids.items() if k not in disinherit} - ids.update(new.ids) - - new.frame = frame - new.names = names - new.ids = ids - - # Multiple chained operations should always inherit from the original object - new.source_data = self.source_data - new.source_vars = self.source_vars - - return new - - def _assign_variables( - self, - data: DataFrame | Mapping | None, - variables: dict[str, VariableSpec], - ) -> tuple[DataFrame, dict[str, str | None], dict[str, str | int]]: - """ - Assign values for plot variables given long-form data and/or vector inputs. - - Parameters - ---------- - data - Input data where variable names map to vector values. - variables - Keys are names of plot variables (x, y, ...) each value is one of: - - - name of a column (or index level, or dictionary entry) in `data` - - vector in any format that can construct a :class:`pandas.DataFrame` - - Returns - ------- - frame - Table mapping seaborn variables (x, y, color, ...) to data vectors. - names - Keys are defined seaborn variables; values are names inferred from - the inputs (or None when no name can be determined). - ids - Like the `names` dict, but `None` values are replaced by the `id()` - of the data object that defined the variable. - - Raises - ------ - TypeError - When data source is not a DataFrame or Mapping. - ValueError - When variables are strings that don't appear in `data`, or when they are - non-indexed vector datatypes that have a different length from `data`. 
- - """ - source_data: Mapping | DataFrame - frame: DataFrame - names: dict[str, str | None] - ids: dict[str, str | int] - - plot_data = {} - names = {} - ids = {} - - given_data = data is not None - if data is None: - # Data is optional; all variables can be defined as vectors - # But simplify downstream code by always having a usable source data object - source_data = {} - else: - source_data = data - - # Variables can also be extracted from the index of a DataFrame - if isinstance(source_data, pd.DataFrame): - index = source_data.index.to_frame().to_dict("series") - else: - index = {} - - for key, val in variables.items(): - - # Simply ignore variables with no specification - if val is None: - continue - - # Try to treat the argument as a key for the data collection. - # But be flexible about what can be used as a key. - # Usually it will be a string, but allow other hashables when - # taking from the main data object. Allow only strings to reference - # fields in the index, because otherwise there is too much ambiguity. - - # TODO this will be rendered unnecessary by the following pandas fix: - # https://github.com/pandas-dev/pandas/pull/41283 - try: - hash(val) - val_is_hashable = True - except TypeError: - val_is_hashable = False - - val_as_data_key = ( - # See https://github.com/pandas-dev/pandas/pull/41283 - # (isinstance(val, abc.Hashable) and val in source_data) - (val_is_hashable and val in source_data) - or (isinstance(val, str) and val in index) - ) - - if val_as_data_key: - val = cast(ColumnName, val) - if val in source_data: - plot_data[key] = source_data[val] - elif val in index: - plot_data[key] = index[val] - names[key] = ids[key] = str(val) - - elif isinstance(val, str): - - # This looks like a column name but, lookup failed. - - err = f"Could not interpret value `{val}` for `{key}`. " - if not given_data: - err += "Value is a string, but `data` was not passed." - else: - err += "An entry with this name does not appear in `data`." - raise ValueError(err) - - else: - - # Otherwise, assume the value somehow represents data - - # Ignore empty data structures - if isinstance(val, Sized) and len(val) == 0: - continue - - # If vector has no index, it must match length of data table - if isinstance(data, pd.DataFrame) and not isinstance(val, pd.Series): - if isinstance(val, Sized) and len(data) != len(val): - val_cls = val.__class__.__name__ - err = ( - f"Length of {val_cls} vectors must match length of `data`" - f" when both are used, but `data` has length {len(data)}" - f" and the vector passed to `{key}` has length {len(val)}." - ) - raise ValueError(err) - - plot_data[key] = val - - # Try to infer the original name using pandas-like metadata - if hasattr(val, "name"): - names[key] = ids[key] = str(val.name) # type: ignore # mypy/1424 - else: - names[key] = None - ids[key] = id(val) - - # Construct a tidy plot DataFrame. This will convert a number of - # types automatically, aligning on index in case of pandas objects - # TODO Note: this fails when variable specs *only* have scalars! - frame = pd.DataFrame(plot_data) - - return frame, names, ids - - -def handle_data_source(data: object) -> pd.DataFrame | Mapping | None: - """Convert the data source object to a common union representation.""" - if isinstance(data, pd.DataFrame) or hasattr(data, "__dataframe__"): - # Check for pd.DataFrame inheritance could be removed once - # minimal pandas version supports dataframe interchange (1.5.0). 
- data = convert_dataframe_to_pandas(data) - elif data is not None and not isinstance(data, Mapping): - err = f"Data source must be a DataFrame or Mapping, not {type(data)!r}." - raise TypeError(err) - - return data - - -def convert_dataframe_to_pandas(data: object) -> pd.DataFrame: - """Use the DataFrame exchange protocol, or fail gracefully.""" - if isinstance(data, pd.DataFrame): - return data - - if not hasattr(pd.api, "interchange"): - msg = ( - "Support for non-pandas DataFrame objects requires a version of pandas " - "that implements the DataFrame interchange protocol. Please upgrade " - "your pandas version or coerce your data to pandas before passing " - "it to seaborn." - ) - raise TypeError(msg) - - if _version_predates(pd, "2.0.2"): - msg = ( - "DataFrame interchange with pandas<2.0.2 has some known issues. " - f"You are using pandas {pd.__version__}. " - "Continuing, but it is recommended to carefully inspect the results and to " - "consider upgrading." - ) - warnings.warn(msg, stacklevel=2) - - try: - # This is going to convert all columns in the input dataframe, even though - # we may only need one or two of them. It would be more efficient to select - # the columns that are going to be used in the plot prior to interchange. - # Solving that in general is a hard problem, especially with the objects - # interface where variables passed in Plot() may only be referenced later - # in Plot.add(). But noting here in case this seems to be a bottleneck. - return pd.api.interchange.from_dataframe(data) - except Exception as err: - msg = ( - "Encountered an exception when converting data source " - "to a pandas DataFrame. See traceback above for details." - ) - raise RuntimeError(msg) from err diff --git a/seaborn/_core/exceptions.py b/seaborn/_core/exceptions.py deleted file mode 100644 index 048443b0f8639e2e90a635c74e6202ae62e3ca8b..0000000000000000000000000000000000000000 --- a/seaborn/_core/exceptions.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Custom exceptions for the seaborn.objects interface. - -This is very lightweight, but it's a separate module to avoid circular imports. - -""" -from __future__ import annotations - - -class PlotSpecError(RuntimeError): - """ - Error class raised from seaborn.objects.Plot for compile-time failures. - - In the declarative Plot interface, exceptions may not be triggered immediately - by bad user input (and validation at input time may not be possible). This class - is used to signal that indirect dependency. It should be raised in an exception - chain when compile-time operations fail with an error message providing useful - context (e.g., scaling errors could specify the variable that failed.) - - """ - @classmethod - def _during(cls, step: str, var: str = "") -> PlotSpecError: - """ - Initialize the class to report the failure of a specific operation. 
- """ - message = [] - if var: - message.append(f"{step} failed for the `{var}` variable.") - else: - message.append(f"{step} failed.") - message.append("See the traceback above for more information.") - return cls(" ".join(message)) diff --git a/seaborn/_core/groupby.py b/seaborn/_core/groupby.py deleted file mode 100644 index cb63c670d29e8be63514d08ec4cbbfdfd0d79c46..0000000000000000000000000000000000000000 --- a/seaborn/_core/groupby.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Simplified split-apply-combine paradigm on dataframes for internal use.""" -from __future__ import annotations - -from typing import cast, Iterable - -import pandas as pd - -from seaborn._core.rules import categorical_order - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from typing import Callable - from pandas import DataFrame, MultiIndex, Index - - -class GroupBy: - """ - Interface for Pandas GroupBy operations allowing specified group order. - - Writing our own class to do this has a few advantages: - - It constrains the interface between Plot and Stat/Move objects - - It allows control over the row order of the GroupBy result, which is - important when using in the context of some Move operations (dodge, stack, ...) - - It simplifies some complexities regarding the return type and Index contents - one encounters with Pandas, especially for DataFrame -> DataFrame applies - - It increases future flexibility regarding alternate DataFrame libraries - - """ - def __init__(self, order: list[str] | dict[str, list | None]): - """ - Initialize the GroupBy from grouping variables and optional level orders. - - Parameters - ---------- - order - List of variable names or dict mapping names to desired level orders. - Level order values can be None to use default ordering rules. The - variables can include names that are not expected to appear in the - data; these will be dropped before the groups are defined. - - """ - if not order: - raise ValueError("GroupBy requires at least one grouping variable") - - if isinstance(order, list): - order = {k: None for k in order} - self.order = order - - def _get_groups( - self, data: DataFrame - ) -> tuple[str | list[str], Index | MultiIndex]: - """Return index with Cartesian product of ordered grouping variable levels.""" - levels = {} - for var, order in self.order.items(): - if var in data: - if order is None: - order = categorical_order(data[var]) - levels[var] = order - - grouper: str | list[str] - groups: Index | MultiIndex - if not levels: - grouper = [] - groups = pd.Index([]) - elif len(levels) > 1: - grouper = list(levels) - groups = pd.MultiIndex.from_product(levels.values(), names=grouper) - else: - grouper, = list(levels) - groups = pd.Index(levels[grouper], name=grouper) - return grouper, groups - - def _reorder_columns(self, res, data): - """Reorder result columns to match original order with new columns appended.""" - cols = [c for c in data if c in res] - cols += [c for c in res if c not in data] - return res.reindex(columns=pd.Index(cols)) - - def agg(self, data: DataFrame, *args, **kwargs) -> DataFrame: - """ - Reduce each group to a single row in the output. - - The output will have a row for each unique combination of the grouping - variable levels with null values for the aggregated variable(s) where - those combinations do not appear in the dataset. 
- - """ - grouper, groups = self._get_groups(data) - - if not grouper: - # We will need to see whether there are valid usecases that end up here - raise ValueError("No grouping variables are present in dataframe") - - res = ( - data - .groupby(grouper, sort=False, observed=False) - .agg(*args, **kwargs) - .reindex(groups) - .reset_index() - .pipe(self._reorder_columns, data) - ) - - return res - - def apply( - self, data: DataFrame, func: Callable[..., DataFrame], - *args, **kwargs, - ) -> DataFrame: - """Apply a DataFrame -> DataFrame mapping to each group.""" - grouper, groups = self._get_groups(data) - - if not grouper: - return self._reorder_columns(func(data, *args, **kwargs), data) - - parts = {} - for key, part_df in data.groupby(grouper, sort=False, observed=False): - parts[key] = func(part_df, *args, **kwargs) - stack = [] - for key in groups: - if key in parts: - if isinstance(grouper, list): - # Implies that we had a MultiIndex so key is iterable - group_ids = dict(zip(grouper, cast(Iterable, key))) - else: - group_ids = {grouper: key} - stack.append(parts[key].assign(**group_ids)) - - res = pd.concat(stack, ignore_index=True) - return self._reorder_columns(res, data) diff --git a/seaborn/_core/moves.py b/seaborn/_core/moves.py deleted file mode 100644 index 179926e71789bb6a6891aa21d80ee38696f89236..0000000000000000000000000000000000000000 --- a/seaborn/_core/moves.py +++ /dev/null @@ -1,274 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass -from typing import ClassVar, Callable, Optional, Union, cast - -import numpy as np -from pandas import DataFrame - -from seaborn._core.groupby import GroupBy -from seaborn._core.scales import Scale -from seaborn._core.typing import Default - -default = Default() - - -@dataclass -class Move: - """Base class for objects that apply simple positional transforms.""" - - group_by_orient: ClassVar[bool] = True - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - raise NotImplementedError - - -@dataclass -class Jitter(Move): - """ - Random displacement along one or both axes to reduce overplotting. - - Parameters - ---------- - width : float - Magnitude of jitter, relative to mark width, along the orientation axis. - If not provided, the default value will be 0 when `x` or `y` are set, otherwise - there will be a small amount of jitter applied by default. - x : float - Magnitude of jitter, in data units, along the x axis. - y : float - Magnitude of jitter, in data units, along the y axis. - - Examples - -------- - .. include:: ../docstrings/objects.Jitter.rst - - """ - width: float | Default = default - x: float = 0 - y: float = 0 - seed: int | None = None - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - data = data.copy() - rng = np.random.default_rng(self.seed) - - def jitter(data, col, scale): - noise = rng.uniform(-.5, +.5, len(data)) - offsets = noise * scale - return data[col] + offsets - - if self.width is default: - width = 0.0 if self.x or self.y else 0.2 - else: - width = cast(float, self.width) - - if self.width: - data[orient] = jitter(data, orient, width * data["width"]) - if self.x: - data["x"] = jitter(data, "x", self.x) - if self.y: - data["y"] = jitter(data, "y", self.y) - - return data - - -@dataclass -class Dodge(Move): - """ - Displacement and narrowing of overlapping marks along orientation axis. 
- - Parameters - ---------- - empty : {'keep', 'drop', 'fill'} - gap : float - Size of gap between dodged marks. - by : list of variable names - Variables to apply the movement to, otherwise use all. - - Examples - -------- - .. include:: ../docstrings/objects.Dodge.rst - - """ - empty: str = "keep" # Options: keep, drop, fill - gap: float = 0 - - # TODO accept just a str here? - # TODO should this always be present? - # TODO should the default be an "all" singleton? - by: Optional[list[str]] = None - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - grouping_vars = [v for v in groupby.order if v in data] - groups = groupby.agg(data, {"width": "max"}) - if self.empty == "fill": - groups = groups.dropna() - - def groupby_pos(s): - grouper = [groups[v] for v in [orient, "col", "row"] if v in data] - return s.groupby(grouper, sort=False, observed=True) - - def scale_widths(w): - # TODO what value to fill missing widths??? Hard problem... - # TODO short circuit this if outer widths has no variance? - empty = 0 if self.empty == "fill" else w.mean() - filled = w.fillna(empty) - scale = filled.max() - norm = filled.sum() - if self.empty == "keep": - w = filled - return w / norm * scale - - def widths_to_offsets(w): - return w.shift(1).fillna(0).cumsum() + (w - w.sum()) / 2 - - new_widths = groupby_pos(groups["width"]).transform(scale_widths) - offsets = groupby_pos(new_widths).transform(widths_to_offsets) - - if self.gap: - new_widths *= 1 - self.gap - - groups["_dodged"] = groups[orient] + offsets - groups["width"] = new_widths - - out = ( - data - .drop("width", axis=1) - .merge(groups, on=grouping_vars, how="left") - .drop(orient, axis=1) - .rename(columns={"_dodged": orient}) - ) - - return out - - -@dataclass -class Stack(Move): - """ - Displacement of overlapping bar or area marks along the value axis. - - Examples - -------- - .. include:: ../docstrings/objects.Stack.rst - - """ - # TODO center? (or should this be a different move, eg. Stream()) - - def _stack(self, df, orient): - - # TODO should stack do something with ymin/ymax style marks? - # Should there be an upstream conversion to baseline/height parameterization? - - if df["baseline"].nunique() > 1: - err = "Stack move cannot be used when baselines are already heterogeneous" - raise RuntimeError(err) - - other = {"x": "y", "y": "x"}[orient] - stacked_lengths = (df[other] - df["baseline"]).dropna().cumsum() - offsets = stacked_lengths.shift(1).fillna(0) - - df[other] = stacked_lengths - df["baseline"] = df["baseline"] + offsets - - return df - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - # TODO where to ensure that other semantic variables are sorted properly? - # TODO why are we not using the passed in groupby here? - groupers = ["col", "row", orient] - return GroupBy(groupers).apply(data, self._stack, orient) - - -@dataclass -class Shift(Move): - """ - Displacement of all marks with the same magnitude / direction. - - Parameters - ---------- - x, y : float - Magnitude of shift, in data units, along each axis. - - Examples - -------- - .. 
include:: ../docstrings/objects.Shift.rst - - """ - x: float = 0 - y: float = 0 - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - data = data.copy(deep=False) - data["x"] = data["x"] + self.x - data["y"] = data["y"] + self.y - return data - - -@dataclass -class Norm(Move): - """ - Divisive scaling on the value axis after aggregating within groups. - - Parameters - ---------- - func : str or callable - Function called on each group to define the comparison value. - where : str - Query string defining the subset used to define the comparison values. - by : list of variables - Variables used to define aggregation groups. - percent : bool - If True, multiply the result by 100. - - Examples - -------- - .. include:: ../docstrings/objects.Norm.rst - - """ - - func: Union[Callable, str] = "max" - where: Optional[str] = None - by: Optional[list[str]] = None - percent: bool = False - - group_by_orient: ClassVar[bool] = False - - def _norm(self, df, var): - - if self.where is None: - denom_data = df[var] - else: - denom_data = df.query(self.where)[var] - df[var] = df[var] / denom_data.agg(self.func) - - if self.percent: - df[var] = df[var] * 100 - - return df - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - other = {"x": "y", "y": "x"}[orient] - return groupby.apply(data, self._norm, other) - - -# TODO -# @dataclass -# class Ridge(Move): -# ... diff --git a/seaborn/_core/plot.py b/seaborn/_core/plot.py deleted file mode 100644 index 14348e357fed9242c314530c88daf360cd9a69bd..0000000000000000000000000000000000000000 --- a/seaborn/_core/plot.py +++ /dev/null @@ -1,1830 +0,0 @@ -"""The classes for specifying and compiling a declarative visualization.""" -from __future__ import annotations - -import io -import os -import re -import inspect -import itertools -import textwrap -from contextlib import contextmanager -from collections import abc -from collections.abc import Callable, Generator -from typing import Any, List, Literal, Optional, cast -from xml.etree import ElementTree - -from cycler import cycler -import pandas as pd -from pandas import DataFrame, Series, Index -import matplotlib as mpl -from matplotlib.axes import Axes -from matplotlib.artist import Artist -from matplotlib.figure import Figure -import numpy as np -from PIL import Image - -from seaborn._marks.base import Mark -from seaborn._stats.base import Stat -from seaborn._core.data import PlotData -from seaborn._core.moves import Move -from seaborn._core.scales import Scale -from seaborn._core.subplots import Subplots -from seaborn._core.groupby import GroupBy -from seaborn._core.properties import PROPERTIES, Property -from seaborn._core.typing import ( - DataSource, - VariableSpec, - VariableSpecList, - OrderSpec, - Default, -) -from seaborn._core.exceptions import PlotSpecError -from seaborn._core.rules import categorical_order -from seaborn._compat import get_layout_engine, set_layout_engine -from seaborn.utils import _version_predates -from seaborn.rcmod import axes_style, plotting_context -from seaborn.palettes import color_palette - -from typing import TYPE_CHECKING, TypedDict -if TYPE_CHECKING: - from matplotlib.figure import SubFigure - - -default = Default() - - -# ---- Definitions for internal specs ---------------------------------------------- # - - -class Layer(TypedDict, total=False): - - mark: Mark # TODO allow list? - stat: Stat | None # TODO allow list? 
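The Move subclasses defined above share one calling convention: each takes a dataframe of already-scaled coordinates plus a GroupBy and returns a transformed copy. A minimal sketch with hypothetical data; the commented values follow from the Shift and Norm definitions above:

import pandas as pd
from seaborn._core.groupby import GroupBy

coords = pd.DataFrame({"x": [0, 0, 1], "y": [1.0, 3.0, 2.0]})

# Shift: constant displacement, in data units, along each axis
Shift(y=1)(coords, GroupBy(["x"]), "x", {})["y"].tolist()  # [2.0, 4.0, 3.0]

# Norm: divide the value axis by a per-group statistic (here, the group max)
Norm(func="max", percent=True)(coords, GroupBy(["x"]), "x", {})["y"].tolist()
# [33.33..., 100.0, 100.0] -- normalized within the x == 0 and x == 1 groups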
- move: Move | list[Move] | None - data: PlotData - source: DataSource - vars: dict[str, VariableSpec] - orient: str - legend: bool - label: str | None - - -class FacetSpec(TypedDict, total=False): - - variables: dict[str, VariableSpec] - structure: dict[str, list[str]] - wrap: int | None - - -class PairSpec(TypedDict, total=False): - - variables: dict[str, VariableSpec] - structure: dict[str, list[str]] - cross: bool - wrap: int | None - - -# --- Local helpers ---------------------------------------------------------------- # - - -@contextmanager -def theme_context(params: dict[str, Any]) -> Generator: - """Temporarily modify specifc matplotlib rcParams.""" - orig_params = {k: mpl.rcParams[k] for k in params} - color_codes = "bgrmyck" - nice_colors = [*color_palette("deep6"), (.15, .15, .15)] - orig_colors = [mpl.colors.colorConverter.colors[x] for x in color_codes] - # TODO how to allow this to reflect the color cycle when relevant? - try: - mpl.rcParams.update(params) - for (code, color) in zip(color_codes, nice_colors): - mpl.colors.colorConverter.colors[code] = color - yield - finally: - mpl.rcParams.update(orig_params) - for (code, color) in zip(color_codes, orig_colors): - mpl.colors.colorConverter.colors[code] = color - - -def build_plot_signature(cls): - """ - Decorator function for giving Plot a useful signature. - - Currently this mostly saves us some duplicated typing, but we would - like eventually to have a way of registering new semantic properties, - at which point dynamic signature generation would become more important. - - """ - sig = inspect.signature(cls) - params = [ - inspect.Parameter("args", inspect.Parameter.VAR_POSITIONAL), - inspect.Parameter("data", inspect.Parameter.KEYWORD_ONLY, default=None) - ] - params.extend([ - inspect.Parameter(name, inspect.Parameter.KEYWORD_ONLY, default=None) - for name in PROPERTIES - ]) - new_sig = sig.replace(parameters=params) - cls.__signature__ = new_sig - - known_properties = textwrap.fill( - ", ".join([f"|{p}|" for p in PROPERTIES]), - width=78, subsequent_indent=" " * 8, - ) - - if cls.__doc__ is not None: # support python -OO mode - cls.__doc__ = cls.__doc__.format(known_properties=known_properties) - - return cls - - -# ---- Plot configuration ---------------------------------------------------------- # - - -class ThemeConfig(mpl.RcParams): - """ - Configuration object for the Plot.theme, using matplotlib rc parameters. 
- """ - THEME_GROUPS = [ - "axes", "figure", "font", "grid", "hatch", "legend", "lines", - "mathtext", "markers", "patch", "savefig", "scatter", - "xaxis", "xtick", "yaxis", "ytick", - ] - - def __init__(self): - super().__init__() - self.reset() - - @property - def _default(self) -> dict[str, Any]: - - return { - **self._filter_params(mpl.rcParamsDefault), - **axes_style("darkgrid"), - **plotting_context("notebook"), - "axes.prop_cycle": cycler("color", color_palette("deep")), - } - - def reset(self) -> None: - """Update the theme dictionary with seaborn's default values.""" - self.update(self._default) - - def update(self, other: dict[str, Any] | None = None, /, **kwds): - """Update the theme with a dictionary or keyword arguments of rc parameters.""" - if other is not None: - theme = self._filter_params(other) - else: - theme = {} - theme.update(kwds) - super().update(theme) - - def _filter_params(self, params: dict[str, Any]) -> dict[str, Any]: - """Restruct to thematic rc params.""" - return { - k: v for k, v in params.items() - if any(k.startswith(p) for p in self.THEME_GROUPS) - } - - def _html_table(self, params: dict[str, Any]) -> list[str]: - - lines = ["<table>"] - for k, v in params.items(): - row = f"<tr><td>{k}:</td><td style='text-align:left'>{v!r}</td></tr>" - lines.append(row) - lines.append("</table>") - return lines - - def _repr_html_(self) -> str: - - repr = [ - "<div style='height: 300px'>", - "<div style='border-style: inset; border-width: 2px'>", - *self._html_table(self), - "</div>", - "</div>", - ] - return "\n".join(repr) - - -class DisplayConfig(TypedDict): - """Configuration for IPython's rich display hooks.""" - format: Literal["png", "svg"] - scaling: float - hidpi: bool - - -class PlotConfig: - """Configuration for default behavior / appearance of class:`Plot` instances.""" - def __init__(self): - - self._theme = ThemeConfig() - self._display = {"format": "png", "scaling": .85, "hidpi": True} - - @property - def theme(self) -> dict[str, Any]: - """ - Dictionary of base theme parameters for :class:`Plot`. - - Keys and values correspond to matplotlib rc params, as documented here: - https://matplotlib.org/stable/tutorials/introductory/customizing.html - - """ - return self._theme - - @property - def display(self) -> DisplayConfig: - """ - Dictionary of parameters for rich display in Jupyter notebook. - - Valid parameters: - - - format ("png" or "svg"): Image format to produce - - scaling (float): Relative scaling of embedded image - - hidpi (bool): When True, double the DPI while preserving the size - - """ - return self._display - - -# ---- The main interface for declarative plotting --------------------------------- # - - -@build_plot_signature -class Plot: - """ - An interface for declaratively specifying statistical graphics. - - Plots are constructed by initializing this class and adding one or more - layers, comprising a `Mark` and optional `Stat` or `Move`. Additionally, - faceting variables or variable pairings may be defined to divide the space - into multiple subplots. The mappings from data values to visual properties - can be parametrized using scales, although the plot will try to infer good - defaults when scales are not explicitly defined. - - The constructor accepts a data source (a :class:`pandas.DataFrame` or - dictionary with columnar values) and variable assignments. Variables can be - passed as keys to the data source or directly as data vectors. If multiple - data-containing objects are provided, they will be index-aligned. 
- - The data source and variables defined in the constructor will be used for - all layers in the plot, unless overridden or disabled when adding a layer. - - The following variables can be defined in the constructor: - {known_properties} - - The `data`, `x`, and `y` variables can be passed as positional arguments or - using keywords. Whether the first positional argument is interpreted as a - data source or `x` variable depends on its type. - - The methods of this class return a copy of the instance; use chaining to - build up a plot through multiple calls. Methods can be called in any order. - - Most methods only add information to the plot spec; no actual processing - happens until the plot is shown or saved. It is also possible to compile - the plot without rendering it to access the lower-level representation. - - """ - config = PlotConfig() - - _data: PlotData - _layers: list[Layer] - - _scales: dict[str, Scale] - _shares: dict[str, bool | str] - _limits: dict[str, tuple[Any, Any]] - _labels: dict[str, str | Callable[[str], str]] - _theme: dict[str, Any] - - _facet_spec: FacetSpec - _pair_spec: PairSpec - - _figure_spec: dict[str, Any] - _subplot_spec: dict[str, Any] - _layout_spec: dict[str, Any] - - def __init__( - self, - *args: DataSource | VariableSpec, - data: DataSource = None, - **variables: VariableSpec, - ): - - if args: - data, variables = self._resolve_positionals(args, data, variables) - - unknown = [x for x in variables if x not in PROPERTIES] - if unknown: - err = f"Plot() got unexpected keyword argument(s): {', '.join(unknown)}" - raise TypeError(err) - - self._data = PlotData(data, variables) - - self._layers = [] - - self._scales = {} - self._shares = {} - self._limits = {} - self._labels = {} - self._theme = {} - - self._facet_spec = {} - self._pair_spec = {} - - self._figure_spec = {} - self._subplot_spec = {} - self._layout_spec = {} - - self._target = None - - def _resolve_positionals( - self, - args: tuple[DataSource | VariableSpec, ...], - data: DataSource, - variables: dict[str, VariableSpec], - ) -> tuple[DataSource, dict[str, VariableSpec]]: - """Handle positional arguments, which may contain data / x / y.""" - if len(args) > 3: - err = "Plot() accepts no more than 3 positional arguments (data, x, y)." - raise TypeError(err) - - if ( - isinstance(args[0], (abc.Mapping, pd.DataFrame)) - or hasattr(args[0], "__dataframe__") - ): - if data is not None: - raise TypeError("`data` given by both name and position.") - data, args = args[0], args[1:] - - if len(args) == 2: - x, y = args - elif len(args) == 1: - x, y = *args, None - else: - x = y = None - - for name, var in zip("yx", (y, x)): - if var is not None: - if name in variables: - raise TypeError(f"`{name}` given by both name and position.") - # Keep coordinates at the front of the variables dict - # Cast type because we know this isn't a DataSource at this point - variables = {name: cast(VariableSpec, var), **variables} - - return data, variables - - def __add__(self, other): - - if isinstance(other, Mark) or isinstance(other, Stat): - raise TypeError("Sorry, this isn't ggplot! 
Perhaps try Plot.add?") - - other_type = other.__class__.__name__ - raise TypeError(f"Unsupported operand type(s) for +: 'Plot' and '{other_type}") - - def _repr_png_(self) -> tuple[bytes, dict[str, float]] | None: - - if Plot.config.display["format"] != "png": - return None - return self.plot()._repr_png_() - - def _repr_svg_(self) -> str | None: - - if Plot.config.display["format"] != "svg": - return None - return self.plot()._repr_svg_() - - def _clone(self) -> Plot: - """Generate a new object with the same information as the current spec.""" - new = Plot() - - # TODO any way to enforce that data does not get mutated? - new._data = self._data - - new._layers.extend(self._layers) - - new._scales.update(self._scales) - new._shares.update(self._shares) - new._limits.update(self._limits) - new._labels.update(self._labels) - new._theme.update(self._theme) - - new._facet_spec.update(self._facet_spec) - new._pair_spec.update(self._pair_spec) - - new._figure_spec.update(self._figure_spec) - new._subplot_spec.update(self._subplot_spec) - new._layout_spec.update(self._layout_spec) - - new._target = self._target - - return new - - def _theme_with_defaults(self) -> dict[str, Any]: - - theme = self.config.theme.copy() - theme.update(self._theme) - return theme - - @property - def _variables(self) -> list[str]: - - variables = ( - list(self._data.frame) - + list(self._pair_spec.get("variables", [])) - + list(self._facet_spec.get("variables", [])) - ) - for layer in self._layers: - variables.extend(v for v in layer["vars"] if v not in variables) - - # Coerce to str in return to appease mypy; we know these will only - # ever be strings but I don't think we can type a DataFrame that way yet - return [str(v) for v in variables] - - def on(self, target: Axes | SubFigure | Figure) -> Plot: - """ - Provide existing Matplotlib figure or axes for drawing the plot. - - When using this method, you will also need to explicitly call a method that - triggers compilation, such as :meth:`Plot.show` or :meth:`Plot.save`. If you - want to postprocess using matplotlib, you'd need to call :meth:`Plot.plot` - first to compile the plot without rendering it. - - Parameters - ---------- - target : Axes, SubFigure, or Figure - Matplotlib object to use. Passing :class:`matplotlib.axes.Axes` will add - artists without otherwise modifying the figure. Otherwise, subplots will be - created within the space of the given :class:`matplotlib.figure.Figure` or - :class:`matplotlib.figure.SubFigure`. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.on.rst - - """ - accepted_types: tuple # Allow tuple of various length - accepted_types = ( - mpl.axes.Axes, mpl.figure.SubFigure, mpl.figure.Figure - ) - accepted_types_str = ( - f"{mpl.axes.Axes}, {mpl.figure.SubFigure}, or {mpl.figure.Figure}" - ) - - if not isinstance(target, accepted_types): - err = ( - f"The `Plot.on` target must be an instance of {accepted_types_str}. " - f"You passed an instance of {target.__class__} instead." - ) - raise TypeError(err) - - new = self._clone() - new._target = target - - return new - - def add( - self, - mark: Mark, - *transforms: Stat | Move, - orient: str | None = None, - legend: bool = True, - label: str | None = None, - data: DataSource = None, - **variables: VariableSpec, - ) -> Plot: - """ - Specify a layer of the visualization in terms of mark and data transform(s). - - This is the main method for specifying how the data should be visualized. 
- It can be called multiple times with different arguments to define - a plot with multiple layers. - - Parameters - ---------- - mark : :class:`Mark` - The visual representation of the data to use in this layer. - transforms : :class:`Stat` or :class:`Move` - Objects representing transforms to be applied before plotting the data. - Currently, at most one :class:`Stat` can be used, and it - must be passed first. This constraint will be relaxed in the future. - orient : "x", "y", "v", or "h" - The orientation of the mark, which also affects how transforms are computed. - Typically corresponds to the axis that defines groups for aggregation. - The "v" (vertical) and "h" (horizontal) options are synonyms for "x" / "y", - but may be more intuitive with some marks. When not provided, an - orientation will be inferred from characteristics of the data and scales. - legend : bool - Option to suppress the mark/mappings for this layer from the legend. - label : str - A label to use for the layer in the legend, independent of any mappings. - data : DataFrame or dict - Data source to override the global source provided in the constructor. - variables : data vectors or identifiers - Additional layer-specific variables, including variables that will be - passed directly to the transforms without scaling. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.add.rst - - """ - if not isinstance(mark, Mark): - msg = f"mark must be a Mark instance, not {type(mark)!r}." - raise TypeError(msg) - - # TODO This API for transforms was a late decision, and previously Plot.add - # accepted 0 or 1 Stat instances and 0, 1, or a list of Move instances. - # It will take some work to refactor the internals so that Stat and Move are - # treated identically, and until then well need to "unpack" the transforms - # here and enforce limitations on the order / types. - - stat: Optional[Stat] - move: Optional[List[Move]] - error = False - if not transforms: - stat, move = None, None - elif isinstance(transforms[0], Stat): - stat = transforms[0] - move = [m for m in transforms[1:] if isinstance(m, Move)] - error = len(move) != len(transforms) - 1 - else: - stat = None - move = [m for m in transforms if isinstance(m, Move)] - error = len(move) != len(transforms) - - if error: - msg = " ".join([ - "Transforms must have at most one Stat type (in the first position),", - "and all others must be a Move type. Given transform type(s):", - ", ".join(str(type(t).__name__) for t in transforms) + "." - ]) - raise TypeError(msg) - - new = self._clone() - new._layers.append({ - "mark": mark, - "stat": stat, - "move": move, - # TODO it doesn't work to supply scalars to variables, but it should - "vars": variables, - "source": data, - "legend": legend, - "label": label, - "orient": {"v": "x", "h": "y"}.get(orient, orient), # type: ignore - }) - - return new - - def pair( - self, - x: VariableSpecList = None, - y: VariableSpecList = None, - wrap: int | None = None, - cross: bool = True, - ) -> Plot: - """ - Produce subplots by pairing multiple `x` and/or `y` variables. - - Parameters - ---------- - x, y : sequence(s) of data vectors or identifiers - Variables that will define the grid of subplots. - wrap : int - When using only `x` or `y`, "wrap" subplots across a two-dimensional grid - with this many columns (when using `x`) or rows (when using `y`). - cross : bool - When False, zip the `x` and `y` lists such that the first subplot gets the - first pair, the second gets the second pair, etc. 
Otherwise, create a - two-dimensional grid from the cartesian product of the lists. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.pair.rst - - """ - # TODO Add transpose= arg, which would then draw pair(y=[...]) across rows - # This may also be possible by setting `wrap=1`, but is that too unobvious? - # TODO PairGrid features not currently implemented: diagonals, corner - - pair_spec: PairSpec = {} - - axes = {"x": [] if x is None else x, "y": [] if y is None else y} - for axis, arg in axes.items(): - if isinstance(arg, (str, int)): - err = f"You must pass a sequence of variable keys to `{axis}`" - raise TypeError(err) - - pair_spec["variables"] = {} - pair_spec["structure"] = {} - - for axis in "xy": - keys = [] - for i, col in enumerate(axes[axis]): - key = f"{axis}{i}" - keys.append(key) - pair_spec["variables"][key] = col - - if keys: - pair_spec["structure"][axis] = keys - - if not cross and len(axes["x"]) != len(axes["y"]): - err = "Lengths of the `x` and `y` lists must match with cross=False" - raise ValueError(err) - - pair_spec["cross"] = cross - pair_spec["wrap"] = wrap - - new = self._clone() - new._pair_spec.update(pair_spec) - return new - - def facet( - self, - col: VariableSpec = None, - row: VariableSpec = None, - order: OrderSpec | dict[str, OrderSpec] = None, - wrap: int | None = None, - ) -> Plot: - """ - Produce subplots with conditional subsets of the data. - - Parameters - ---------- - col, row : data vectors or identifiers - Variables used to define subsets along the columns and/or rows of the grid. - Can be references to the global data source passed in the constructor. - order : list of strings, or dict with dimensional keys - Define the order of the faceting variables. - wrap : int - When using only `col` or `row`, wrap subplots across a two-dimensional - grid with this many subplots on the faceting dimension. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.facet.rst - - """ - variables: dict[str, VariableSpec] = {} - if col is not None: - variables["col"] = col - if row is not None: - variables["row"] = row - - structure = {} - if isinstance(order, dict): - for dim in ["col", "row"]: - dim_order = order.get(dim) - if dim_order is not None: - structure[dim] = list(dim_order) - elif order is not None: - if col is not None and row is not None: - err = " ".join([ - "When faceting on both col= and row=, passing `order` as a list" - "is ambiguous. Use a dict with 'col' and/or 'row' keys instead." - ]) - raise RuntimeError(err) - elif col is not None: - structure["col"] = list(order) - elif row is not None: - structure["row"] = list(order) - - spec: FacetSpec = { - "variables": variables, - "structure": structure, - "wrap": wrap, - } - - new = self._clone() - new._facet_spec.update(spec) - - return new - - # TODO def twin()? - - def scale(self, **scales: Scale) -> Plot: - """ - Specify mappings from data units to visual properties. - - Keywords correspond to variables defined in the plot, including coordinate - variables (`x`, `y`) and semantic variables (`color`, `pointsize`, etc.). - - A number of "magic" arguments are accepted, including: - - The name of a transform (e.g., `"log"`, `"sqrt"`) - - The name of a palette (e.g., `"viridis"`, `"muted"`) - - A tuple of values, defining the output range (e.g. `(1, 5)`) - - A dict, implying a :class:`Nominal` scale (e.g. `{"a": .2, "b": .5}`) - - A list of values, implying a :class:`Nominal` scale (e.g. 
`["b", "r"]`) - - For more explicit control, pass a scale spec object such as :class:`Continuous` - or :class:`Nominal`. Or pass `None` to use an "identity" scale, which treats - data values as literally encoding visual properties. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.scale.rst - - """ - new = self._clone() - new._scales.update(scales) - return new - - def share(self, **shares: bool | str) -> Plot: - """ - Control sharing of axis limits and ticks across subplots. - - Keywords correspond to variables defined in the plot, and values can be - boolean (to share across all subplots), or one of "row" or "col" (to share - more selectively across one dimension of a grid). - - Behavior for non-coordinate variables is currently undefined. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.share.rst - - """ - new = self._clone() - new._shares.update(shares) - return new - - def limit(self, **limits: tuple[Any, Any]) -> Plot: - """ - Control the range of visible data. - - Keywords correspond to variables defined in the plot, and values are a - `(min, max)` tuple (where either can be `None` to leave unset). - - Limits apply only to the axis; data outside the visible range are - still used for any stat transforms and added to the plot. - - Behavior for non-coordinate variables is currently undefined. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.limit.rst - - """ - new = self._clone() - new._limits.update(limits) - return new - - def label( - self, *, - title: str | None = None, - legend: str | None = None, - **variables: str | Callable[[str], str] - ) -> Plot: - """ - Control the labels and titles for axes, legends, and subplots. - - Additional keywords correspond to variables defined in the plot. - Values can be one of the following types: - - - string (used literally; pass "" to clear the default label) - - function (called on the default label) - - For coordinate variables, the value sets the axis label. - For semantic variables, the value sets the legend title. - For faceting variables, `title=` modifies the subplot-specific label, - while `col=` and/or `row=` add a label for the faceting variable. - - When using a single subplot, `title=` sets its title. - - The `legend=` parameter sets the title for the "layer" legend - (i.e., when using `label` in :meth:`Plot.add`). - - Examples - -------- - .. include:: ../docstrings/objects.Plot.label.rst - - - """ - new = self._clone() - if title is not None: - new._labels["title"] = title - if legend is not None: - new._labels["legend"] = legend - new._labels.update(variables) - return new - - def layout( - self, - *, - size: tuple[float, float] | Default = default, - engine: str | None | Default = default, - extent: tuple[float, float, float, float] | Default = default, - ) -> Plot: - """ - Control the figure size and layout. - - .. note:: - - Default figure sizes and the API for specifying the figure size are subject - to change in future "experimental" releases of the objects API. The default - layout engine may also change. - - Parameters - ---------- - size : (width, height) - Size of the resulting figure, in inches. Size is inclusive of legend when - using pyplot, but not otherwise. - engine : {{"tight", "constrained", "none"}} - Name of method for automatically adjusting the layout to remove overlap. - The default depends on whether :meth:`Plot.on` is used. - extent : (left, bottom, right, top) - Boundaries of the plot layout, in fractions of the figure size. 
Takes - effect through the layout engine; exact results will vary across engines. - Note: the extent includes axis decorations when using a layout engine, - but it is exclusive of them when `engine="none"`. - - Examples - -------- - .. include:: ../docstrings/objects.Plot.layout.rst - - """ - # TODO add an "auto" mode for figsize that roughly scales with the rcParams - # figsize (so that works), but expands to prevent subplots from being squished - # Also should we have height=, aspect=, exclusive with figsize? Or working - # with figsize when only one is defined? - - new = self._clone() - - if size is not default: - new._figure_spec["figsize"] = size - if engine is not default: - new._layout_spec["engine"] = engine - if extent is not default: - new._layout_spec["extent"] = extent - - return new - - # TODO def legend (ugh) - - def theme(self, config: dict[str, Any], /) -> Plot: - """ - Control the appearance of elements in the plot. - - .. note:: - - The API for customizing plot appearance is not yet finalized. - Currently, the only valid argument is a dict of matplotlib rc parameters. - (This dict must be passed as a positional argument.) - - It is likely that this method will be enhanced in future releases. - - Matplotlib rc parameters are documented on the following page: - https://matplotlib.org/stable/tutorials/introductory/customizing.html - - Examples - -------- - .. include:: ../docstrings/objects.Plot.theme.rst - - """ - new = self._clone() - - rc = mpl.RcParams(config) - new._theme.update(rc) - - return new - - def save(self, loc, **kwargs) -> Plot: - """ - Compile the plot and write it to a buffer or file on disk. - - Parameters - ---------- - loc : str, path, or buffer - Location on disk to save the figure, or a buffer to write into. - kwargs - Other keyword arguments are passed through to - :meth:`matplotlib.figure.Figure.savefig`. - - """ - # TODO expose important keyword arguments in our signature? - with theme_context(self._theme_with_defaults()): - self._plot().save(loc, **kwargs) - return self - - def show(self, **kwargs) -> None: - """ - Compile the plot and display it by hooking into pyplot. - - Calling this method is not necessary to render a plot in notebook context, - but it may be in other environments (e.g., in a terminal). After compiling the - plot, it calls :func:`matplotlib.pyplot.show` (passing any keyword parameters). - - Unlike other :class:`Plot` methods, there is no return value. This should be - the last method you call when specifying a plot. - - """ - # TODO make pyplot configurable at the class level, and when not using, - # import IPython.display and call on self to populate cell output? - - # Keep an eye on whether matplotlib implements "attaching" an existing - # figure to pyplot: https://github.com/matplotlib/matplotlib/pull/14024 - - self.plot(pyplot=True).show(**kwargs) - - def plot(self, pyplot: bool = False) -> Plotter: - """ - Compile the plot spec and return the Plotter object. - """ - with theme_context(self._theme_with_defaults()): - return self._plot(pyplot) - - def _plot(self, pyplot: bool = False) -> Plotter: - - # TODO if we have _target object, pyplot should be determined by whether it - # is hooked into the pyplot state machine (how do we check?) 
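-        # A hedged sketch of the call path being compiled here (`so`, `tips`,
-        # and `Dot` are assumed example names, not defined in this module):
-        #
-        #     p = so.Plot(tips, x="total_bill", y="tip").add(so.Dot())
-        #     plotter = p.plot()  # compile the spec without rendering it
-        #
-        # The steps below implement that compilation: extract data, set up the
-        # figure, scale coordinates, apply stats, scale semantics, draw each
-        # layer, then add the legend and finalize the figure.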
- - plotter = Plotter(pyplot=pyplot, theme=self._theme_with_defaults()) - - # Process the variable assignments and initialize the figure - common, layers = plotter._extract_data(self) - plotter._setup_figure(self, common, layers) - - # Process the scale spec for coordinate variables and transform their data - coord_vars = [v for v in self._variables if re.match(r"^x|y", v)] - plotter._setup_scales(self, common, layers, coord_vars) - - # Apply statistical transform(s) - plotter._compute_stats(self, layers) - - # Process scale spec for semantic variables and coordinates computed by stat - plotter._setup_scales(self, common, layers) - - # TODO Remove these after updating other methods - # ---- Maybe have debug= param that attaches these when True? - plotter._data = common - plotter._layers = layers - - # Process the data for each layer and add matplotlib artists - for layer in layers: - plotter._plot_layer(self, layer) - - # Add various figure decorations - plotter._make_legend(self) - plotter._finalize_figure(self) - - return plotter - - -# ---- The plot compilation engine ---------------------------------------------- # - - -class Plotter: - """ - Engine for compiling a :class:`Plot` spec into a Matplotlib figure. - - This class is not intended to be instantiated directly by users. - - """ - # TODO decide if we ever want these (Plot.plot(debug=True))? - _data: PlotData - _layers: list[Layer] - _figure: Figure - - def __init__(self, pyplot: bool, theme: dict[str, Any]): - - self._pyplot = pyplot - self._theme = theme - self._legend_contents: list[tuple[ - tuple[str, str | int], list[Artist], list[str], - ]] = [] - self._scales: dict[str, Scale] = {} - - def save(self, loc, **kwargs) -> Plotter: # TODO type args - kwargs.setdefault("dpi", 96) - try: - loc = os.path.expanduser(loc) - except TypeError: - # loc may be a buffer in which case that would not work - pass - self._figure.savefig(loc, **kwargs) - return self - - def show(self, **kwargs) -> None: - """ - Display the plot by hooking into pyplot. - - This method calls :func:`matplotlib.pyplot.show` with any keyword parameters. - - """ - # TODO if we did not create the Plotter with pyplot, is it possible to do this? - # If not we should clearly raise. - import matplotlib.pyplot as plt - with theme_context(self._theme): - plt.show(**kwargs) - - # TODO API for accessing the underlying matplotlib objects - # TODO what else is useful in the public API for this class? - - def _repr_png_(self) -> tuple[bytes, dict[str, float]] | None: - - # TODO use matplotlib backend directly instead of going through savefig? - - # TODO perhaps have self.show() flip a switch to disable this, so that - # user does not end up with two versions of the figure in the output - - # TODO use bbox_inches="tight" like the inline backend? - # pro: better results, con: (sometimes) confusing results - # Better solution would be to default (with option to change) - # to using constrained/tight layout. - - if Plot.config.display["format"] != "png": - return None - - buffer = io.BytesIO() - - factor = 2 if Plot.config.display["hidpi"] else 1 - scaling = Plot.config.display["scaling"] / factor - dpi = 96 * factor # TODO put dpi in Plot.config? - - with theme_context(self._theme): # TODO _theme_with_defaults? 
- self._figure.savefig(buffer, dpi=dpi, format="png", bbox_inches="tight") - data = buffer.getvalue() - - w, h = Image.open(buffer).size - metadata = {"width": w * scaling, "height": h * scaling} - return data, metadata - - def _repr_svg_(self) -> str | None: - - if Plot.config.display["format"] != "svg": - return None - - # TODO DPI for rasterized artists? - - scaling = Plot.config.display["scaling"] - - buffer = io.StringIO() - with theme_context(self._theme): # TODO _theme_with_defaults? - self._figure.savefig(buffer, format="svg", bbox_inches="tight") - - root = ElementTree.fromstring(buffer.getvalue()) - w = scaling * float(root.attrib["width"][:-2]) - h = scaling * float(root.attrib["height"][:-2]) - root.attrib.update(width=f"{w}pt", height=f"{h}pt", viewbox=f"0 0 {w} {h}") - ElementTree.ElementTree(root).write(out := io.BytesIO()) - - return out.getvalue().decode() - - def _extract_data(self, p: Plot) -> tuple[PlotData, list[Layer]]: - - common_data = ( - p._data - .join(None, p._facet_spec.get("variables")) - .join(None, p._pair_spec.get("variables")) - ) - - layers: list[Layer] = [] - for layer in p._layers: - spec = layer.copy() - spec["data"] = common_data.join(layer.get("source"), layer.get("vars")) - layers.append(spec) - - return common_data, layers - - def _resolve_label(self, p: Plot, var: str, auto_label: str | None) -> str: - - if re.match(r"[xy]\d+", var): - key = var if var in p._labels else var[0] - else: - key = var - - label: str - if key in p._labels: - manual_label = p._labels[key] - if callable(manual_label) and auto_label is not None: - label = manual_label(auto_label) - else: - label = cast(str, manual_label) - elif auto_label is None: - label = "" - else: - label = auto_label - return label - - def _setup_figure(self, p: Plot, common: PlotData, layers: list[Layer]) -> None: - - # --- Parsing the faceting/pairing parameterization to specify figure grid - - subplot_spec = p._subplot_spec.copy() - facet_spec = p._facet_spec.copy() - pair_spec = p._pair_spec.copy() - - for axis in "xy": - if axis in p._shares: - subplot_spec[f"share{axis}"] = p._shares[axis] - - for dim in ["col", "row"]: - if dim in common.frame and dim not in facet_spec["structure"]: - order = categorical_order(common.frame[dim]) - facet_spec["structure"][dim] = order - - self._subplots = subplots = Subplots(subplot_spec, facet_spec, pair_spec) - - # --- Figure initialization - self._figure = subplots.init_figure( - pair_spec, self._pyplot, p._figure_spec, p._target, - ) - - # --- Figure annotation - for sub in subplots: - ax = sub["ax"] - for axis in "xy": - axis_key = sub[axis] - - # ~~ Axis labels - - # TODO Should we make it possible to use only one x/y label for - # all rows/columns in a faceted plot? Maybe using sub{axis}label, - # although the alignments of the labels from that method leaves - # something to be desired (in terms of how it defines 'centered'). - names = [ - common.names.get(axis_key), - *(layer["data"].names.get(axis_key) for layer in layers) - ] - auto_label = next((name for name in names if name is not None), None) - label = self._resolve_label(p, axis_key, auto_label) - ax.set(**{f"{axis}label": label}) - - # ~~ Decoration visibility - - # TODO there should be some override (in Plot.layout?) 
so that - # axis / tick labels can be shown on interior shared axes if desired - - axis_obj = getattr(ax, f"{axis}axis") - visible_side = {"x": "bottom", "y": "left"}.get(axis) - show_axis_label = ( - sub[visible_side] - or not p._pair_spec.get("cross", True) - or ( - axis in p._pair_spec.get("structure", {}) - and bool(p._pair_spec.get("wrap")) - ) - ) - axis_obj.get_label().set_visible(show_axis_label) - - show_tick_labels = ( - show_axis_label - or subplot_spec.get(f"share{axis}") not in ( - True, "all", {"x": "col", "y": "row"}[axis] - ) - ) - for group in ("major", "minor"): - side = {"x": "bottom", "y": "left"}[axis] - axis_obj.set_tick_params(**{f"label{side}": show_tick_labels}) - for t in getattr(axis_obj, f"get_{group}ticklabels")(): - t.set_visible(show_tick_labels) - - # TODO we want right-side titles for row facets in most cases? - # Let's have what we currently call "margin titles" but properly using the - # ax.set_title interface (see my gist) - title_parts = [] - for dim in ["col", "row"]: - if sub[dim] is not None: - val = self._resolve_label(p, "title", f"{sub[dim]}") - if dim in p._labels: - key = self._resolve_label(p, dim, common.names.get(dim)) - val = f"{key} {val}" - title_parts.append(val) - - has_col = sub["col"] is not None - has_row = sub["row"] is not None - show_title = ( - has_col and has_row - or (has_col or has_row) and p._facet_spec.get("wrap") - or (has_col and sub["top"]) - # TODO or has_row and sub["right"] and <right titles> - or has_row # TODO and not <right titles> - ) - if title_parts: - title = " | ".join(title_parts) - title_text = ax.set_title(title) - title_text.set_visible(show_title) - elif not (has_col or has_row): - title = self._resolve_label(p, "title", None) - title_text = ax.set_title(title) - - def _compute_stats(self, spec: Plot, layers: list[Layer]) -> None: - - grouping_vars = [v for v in PROPERTIES if v not in "xy"] - grouping_vars += ["col", "row", "group"] - - pair_vars = spec._pair_spec.get("structure", {}) - - for layer in layers: - - data = layer["data"] - mark = layer["mark"] - stat = layer["stat"] - - if stat is None: - continue - - iter_axes = itertools.product(*[ - pair_vars.get(axis, [axis]) for axis in "xy" - ]) - - old = data.frame - - if pair_vars: - data.frames = {} - data.frame = data.frame.iloc[:0] # TODO to simplify typing - - for coord_vars in iter_axes: - - pairings = "xy", coord_vars - - df = old.copy() - scales = self._scales.copy() - - for axis, var in zip(*pairings): - if axis != var: - df = df.rename(columns={var: axis}) - drop_cols = [x for x in df if re.match(rf"{axis}\d+", str(x))] - df = df.drop(drop_cols, axis=1) - scales[axis] = scales[var] - - orient = layer["orient"] or mark._infer_orient(scales) - - if stat.group_by_orient: - grouper = [orient, *grouping_vars] - else: - grouper = grouping_vars - groupby = GroupBy(grouper) - res = stat(df, groupby, orient, scales) - - if pair_vars: - data.frames[coord_vars] = res - else: - data.frame = res - - def _get_scale( - self, p: Plot, var: str, prop: Property, values: Series - ) -> Scale: - - if re.match(r"[xy]\d+", var): - key = var if var in p._scales else var[0] - else: - key = var - - if key in p._scales: - arg = p._scales[key] - if arg is None or isinstance(arg, Scale): - scale = arg - else: - scale = prop.infer_scale(arg, values) - else: - scale = prop.default_scale(values) - - return scale - - def _get_subplot_data(self, df, var, view, share_state): - - if share_state in [True, "all"]: - # The all-shared case is easiest, every subplot sees all the data 
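-            # For orientation, how each share state set via Plot.share maps
-            # onto the branches of this function:
-            #   share(x=True) / "all"   -> every subplot seeds from all the data
-            #   share(x=False) / "none" -> each subplot seeds from its own rows
-            #   share(x="col") / "row"  -> seed from the subplot's column / row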
- seed_values = df[var] - else: - # Otherwise, we need to setup separate scales for different subplots - if share_state in [False, "none"]: - # Fully independent axes are also easy: use each subplot's data - idx = self._get_subplot_index(df, view) - elif share_state in df: - # Sharing within row/col is more complicated - use_rows = df[share_state] == view[share_state] - idx = df.index[use_rows] - else: - # This configuration doesn't make much sense, but it's fine - idx = df.index - - seed_values = df.loc[idx, var] - - return seed_values - - def _setup_scales( - self, - p: Plot, - common: PlotData, - layers: list[Layer], - variables: list[str] | None = None, - ) -> None: - - if variables is None: - # Add variables that have data but not a scale, which happens - # because this method can be called multiple time, to handle - # variables added during the Stat transform. - variables = [] - for layer in layers: - variables.extend(layer["data"].frame.columns) - for df in layer["data"].frames.values(): - variables.extend(str(v) for v in df if v not in variables) - variables = [v for v in variables if v not in self._scales] - - for var in variables: - - # Determine whether this is a coordinate variable - # (i.e., x/y, paired x/y, or derivative such as xmax) - m = re.match(r"^(?P<coord>(?P<axis>x|y)\d*).*", var) - if m is None: - coord = axis = None - else: - coord = m["coord"] - axis = m["axis"] - - # Get keys that handle things like x0, xmax, properly where relevant - prop_key = var if axis is None else axis - scale_key = var if coord is None else coord - - if prop_key not in PROPERTIES: - continue - - # Concatenate layers, using only the relevant coordinate and faceting vars, - # This is unnecessarily wasteful, as layer data will often be redundant. - # But figuring out the minimal amount we need is more complicated. - cols = [var, "col", "row"] - parts = [common.frame.filter(cols)] - for layer in layers: - parts.append(layer["data"].frame.filter(cols)) - for df in layer["data"].frames.values(): - parts.append(df.filter(cols)) - var_df = pd.concat(parts, ignore_index=True) - - prop = PROPERTIES[prop_key] - scale = self._get_scale(p, scale_key, prop, var_df[var]) - - if scale_key not in p._variables: - # TODO this implies that the variable was added by the stat - # It allows downstream orientation inference to work properly. - # But it feels rather hacky, so ideally revisit. - scale._priority = 0 # type: ignore - - if axis is None: - # We could think about having a broader concept of (un)shared properties - # In general, not something you want to do (different scales in facets) - # But could make sense e.g. with paired plots. Build later. - share_state = None - subplots = [] - else: - share_state = self._subplots.subplot_spec[f"share{axis}"] - subplots = [view for view in self._subplots if view[axis] == coord] - - if scale is None: - self._scales[var] = Scale._identity() - else: - try: - self._scales[var] = scale._setup(var_df[var], prop) - except Exception as err: - raise PlotSpecError._during("Scale setup", var) from err - - if axis is None or (var != coord and coord in p._variables): - # Everything below here applies only to coordinate variables - continue - - # Set up an empty series to receive the transformed values. - # We need this to handle piecemeal transforms of categories -> floats. 
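-            # A hedged example of why: with a Nominal x scale, one subplot may
-            # see categories ["a", "b"] and another ["b", "c"]; each view
-            # transforms only its own rows into float positions, and the
-            # per-view results are stitched back into one series per layer below.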
- transformed_data = [] - for layer in layers: - index = layer["data"].frame.index - empty_series = pd.Series(dtype=float, index=index, name=var) - transformed_data.append(empty_series) - - for view in subplots: - - axis_obj = getattr(view["ax"], f"{axis}axis") - seed_values = self._get_subplot_data(var_df, var, view, share_state) - view_scale = scale._setup(seed_values, prop, axis=axis_obj) - view["ax"].set(**{f"{axis}scale": view_scale._matplotlib_scale}) - - for layer, new_series in zip(layers, transformed_data): - layer_df = layer["data"].frame - if var not in layer_df: - continue - - idx = self._get_subplot_index(layer_df, view) - try: - new_series.loc[idx] = view_scale(layer_df.loc[idx, var]) - except Exception as err: - spec_error = PlotSpecError._during("Scaling operation", var) - raise spec_error from err - - # Now the transformed data series are complete, update the layer data - for layer, new_series in zip(layers, transformed_data): - layer_df = layer["data"].frame - if var in layer_df: - layer_df[var] = pd.to_numeric(new_series) - - def _plot_layer(self, p: Plot, layer: Layer) -> None: - - data = layer["data"] - mark = layer["mark"] - move = layer["move"] - - default_grouping_vars = ["col", "row", "group"] # TODO where best to define? - grouping_properties = [v for v in PROPERTIES if v[0] not in "xy"] - - pair_variables = p._pair_spec.get("structure", {}) - - for subplots, df, scales in self._generate_pairings(data, pair_variables): - - orient = layer["orient"] or mark._infer_orient(scales) - - def get_order(var): - # Ignore order for x/y: they have been scaled to numeric indices, - # so any original order is no longer valid. Default ordering rules - # sorted unique numbers will correctly reconstruct intended order - # TODO This is tricky, make sure we add some tests for this - if var not in "xy" and var in scales: - return getattr(scales[var], "order", None) - - if orient in df: - width = pd.Series(index=df.index, dtype=float) - for view in subplots: - view_idx = self._get_subplot_data( - df, orient, view, p._shares.get(orient) - ).index - view_df = df.loc[view_idx] - if "width" in mark._mappable_props: - view_width = mark._resolve(view_df, "width", None) - elif "width" in df: - view_width = view_df["width"] - else: - view_width = 0.8 # TODO what default? - spacing = scales[orient]._spacing(view_df.loc[view_idx, orient]) - width.loc[view_idx] = view_width * spacing - df["width"] = width - - if "baseline" in mark._mappable_props: - # TODO what marks should have this? - # If we can set baseline with, e.g., Bar(), then the - # "other" (e.g. y for x oriented bars) parameterization - # is somewhat ambiguous. - baseline = mark._resolve(df, "baseline", None) - else: - # TODO unlike width, we might not want to add baseline to data - # if the mark doesn't use it. 
Practically, there is a concern about - # Mark abstraction like Area / Ribbon - baseline = 0 if "baseline" not in df else df["baseline"] - df["baseline"] = baseline - - if move is not None: - moves = move if isinstance(move, list) else [move] - for move_step in moves: - move_by = getattr(move_step, "by", None) - if move_by is None: - move_by = grouping_properties - move_groupers = [*move_by, *default_grouping_vars] - if move_step.group_by_orient: - move_groupers.insert(0, orient) - order = {var: get_order(var) for var in move_groupers} - groupby = GroupBy(order) - df = move_step(df, groupby, orient, scales) - - df = self._unscale_coords(subplots, df, orient) - - grouping_vars = mark._grouping_props + default_grouping_vars - split_generator = self._setup_split_generator(grouping_vars, df, subplots) - - mark._plot(split_generator, scales, orient) - - # TODO is this the right place for this? - for view in self._subplots: - view["ax"].autoscale_view() - - if layer["legend"]: - self._update_legend_contents(p, mark, data, scales, layer["label"]) - - def _unscale_coords( - self, subplots: list[dict], df: DataFrame, orient: str, - ) -> DataFrame: - # TODO do we still have numbers in the variable name at this point? - coord_cols = [c for c in df if re.match(r"^[xy]\D*$", str(c))] - out_df = ( - df - .drop(coord_cols, axis=1) - .reindex(df.columns, axis=1) # So unscaled columns retain their place - .copy(deep=False) - ) - - for view in subplots: - view_df = self._filter_subplot_data(df, view) - axes_df = view_df[coord_cols] - for var, values in axes_df.items(): - - axis = getattr(view["ax"], f"{str(var)[0]}axis") - # TODO see https://github.com/matplotlib/matplotlib/issues/22713 - transform = axis.get_transform().inverted().transform - inverted = transform(values) - out_df.loc[values.index, str(var)] = inverted - - return out_df - - def _generate_pairings( - self, data: PlotData, pair_variables: dict, - ) -> Generator[ - tuple[list[dict], DataFrame, dict[str, Scale]], None, None - ]: - # TODO retype return with subplot_spec or similar - - iter_axes = itertools.product(*[ - pair_variables.get(axis, [axis]) for axis in "xy" - ]) - - for x, y in iter_axes: - - subplots = [] - for view in self._subplots: - if (view["x"] == x) and (view["y"] == y): - subplots.append(view) - - if data.frame.empty and data.frames: - out_df = data.frames[(x, y)].copy() - elif not pair_variables: - out_df = data.frame.copy() - else: - if data.frame.empty and data.frames: - out_df = data.frames[(x, y)].copy() - else: - out_df = data.frame.copy() - - scales = self._scales.copy() - if x in out_df: - scales["x"] = self._scales[x] - if y in out_df: - scales["y"] = self._scales[y] - - for axis, var in zip("xy", (x, y)): - if axis != var: - out_df = out_df.rename(columns={var: axis}) - cols = [col for col in out_df if re.match(rf"{axis}\d+", str(col))] - out_df = out_df.drop(cols, axis=1) - - yield subplots, out_df, scales - - def _get_subplot_index(self, df: DataFrame, subplot: dict) -> Index: - - dims = df.columns.intersection(["col", "row"]) - if dims.empty: - return df.index - - keep_rows = pd.Series(True, df.index, dtype=bool) - for dim in dims: - keep_rows &= df[dim] == subplot[dim] - return df.index[keep_rows] - - def _filter_subplot_data(self, df: DataFrame, subplot: dict) -> DataFrame: - # TODO note redundancies with preceding function ... 
needs refactoring - dims = df.columns.intersection(["col", "row"]) - if dims.empty: - return df - - keep_rows = pd.Series(True, df.index, dtype=bool) - for dim in dims: - keep_rows &= df[dim] == subplot[dim] - return df[keep_rows] - - def _setup_split_generator( - self, grouping_vars: list[str], df: DataFrame, subplots: list[dict[str, Any]], - ) -> Callable[[], Generator]: - - grouping_keys = [] - grouping_vars = [ - v for v in grouping_vars if v in df and v not in ["col", "row"] - ] - for var in grouping_vars: - order = getattr(self._scales[var], "order", None) - if order is None: - order = categorical_order(df[var]) - grouping_keys.append(order) - - def split_generator(keep_na=False) -> Generator: - - for view in subplots: - - axes_df = self._filter_subplot_data(df, view) - - axes_df_inf_as_nan = axes_df.copy() - axes_df_inf_as_nan = axes_df_inf_as_nan.mask( - axes_df_inf_as_nan.isin([np.inf, -np.inf]), np.nan - ) - if keep_na: - # The simpler thing to do would be x.dropna().reindex(x.index). - # But that doesn't work with the way that the subset iteration - # is written below, which assumes data for grouping vars. - # Matplotlib (usually?) masks nan data, so this should "work". - # Downstream code can also drop these rows, at some speed cost. - present = axes_df_inf_as_nan.notna().all(axis=1) - nulled = {} - for axis in "xy": - if axis in axes_df: - nulled[axis] = axes_df[axis].where(present) - axes_df = axes_df_inf_as_nan.assign(**nulled) - else: - axes_df = axes_df_inf_as_nan.dropna() - - subplot_keys = {} - for dim in ["col", "row"]: - if view[dim] is not None: - subplot_keys[dim] = view[dim] - - if not grouping_vars or not any(grouping_keys): - if not axes_df.empty: - yield subplot_keys, axes_df.copy(), view["ax"] - continue - - grouped_df = axes_df.groupby( - grouping_vars, sort=False, as_index=False, observed=False, - ) - - for key in itertools.product(*grouping_keys): - - pd_key = ( - key[0] if len(key) == 1 and _version_predates(pd, "2.2.0") - else key - ) - try: - df_subset = grouped_df.get_group(pd_key) - except KeyError: - # TODO (from initial work on categorical plots refactor) - # We are adding this to allow backwards compatability - # with the empty artists that old categorical plots would - # add (before 0.12), which we may decide to break, in which - # case this option could be removed - df_subset = axes_df.loc[[]] - - if df_subset.empty: - continue - - sub_vars = dict(zip(grouping_vars, key)) - sub_vars.update(subplot_keys) - - # TODO need copy(deep=...) policy (here, above, anywhere else?) - yield sub_vars, df_subset.copy(), view["ax"] - - return split_generator - - def _update_legend_contents( - self, - p: Plot, - mark: Mark, - data: PlotData, - scales: dict[str, Scale], - layer_label: str | None, - ) -> None: - """Add legend artists / labels for one layer in the plot.""" - if data.frame.empty and data.frames: - legend_vars: list[str] = [] - for frame in data.frames.values(): - frame_vars = frame.columns.intersection(list(scales)) - legend_vars.extend(v for v in frame_vars if v not in legend_vars) - else: - legend_vars = list(data.frame.columns.intersection(list(scales))) - - # First handle layer legends, which occupy a single entry in legend_contents. 
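-        # Illustrative (assumed) usage that reaches this branch:
-        #     Plot(df).add(Line(), label="observed").add(Line(), PolyFit(), label="fit")
-        # Both layers share the key (legend_title, -1), so their artists and
-        # labels accumulate into a single legend block.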
- if layer_label is not None: - legend_title = str(p._labels.get("legend", "")) - layer_key = (legend_title, -1) - artist = mark._legend_artist([], None, {}) - if artist is not None: - for content in self._legend_contents: - if content[0] == layer_key: - content[1].append(artist) - content[2].append(layer_label) - break - else: - self._legend_contents.append((layer_key, [artist], [layer_label])) - - # Then handle the scale legends - # First pass: Identify the values that will be shown for each variable - schema: list[tuple[ - tuple[str, str | int], list[str], tuple[list[Any], list[str]] - ]] = [] - schema = [] - for var in legend_vars: - var_legend = scales[var]._legend - if var_legend is not None: - values, labels = var_legend - for (_, part_id), part_vars, _ in schema: - if data.ids[var] == part_id: - # Allow multiple plot semantics to represent same data variable - part_vars.append(var) - break - else: - title = self._resolve_label(p, var, data.names[var]) - entry = (title, data.ids[var]), [var], (values, labels) - schema.append(entry) - - # Second pass, generate an artist corresponding to each value - contents: list[tuple[tuple[str, str | int], Any, list[str]]] = [] - for key, variables, (values, labels) in schema: - artists = [] - for val in values: - artist = mark._legend_artist(variables, val, scales) - if artist is not None: - artists.append(artist) - if artists: - contents.append((key, artists, labels)) - - self._legend_contents.extend(contents) - - def _make_legend(self, p: Plot) -> None: - """Create the legend artist(s) and add onto the figure.""" - # Combine artists representing same information across layers - # Input list has an entry for each distinct variable in each layer - # Output dict has an entry for each distinct variable - merged_contents: dict[ - tuple[str, str | int], tuple[list[tuple[Artist, ...]], list[str]], - ] = {} - for key, new_artists, labels in self._legend_contents: - # Key is (name, id); we need the id to resolve variable uniqueness, - # but will need the name in the next step to title the legend - if key not in merged_contents: - # Matplotlib accepts a tuple of artists and will overlay them - new_artist_tuples = [tuple([a]) for a in new_artists] - merged_contents[key] = new_artist_tuples, labels - else: - existing_artists = merged_contents[key][0] - for i, new_artist in enumerate(new_artists): - existing_artists[i] += tuple([new_artist]) - - # When using pyplot, an "external" legend won't be shown, so this - # keeps it inside the axes (though still attached to the figure) - # This is necessary because matplotlib layout engines currently don't - # support figure legends — ideally this will change. - loc = "center right" if self._pyplot else "center left" - - base_legend = None - for (name, _), (handles, labels) in merged_contents.items(): - - legend = mpl.legend.Legend( - self._figure, - handles, # type: ignore # matplotlib/issues/26639 - labels, - title=name, - loc=loc, - bbox_to_anchor=(.98, .55), - ) - - if base_legend: - # Matplotlib has no public API for this so it is a bit of a hack. - # Ideally we'd define our own legend class with more flexibility, - # but that is a lot of work! 
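-                # Sketch of the hack: get_children()[0] is assumed here to be
-                # the legend's internal packer box (a private matplotlib
-                # detail); appending another legend's packed entries to it
-                # draws both groups inside the first legend's frame.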
- base_legend_box = base_legend.get_children()[0] - this_legend_box = legend.get_children()[0] - base_legend_box.get_children().extend(this_legend_box.get_children()) - else: - base_legend = legend - self._figure.legends.append(legend) - - def _finalize_figure(self, p: Plot) -> None: - - for sub in self._subplots: - ax = sub["ax"] - for axis in "xy": - axis_key = sub[axis] - axis_obj = getattr(ax, f"{axis}axis") - - # Axis limits - if axis_key in p._limits or axis in p._limits: - convert_units = getattr(ax, f"{axis}axis").convert_units - a, b = p._limits.get(axis_key) or p._limits[axis] - lo = a if a is None else convert_units(a) - hi = b if b is None else convert_units(b) - if isinstance(a, str): - lo = cast(float, lo) - 0.5 - if isinstance(b, str): - hi = cast(float, hi) + 0.5 - ax.set(**{f"{axis}lim": (lo, hi)}) - - if axis_key in self._scales: # TODO when would it not be? - self._scales[axis_key]._finalize(p, axis_obj) - - if (engine_name := p._layout_spec.get("engine", default)) is not default: - # None is a valid arg for Figure.set_layout_engine, hence `default` - set_layout_engine(self._figure, engine_name) - elif p._target is None: - # Don't modify the layout engine if the user supplied their own - # matplotlib figure and didn't specify an engine through Plot - # TODO switch default to "constrained"? - # TODO either way, make configurable - set_layout_engine(self._figure, "tight") - - if (extent := p._layout_spec.get("extent")) is not None: - engine = get_layout_engine(self._figure) - if engine is None: - self._figure.subplots_adjust(*extent) - else: - # Note the different parameterization for the layout engine rect... - left, bottom, right, top = extent - width, height = right - left, top - bottom - try: - # The base LayoutEngine.set method doesn't have rect= so we need - # to avoid typechecking this statement. We also catch a TypeError - # as a plugin LayoutEngine may not support it either. - # Alternatively we could guard this with a check on the engine type, - # but that would make later-developed engines would un-useable. - engine.set(rect=[left, bottom, width, height]) # type: ignore - except TypeError: - # Should we warn / raise? Note that we don't expect to get here - # under any normal circumstances. - pass diff --git a/seaborn/_core/properties.py b/seaborn/_core/properties.py deleted file mode 100644 index 4e2df91b49faf267b6af020a4ef4078ea3a28566..0000000000000000000000000000000000000000 --- a/seaborn/_core/properties.py +++ /dev/null @@ -1,834 +0,0 @@ -from __future__ import annotations -import itertools -import warnings - -import numpy as np -from numpy.typing import ArrayLike -from pandas import Series -import matplotlib as mpl -from matplotlib.colors import to_rgb, to_rgba, to_rgba_array -from matplotlib.markers import MarkerStyle -from matplotlib.path import Path - -from seaborn._core.scales import Scale, Boolean, Continuous, Nominal, Temporal -from seaborn._core.rules import categorical_order, variable_type -from seaborn.palettes import QUAL_PALETTES, color_palette, blend_palette -from seaborn.utils import get_color_cycle - -from typing import Any, Callable, Tuple, List, Union, Optional - -RGBTuple = Tuple[float, float, float] -RGBATuple = Tuple[float, float, float, float] -ColorSpec = Union[RGBTuple, RGBATuple, str] - -DashPattern = Tuple[float, ...] 
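-# Hedged examples of these aliases: a DashPattern such as (4, 1.5) means
-# "4pt on, 1.5pt off"; DashPatternWithOffset (below) pairs a starting offset
-# with a pattern, e.g. (0, (4, 1.5)) for dashed or (0, None) for solid
-# (see LineStyle._get_dash_pattern later in this file).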
-DashPatternWithOffset = Tuple[float, Optional[DashPattern]] - -MarkerPattern = Union[ - float, - str, - Tuple[int, int, float], - List[Tuple[float, float]], - Path, - MarkerStyle, -] - -Mapping = Callable[[ArrayLike], ArrayLike] - - -# =================================================================================== # -# Base classes -# =================================================================================== # - - -class Property: - """Base class for visual properties that can be set directly or be data scaling.""" - - # When True, scales for this property will populate the legend by default - legend = False - - # When True, scales for this property normalize data to [0, 1] before mapping - normed = False - - def __init__(self, variable: str | None = None): - """Initialize the property with the name of the corresponding plot variable.""" - if not variable: - variable = self.__class__.__name__.lower() - self.variable = variable - - def default_scale(self, data: Series) -> Scale: - """Given data, initialize appropriate scale class.""" - - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - if var_type == "numeric": - return Continuous() - elif var_type == "datetime": - return Temporal() - elif var_type == "boolean": - return Boolean() - else: - return Nominal() - - def infer_scale(self, arg: Any, data: Series) -> Scale: - """Given data and a scaling argument, initialize appropriate scale class.""" - # TODO put these somewhere external for validation - # TODO putting this here won't pick it up if subclasses define infer_scale - # (e.g. color). How best to handle that? One option is to call super after - # handling property-specific possibilities (e.g. for color check that the - # arg is not a valid palette name) but that could get tricky. - trans_args = ["log", "symlog", "logit", "pow", "sqrt"] - if isinstance(arg, str): - if any(arg.startswith(k) for k in trans_args): - # TODO validate numeric type? That should happen centrally somewhere - return Continuous(trans=arg) - else: - msg = f"Unknown magic arg for {self.variable} scale: '{arg}'." - raise ValueError(msg) - else: - arg_type = type(arg).__name__ - msg = f"Magic arg for {self.variable} scale must be str, not {arg_type}." - raise TypeError(msg) - - def get_mapping(self, scale: Scale, data: Series) -> Mapping: - """Return a function that maps from data domain to property range.""" - def identity(x): - return x - return identity - - def standardize(self, val: Any) -> Any: - """Coerce flexible property value to standardized representation.""" - return val - - def _check_dict_entries(self, levels: list, values: dict) -> None: - """Input check when values are provided as a dictionary.""" - missing = set(levels) - set(values) - if missing: - formatted = ", ".join(map(repr, sorted(missing, key=str))) - err = f"No entry in {self.variable} dictionary for {formatted}" - raise ValueError(err) - - def _check_list_length(self, levels: list, values: list) -> list: - """Input check when values are provided as a list.""" - message = "" - if len(levels) > len(values): - message = " ".join([ - f"\nThe {self.variable} list has fewer values ({len(values)})", - f"than needed ({len(levels)}) and will cycle, which may", - "produce an uninterpretable plot." 
- ]) - values = [x for _, x in zip(levels, itertools.cycle(values))] - - elif len(values) > len(levels): - message = " ".join([ - f"The {self.variable} list has more values ({len(values)})", - f"than needed ({len(levels)}), which may not be intended.", - ]) - values = values[:len(levels)] - - # TODO look into custom PlotSpecWarning with better formatting - if message: - warnings.warn(message, UserWarning) - - return values - - -# =================================================================================== # -# Properties relating to spatial position of marks on the plotting axes -# =================================================================================== # - - -class Coordinate(Property): - """The position of visual marks with respect to the axes of the plot.""" - legend = False - normed = False - - -# =================================================================================== # -# Properties with numeric values where scale range can be defined as an interval -# =================================================================================== # - - -class IntervalProperty(Property): - """A numeric property where scale range can be defined as an interval.""" - legend = True - normed = True - - _default_range: tuple[float, float] = (0, 1) - - @property - def default_range(self) -> tuple[float, float]: - """Min and max values used by default for semantic mapping.""" - return self._default_range - - def _forward(self, values: ArrayLike) -> ArrayLike: - """Transform applied to native values before linear mapping into interval.""" - return values - - def _inverse(self, values: ArrayLike) -> ArrayLike: - """Transform applied to results of mapping that returns to native values.""" - return values - - def infer_scale(self, arg: Any, data: Series) -> Scale: - """Given data and a scaling argument, initialize appropriate scale class.""" - - # TODO infer continuous based on log/sqrt etc? 
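-        # Hedged examples of the dispatch below (argument + data -> scale):
-        #   boolean data, any arg         -> Boolean(arg)
-        #   arg [1, 2, 4] or {"a": 1}     -> Nominal(arg): one value per level
-        #   numeric data, arg (1, 10)     -> Continuous((1, 10)) as output range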
- - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - - if var_type == "boolean": - return Boolean(arg) - elif isinstance(arg, (list, dict)): - return Nominal(arg) - elif var_type == "categorical": - return Nominal(arg) - elif var_type == "datetime": - return Temporal(arg) - # TODO other variable types - else: - return Continuous(arg) - - def get_mapping(self, scale: Scale, data: Series) -> Mapping: - """Return a function that maps from data domain to property range.""" - if isinstance(scale, Nominal): - return self._get_nominal_mapping(scale, data) - elif isinstance(scale, Boolean): - return self._get_boolean_mapping(scale, data) - - if scale.values is None: - vmin, vmax = self._forward(self.default_range) - elif isinstance(scale.values, tuple) and len(scale.values) == 2: - vmin, vmax = self._forward(scale.values) - else: - if isinstance(scale.values, tuple): - actual = f"{len(scale.values)}-tuple" - else: - actual = str(type(scale.values)) - scale_class = scale.__class__.__name__ - err = " ".join([ - f"Values for {self.variable} variables with {scale_class} scale", - f"must be 2-tuple; not {actual}.", - ]) - raise TypeError(err) - - def mapping(x): - return self._inverse(np.multiply(x, vmax - vmin) + vmin) - - return mapping - - def _get_nominal_mapping(self, scale: Nominal, data: Series) -> Mapping: - """Identify evenly-spaced values using interval or explicit mapping.""" - levels = categorical_order(data, scale.order) - values = self._get_values(scale, levels) - - def mapping(x): - ixs = np.asarray(x, np.intp) - out = np.full(len(x), np.nan) - use = np.isfinite(x) - out[use] = np.take(values, ixs[use]) - return out - - return mapping - - def _get_boolean_mapping(self, scale: Boolean, data: Series) -> Mapping: - """Identify evenly-spaced values using interval or explicit mapping.""" - values = self._get_values(scale, [True, False]) - - def mapping(x): - out = np.full(len(x), np.nan) - use = np.isfinite(x) - out[use] = np.where(x[use], *values) - return out - - return mapping - - def _get_values(self, scale: Scale, levels: list) -> list: - """Validate scale.values and identify a value for each level.""" - if isinstance(scale.values, dict): - self._check_dict_entries(levels, scale.values) - values = [scale.values[x] for x in levels] - elif isinstance(scale.values, list): - values = self._check_list_length(levels, scale.values) - else: - if scale.values is None: - vmin, vmax = self.default_range - elif isinstance(scale.values, tuple): - vmin, vmax = scale.values - else: - scale_class = scale.__class__.__name__ - err = " ".join([ - f"Values for {self.variable} variables with {scale_class} scale", - f"must be a dict, list or tuple; not {type(scale.values)}", - ]) - raise TypeError(err) - - vmin, vmax = self._forward([vmin, vmax]) - values = list(self._inverse(np.linspace(vmax, vmin, len(levels)))) - - return values - - -class PointSize(IntervalProperty): - """Size (diameter) of a point mark, in points, with scaling by area.""" - _default_range = 2, 8 # TODO use rcparams? 
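-    # A worked example of the areal scaling below: default diameters (2, 8)
-    # are squared to areas (4, 64); the midpoint of the data range then maps
-    # to area (4 + 64) / 2 = 34, i.e. diameter sqrt(34) ~= 5.83 rather than
-    # 5, so perceived point *area* grows linearly with the data.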
- - def _forward(self, values): - """Square native values to implement linear scaling of point area.""" - return np.square(values) - - def _inverse(self, values): - """Invert areal values back to point diameter.""" - return np.sqrt(values) - - -class LineWidth(IntervalProperty): - """Thickness of a line mark, in points.""" - @property - def default_range(self) -> tuple[float, float]: - """Min and max values used by default for semantic mapping.""" - base = mpl.rcParams["lines.linewidth"] - return base * .5, base * 2 - - -class EdgeWidth(IntervalProperty): - """Thickness of the edges on a patch mark, in points.""" - @property - def default_range(self) -> tuple[float, float]: - """Min and max values used by default for semantic mapping.""" - base = mpl.rcParams["patch.linewidth"] - return base * .5, base * 2 - - -class Stroke(IntervalProperty): - """Thickness of lines that define point glyphs.""" - _default_range = .25, 2.5 - - -class Alpha(IntervalProperty): - """Opacity of the color values for an arbitrary mark.""" - _default_range = .3, .95 - # TODO validate / enforce that output is in [0, 1] - - -class Offset(IntervalProperty): - """Offset for edge-aligned text, in point units.""" - _default_range = 0, 5 - _legend = False - - -class FontSize(IntervalProperty): - """Font size for textual marks, in points.""" - _legend = False - - @property - def default_range(self) -> tuple[float, float]: - """Min and max values used by default for semantic mapping.""" - base = mpl.rcParams["font.size"] - return base * .5, base * 2 - - -# =================================================================================== # -# Properties defined by arbitrary objects with inherently nominal scaling -# =================================================================================== # - - -class ObjectProperty(Property): - """A property defined by arbitrary an object, with inherently nominal scaling.""" - legend = True - normed = False - - # Object representing null data, should appear invisible when drawn by matplotlib - # Note that we now drop nulls in Plot._plot_layer and thus may not need this - null_value: Any = None - - def _default_values(self, n: int) -> list: - raise NotImplementedError() - - def default_scale(self, data: Series) -> Scale: - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - return Boolean() if var_type == "boolean" else Nominal() - - def infer_scale(self, arg: Any, data: Series) -> Scale: - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - return Boolean(arg) if var_type == "boolean" else Nominal(arg) - - def get_mapping(self, scale: Scale, data: Series) -> Mapping: - """Define mapping as lookup into list of object values.""" - boolean_scale = isinstance(scale, Boolean) - order = getattr(scale, "order", [True, False] if boolean_scale else None) - levels = categorical_order(data, order) - values = self._get_values(scale, levels) - - if boolean_scale: - values = values[::-1] - - def mapping(x): - ixs = np.asarray(np.nan_to_num(x), np.intp) - return [ - values[ix] if np.isfinite(x_i) else self.null_value - for x_i, ix in zip(x, ixs) - ] - - return mapping - - def _get_values(self, scale: Scale, levels: list) -> list: - """Validate scale.values and identify a value for each level.""" - n = len(levels) - if isinstance(scale.values, dict): - self._check_dict_entries(levels, scale.values) - values = [scale.values[x] for x in levels] - elif isinstance(scale.values, list): - values = self._check_list_length(levels, scale.values) - 
elif scale.values is None: - values = self._default_values(n) - else: - msg = " ".join([ - f"Scale values for a {self.variable} variable must be provided", - f"in a dict or list; not {type(scale.values)}." - ]) - raise TypeError(msg) - - values = [self.standardize(x) for x in values] - return values - - -class Marker(ObjectProperty): - """Shape of points in scatter-type marks or lines with data points marked.""" - null_value = MarkerStyle("") - - # TODO should we have named marker "palettes"? (e.g. see d3 options) - - # TODO need some sort of "require_scale" functionality - # to raise when we get the wrong kind explicitly specified - - def standardize(self, val: MarkerPattern) -> MarkerStyle: - return MarkerStyle(val) - - def _default_values(self, n: int) -> list[MarkerStyle]: - """Build an arbitrarily long list of unique marker styles. - - Parameters - ---------- - n : int - Number of unique marker specs to generate. - - Returns - ------- - markers : list of string or tuples - Values for defining :class:`matplotlib.markers.MarkerStyle` objects. - All markers will be filled. - - """ - # Start with marker specs that are well distinguishable - markers = [ - "o", "X", (4, 0, 45), "P", (4, 0, 0), (4, 1, 0), "^", (4, 1, 45), "v", - ] - - # Now generate more from regular polygons of increasing order - s = 5 - while len(markers) < n: - a = 360 / (s + 1) / 2 - markers.extend([(s + 1, 1, a), (s + 1, 0, a), (s, 1, 0), (s, 0, 0)]) - s += 1 - - markers = [MarkerStyle(m) for m in markers[:n]] - - return markers - - -class LineStyle(ObjectProperty): - """Dash pattern for line-type marks.""" - null_value = "" - - def standardize(self, val: str | DashPattern) -> DashPatternWithOffset: - return self._get_dash_pattern(val) - - def _default_values(self, n: int) -> list[DashPatternWithOffset]: - """Build an arbitrarily long list of unique dash styles for lines. - - Parameters - ---------- - n : int - Number of unique dash specs to generate. - - Returns - ------- - dashes : list of strings or tuples - Valid arguments for the ``dashes`` parameter on - :class:`matplotlib.lines.Line2D`. The first spec is a solid - line (``""``), the remainder are sequences of long and short - dashes. 
- - """ - # Start with dash specs that are well distinguishable - dashes: list[str | DashPattern] = [ - "-", (4, 1.5), (1, 1), (3, 1.25, 1.5, 1.25), (5, 1, 1, 1), - ] - - # Now programmatically build as many as we need - p = 3 - while len(dashes) < n: - - # Take combinations of long and short dashes - a = itertools.combinations_with_replacement([3, 1.25], p) - b = itertools.combinations_with_replacement([4, 1], p) - - # Interleave the combinations, reversing one of the streams - segment_list = itertools.chain(*zip(list(a)[1:-1][::-1], list(b)[1:-1])) - - # Now insert the gaps - for segments in segment_list: - gap = min(segments) - spec = tuple(itertools.chain(*((seg, gap) for seg in segments))) - dashes.append(spec) - - p += 1 - - return [self._get_dash_pattern(x) for x in dashes] - - @staticmethod - def _get_dash_pattern(style: str | DashPattern) -> DashPatternWithOffset: - """Convert linestyle arguments to dash pattern with offset.""" - # Copied and modified from Matplotlib 3.4 - # go from short hand -> full strings - ls_mapper = {"-": "solid", "--": "dashed", "-.": "dashdot", ":": "dotted"} - if isinstance(style, str): - style = ls_mapper.get(style, style) - # un-dashed styles - if style in ["solid", "none", "None"]: - offset = 0 - dashes = None - # dashed styles - elif style in ["dashed", "dashdot", "dotted"]: - offset = 0 - dashes = tuple(mpl.rcParams[f"lines.{style}_pattern"]) - else: - options = [*ls_mapper.values(), *ls_mapper.keys()] - msg = f"Linestyle string must be one of {options}, not {repr(style)}." - raise ValueError(msg) - - elif isinstance(style, tuple): - if len(style) > 1 and isinstance(style[1], tuple): - offset, dashes = style - elif len(style) > 1 and style[1] is None: - offset, dashes = style - else: - offset = 0 - dashes = style - else: - val_type = type(style).__name__ - msg = f"Linestyle must be str or tuple, not {val_type}." 
- raise TypeError(msg) - - # Normalize offset to be positive and shorter than the dash cycle - if dashes is not None: - try: - dsum = sum(dashes) - except TypeError as err: - msg = f"Invalid dash pattern: {dashes}" - raise TypeError(msg) from err - if dsum: - offset %= dsum - - return offset, dashes - - -class TextAlignment(ObjectProperty): - legend = False - - -class HorizontalAlignment(TextAlignment): - - def _default_values(self, n: int) -> list: - vals = itertools.cycle(["left", "right"]) - return [next(vals) for _ in range(n)] - - -class VerticalAlignment(TextAlignment): - - def _default_values(self, n: int) -> list: - vals = itertools.cycle(["top", "bottom"]) - return [next(vals) for _ in range(n)] - - -# =================================================================================== # -# Properties with RGB(A) color values -# =================================================================================== # - - -class Color(Property): - """Color, as RGB(A), scalable with nominal palettes or continuous gradients.""" - legend = True - normed = True - - def standardize(self, val: ColorSpec) -> RGBTuple | RGBATuple: - # Return color with alpha channel only if the input spec has it - # This is so that RGBA colors can override the Alpha property - if to_rgba(val) != to_rgba(val, 1): - return to_rgba(val) - else: - return to_rgb(val) - - def _standardize_color_sequence(self, colors: ArrayLike) -> ArrayLike: - """Convert color sequence to RGB(A) array, preserving but not adding alpha.""" - def has_alpha(x): - return to_rgba(x) != to_rgba(x, 1) - - if isinstance(colors, np.ndarray): - needs_alpha = colors.shape[1] == 4 - else: - needs_alpha = any(has_alpha(x) for x in colors) - - if needs_alpha: - return to_rgba_array(colors) - else: - return to_rgba_array(colors)[:, :3] - - def infer_scale(self, arg: Any, data: Series) -> Scale: - # TODO when inferring Continuous without data, verify type - - # TODO need to rethink the variable type system - # (e.g. boolean, ordered categories as Ordinal, etc).. - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - - if var_type == "boolean": - return Boolean(arg) - - if isinstance(arg, (dict, list)): - return Nominal(arg) - - if isinstance(arg, tuple): - if var_type == "categorical": - # TODO It seems reasonable to allow a gradient mapping for nominal - # scale but it also feels "technically" wrong. Should this infer - # Ordinal with categorical data and, if so, verify orderedness? - return Nominal(arg) - return Continuous(arg) - - if callable(arg): - return Continuous(arg) - - # TODO Do we accept str like "log", "pow", etc. for semantics? - - if not isinstance(arg, str): - msg = " ".join([ - f"A single scale argument for {self.variable} variables must be", - f"a string, dict, tuple, list, or callable, not {type(arg)}." - ]) - raise TypeError(msg) - - if arg in QUAL_PALETTES: - return Nominal(arg) - elif var_type == "numeric": - return Continuous(arg) - # TODO implement scales for date variables and any others. - else: - return Nominal(arg) - - def get_mapping(self, scale: Scale, data: Series) -> Mapping: - """Return a function that maps from data domain to color values.""" - # TODO what is best way to do this conditional? - # Should it be class-based or should classes have behavioral attributes? 
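# Quick check of the standardize rule above: a color spec keeps its alpha
# channel only if it actually carries one, so RGBA inputs can override the
# Alpha property while plain colors stay three-channel.
from matplotlib.colors import to_rgb, to_rgba

def standardize(val):
    # Same comparison Color.standardize uses
    return to_rgba(val) if to_rgba(val) != to_rgba(val, 1) else to_rgb(val)

print(standardize("red"))        # (1.0, 0.0, 0.0)
print(standardize("#ff000080"))  # (1.0, 0.0, 0.0, 0.502...)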
- if isinstance(scale, Nominal): - return self._get_nominal_mapping(scale, data) - elif isinstance(scale, Boolean): - return self._get_boolean_mapping(scale, data) - - if scale.values is None: - # TODO Rethink best default continuous color gradient - mapping = color_palette("ch:", as_cmap=True) - elif isinstance(scale.values, tuple): - # TODO blend_palette will strip alpha, but we should support - # interpolation on all four channels - mapping = blend_palette(scale.values, as_cmap=True) - elif isinstance(scale.values, str): - # TODO for matplotlib colormaps this will clip extremes, which is - # different from what using the named colormap directly would do - # This may or may not be desireable. - mapping = color_palette(scale.values, as_cmap=True) - elif callable(scale.values): - mapping = scale.values - else: - scale_class = scale.__class__.__name__ - msg = " ".join([ - f"Scale values for {self.variable} with a {scale_class} mapping", - f"must be string, tuple, or callable; not {type(scale.values)}." - ]) - raise TypeError(msg) - - def _mapping(x): - # Remove alpha channel so it does not override alpha property downstream - # TODO this will need to be more flexible to support RGBA tuples (see above) - invalid = ~np.isfinite(x) - out = mapping(x)[:, :3] - out[invalid] = np.nan - return out - - return _mapping - - def _get_nominal_mapping(self, scale: Nominal, data: Series) -> Mapping: - - levels = categorical_order(data, scale.order) - colors = self._get_values(scale, levels) - - def mapping(x): - ixs = np.asarray(np.nan_to_num(x), np.intp) - use = np.isfinite(x) - out = np.full((len(ixs), colors.shape[1]), np.nan) - out[use] = np.take(colors, ixs[use], axis=0) - return out - - return mapping - - def _get_boolean_mapping(self, scale: Boolean, data: Series) -> Mapping: - - colors = self._get_values(scale, [True, False]) - - def mapping(x): - - use = np.isfinite(x) - x = np.asarray(np.nan_to_num(x)).astype(bool) - out = np.full((len(x), colors.shape[1]), np.nan) - out[x & use] = colors[0] - out[~x & use] = colors[1] - return out - - return mapping - - def _get_values(self, scale: Scale, levels: list) -> ArrayLike: - """Validate scale.values and identify a value for each level.""" - n = len(levels) - values = scale.values - if isinstance(values, dict): - self._check_dict_entries(levels, values) - colors = [values[x] for x in levels] - elif isinstance(values, list): - colors = self._check_list_length(levels, values) - elif isinstance(values, tuple): - colors = blend_palette(values, n) - elif isinstance(values, str): - colors = color_palette(values, n) - elif values is None: - if n <= len(get_color_cycle()): - # Use current (global) default palette - colors = color_palette(n_colors=n) - else: - colors = color_palette("husl", n) - else: - scale_class = scale.__class__.__name__ - msg = " ".join([ - f"Scale values for {self.variable} with a {scale_class} mapping", - f"must be string, list, tuple, or dict; not {type(scale.values)}." 
- ]) - raise TypeError(msg) - - return self._standardize_color_sequence(colors) - - -# =================================================================================== # -# Properties that can take only two states -# =================================================================================== # - - -class Fill(Property): - """Boolean property of points/bars/patches that can be solid or outlined.""" - legend = True - normed = False - - def default_scale(self, data: Series) -> Scale: - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - return Boolean() if var_type == "boolean" else Nominal() - - def infer_scale(self, arg: Any, data: Series) -> Scale: - var_type = variable_type(data, boolean_type="boolean", strict_boolean=True) - return Boolean(arg) if var_type == "boolean" else Nominal(arg) - - def standardize(self, val: Any) -> bool: - return bool(val) - - def _default_values(self, n: int) -> list: - """Return a list of n values, alternating True and False.""" - if n > 2: - msg = " ".join([ - f"The variable assigned to {self.variable} has more than two levels,", - f"so {self.variable} values will cycle and may be uninterpretable", - ]) - # TODO fire in a "nice" way (see above) - warnings.warn(msg, UserWarning) - return [x for x, _ in zip(itertools.cycle([True, False]), range(n))] - - def get_mapping(self, scale: Scale, data: Series) -> Mapping: - """Return a function that maps each data value to True or False.""" - boolean_scale = isinstance(scale, Boolean) - order = getattr(scale, "order", [True, False] if boolean_scale else None) - levels = categorical_order(data, order) - values = self._get_values(scale, levels) - - if boolean_scale: - values = values[::-1] - - def mapping(x): - ixs = np.asarray(np.nan_to_num(x), np.intp) - return [ - values[ix] if np.isfinite(x_i) else False - for x_i, ix in zip(x, ixs) - ] - - return mapping - - def _get_values(self, scale: Scale, levels: list) -> list: - """Validate scale.values and identify a value for each level.""" - if isinstance(scale.values, list): - values = [bool(x) for x in scale.values] - elif isinstance(scale.values, dict): - values = [bool(scale.values[x]) for x in levels] - elif scale.values is None: - values = self._default_values(len(levels)) - else: - msg = " ".join([ - f"Scale values for {self.variable} must be passed in", - f"a list or dict; not {type(scale.values)}." - ]) - raise TypeError(msg) - - return values - - -# =================================================================================== # -# Enumeration of properties for use by Plot and Mark classes -# =================================================================================== # -# TODO turn this into a property registry with hooks, etc. -# TODO Users do not interact directly with properties, so how to document them? - - -PROPERTY_CLASSES = { - "x": Coordinate, - "y": Coordinate, - "color": Color, - "alpha": Alpha, - "fill": Fill, - "marker": Marker, - "pointsize": PointSize, - "stroke": Stroke, - "linewidth": LineWidth, - "linestyle": LineStyle, - "fillcolor": Color, - "fillalpha": Alpha, - "edgewidth": EdgeWidth, - "edgestyle": LineStyle, - "edgecolor": Color, - "edgealpha": Alpha, - "text": Property, - "halign": HorizontalAlignment, - "valign": VerticalAlignment, - "offset": Offset, - "fontsize": FontSize, - "xmin": Coordinate, - "xmax": Coordinate, - "ymin": Coordinate, - "ymax": Coordinate, - "group": Property, - # TODO pattern? - # TODO gradient? 
-} - -PROPERTIES = {var: cls(var) for var, cls in PROPERTY_CLASSES.items()} diff --git a/seaborn/_core/rules.py b/seaborn/_core/rules.py deleted file mode 100644 index de6c651d97b6657bfb1bee2de370376958c750ae..0000000000000000000000000000000000000000 --- a/seaborn/_core/rules.py +++ /dev/null @@ -1,173 +0,0 @@ -from __future__ import annotations - -import warnings -from collections import UserString -from numbers import Number -from datetime import datetime - -import numpy as np -import pandas as pd - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from typing import Literal - from pandas import Series - - -class VarType(UserString): - """ - Prevent comparisons elsewhere in the library from using the wrong name. - - Errors are simple assertions because users should not be able to trigger - them. If that changes, they should be more verbose. - - """ - # TODO VarType is an awfully overloaded name, but so is DataType ... - # TODO adding unknown because we are using this in for scales, is that right? - allowed = "numeric", "datetime", "categorical", "boolean", "unknown" - - def __init__(self, data): - assert data in self.allowed, data - super().__init__(data) - - def __eq__(self, other): - assert other in self.allowed, other - return self.data == other - - -def variable_type( - vector: Series, - boolean_type: Literal["numeric", "categorical", "boolean"] = "numeric", - strict_boolean: bool = False, -) -> VarType: - """ - Determine whether a vector contains numeric, categorical, or datetime data. - - This function differs from the pandas typing API in a few ways: - - - Python sequences or object-typed PyData objects are considered numeric if - all of their entries are numeric. - - String or mixed-type data are considered categorical even if not - explicitly represented as a :class:`pandas.api.types.CategoricalDtype`. - - There is some flexibility about how to treat binary / boolean data. - - Parameters - ---------- - vector : :func:`pandas.Series`, :func:`numpy.ndarray`, or Python sequence - Input data to test. - boolean_type : 'numeric', 'categorical', or 'boolean' - Type to use for vectors containing only 0s and 1s (and NAs). - strict_boolean : bool - If True, only consider data to be boolean when the dtype is bool or Boolean. - - Returns - ------- - var_type : 'numeric', 'categorical', or 'datetime' - Name identifying the type of data in the vector. - """ - - # If a categorical dtype is set, infer categorical - if isinstance(getattr(vector, 'dtype', None), pd.CategoricalDtype): - return VarType("categorical") - - # Special-case all-na data, which is always "numeric" - if pd.isna(vector).all(): - return VarType("numeric") - - # Now drop nulls to simplify further type inference - vector = vector.dropna() - - # Special-case binary/boolean data, allow caller to determine - # This triggers a numpy warning when vector has strings/objects - # https://github.com/numpy/numpy/issues/6784 - # Because we reduce with .all(), we are agnostic about whether the - # comparison returns a scalar or vector, so we will ignore the warning. - # It triggers a separate DeprecationWarning when the vector has datetimes: - # https://github.com/numpy/numpy/issues/13548 - # This is considered a bug by numpy and will likely go away. - with warnings.catch_warnings(): - warnings.simplefilter( - action='ignore', - category=(FutureWarning, DeprecationWarning) # type: ignore # mypy bug? 
- ) - if strict_boolean: - if isinstance(vector.dtype, pd.core.dtypes.base.ExtensionDtype): - boolean_dtypes = ["bool", "boolean"] - else: - boolean_dtypes = ["bool"] - boolean_vector = vector.dtype in boolean_dtypes - else: - try: - boolean_vector = bool(np.isin(vector, [0, 1]).all()) - except TypeError: - # .isin comparison is not guaranteed to be possible under NumPy - # casting rules, depending on the (unknown) dtype of 'vector' - boolean_vector = False - if boolean_vector: - return VarType(boolean_type) - - # Defer to positive pandas tests - if pd.api.types.is_numeric_dtype(vector): - return VarType("numeric") - - if pd.api.types.is_datetime64_dtype(vector): - return VarType("datetime") - - # --- If we get to here, we need to check the entries - - # Check for a collection where everything is a number - - def all_numeric(x): - for x_i in x: - if not isinstance(x_i, Number): - return False - return True - - if all_numeric(vector): - return VarType("numeric") - - # Check for a collection where everything is a datetime - - def all_datetime(x): - for x_i in x: - if not isinstance(x_i, (datetime, np.datetime64)): - return False - return True - - if all_datetime(vector): - return VarType("datetime") - - # Otherwise, our final fallback is to consider things categorical - - return VarType("categorical") - - -def categorical_order(vector: Series, order: list | None = None) -> list: - """ - Return a list of unique data values using seaborn's ordering rules. - - Parameters - ---------- - vector : Series - Vector of "categorical" values - order : list - Desired order of category levels to override the order determined - from the `data` object. - - Returns - ------- - order : list - Ordered list of category levels not including null values. - - """ - if order is not None: - return order - - if vector.dtype.name == "category": - order = list(vector.cat.categories) - else: - order = list(filter(pd.notnull, vector.unique())) - if variable_type(pd.Series(order)) == "numeric": - order.sort() - - return order diff --git a/seaborn/_core/scales.py b/seaborn/_core/scales.py deleted file mode 100644 index 1e7bef8a5d29934f5afa18b1a474b103f605e611..0000000000000000000000000000000000000000 --- a/seaborn/_core/scales.py +++ /dev/null @@ -1,1090 +0,0 @@ -from __future__ import annotations -import re -from copy import copy -from collections.abc import Sequence -from dataclasses import dataclass -from functools import partial -from typing import Any, Callable, Tuple, Optional, ClassVar - -import numpy as np -import matplotlib as mpl -from matplotlib.ticker import ( - Locator, - Formatter, - AutoLocator, - AutoMinorLocator, - FixedLocator, - LinearLocator, - LogLocator, - SymmetricalLogLocator, - MaxNLocator, - MultipleLocator, - EngFormatter, - FuncFormatter, - LogFormatterSciNotation, - ScalarFormatter, - StrMethodFormatter, -) -from matplotlib.dates import ( - AutoDateLocator, - AutoDateFormatter, - ConciseDateFormatter, -) -from matplotlib.axis import Axis -from matplotlib.scale import ScaleBase -from pandas import Series - -from seaborn._core.rules import categorical_order -from seaborn._core.typing import Default, default - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from seaborn._core.plot import Plot - from seaborn._core.properties import Property - from numpy.typing import ArrayLike, NDArray - - TransFuncs = Tuple[ - Callable[[ArrayLike], ArrayLike], Callable[[ArrayLike], ArrayLike] - ] - - # TODO Reverting typing to Any as it was proving too complicated to - # work out the right way to 
communicate the types to mypy. Revisit! - Pipeline = Sequence[Optional[Callable[[Any], Any]]] - - -class Scale: - """Base class for objects that map data values to visual properties.""" - - values: tuple | str | list | dict | None - - _priority: ClassVar[int] - _pipeline: Pipeline - _matplotlib_scale: ScaleBase - _spacer: staticmethod - _legend: tuple[list[Any], list[str]] | None - - def __post_init__(self): - - self._tick_params = None - self._label_params = None - self._legend = None - - def tick(self): - raise NotImplementedError() - - def label(self): - raise NotImplementedError() - - def _get_locators(self): - raise NotImplementedError() - - def _get_formatter(self, locator: Locator | None = None): - raise NotImplementedError() - - def _get_scale(self, name: str, forward: Callable, inverse: Callable): - - major_locator, minor_locator = self._get_locators(**self._tick_params) - major_formatter = self._get_formatter(major_locator, **self._label_params) - - class InternalScale(mpl.scale.FuncScale): - def set_default_locators_and_formatters(self, axis): - axis.set_major_locator(major_locator) - if minor_locator is not None: - axis.set_minor_locator(minor_locator) - axis.set_major_formatter(major_formatter) - - return InternalScale(name, (forward, inverse)) - - def _spacing(self, x: Series) -> float: - space = self._spacer(x) - if np.isnan(space): - # This happens when there is no variance in the orient coordinate data - # Not exactly clear what the right default is, but 1 seems reasonable? - return 1 - return space - - def _setup( - self, data: Series, prop: Property, axis: Axis | None = None, - ) -> Scale: - raise NotImplementedError() - - def _finalize(self, p: Plot, axis: Axis) -> None: - """Perform scale-specific axis tweaks after adding artists.""" - pass - - def __call__(self, data: Series) -> ArrayLike: - - trans_data: Series | NDArray | list - - # TODO sometimes we need to handle scalars (e.g. for Line) - # but what is the best way to do that? - scalar_data = np.isscalar(data) - if scalar_data: - trans_data = np.array([data]) - else: - trans_data = data - - for func in self._pipeline: - if func is not None: - trans_data = func(trans_data) - - if scalar_data: - return trans_data[0] - else: - return trans_data - - @staticmethod - def _identity(): - - class Identity(Scale): - _pipeline = [] - _spacer = None - _legend = None - _matplotlib_scale = None - - return Identity() - - -@dataclass -class Boolean(Scale): - """ - A scale with a discrete domain of True and False values. - - The behavior is similar to the :class:`Nominal` scale, but property - mappings and legends will use a [True, False] ordering rather than - a sort using numeric rules. Coordinate variables accomplish this by - inverting axis limits so as to maintain underlying numeric positioning. - Input data are cast to boolean values, respecting missing data. 
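# The __call__ pipeline above is plain sequential application with None
# entries skipped; scalars are wrapped and unwrapped around it. A standalone
# sketch with a hypothetical two-step pipeline:
import numpy as np

pipeline = [np.asarray, None, np.log10]
data = 100.0
trans = np.array([data]) if np.isscalar(data) else data
for func in pipeline:
    if func is not None:
        trans = func(trans)
print(trans[0])  # 2.0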
- - """ - values: tuple | list | dict | None = None - - _priority: ClassVar[int] = 3 - - def _setup( - self, data: Series, prop: Property, axis: Axis | None = None, - ) -> Scale: - - new = copy(self) - if new._tick_params is None: - new = new.tick() - if new._label_params is None: - new = new.label() - - def na_safe_cast(x): - # TODO this doesn't actually need to be a closure - if np.isscalar(x): - return float(bool(x)) - else: - if hasattr(x, "notna"): - # Handle pd.NA; np<>pd interop with NA is tricky - use = x.notna().to_numpy() - else: - use = np.isfinite(x) - out = np.full(len(x), np.nan, dtype=float) - out[use] = x[use].astype(bool).astype(float) - return out - - new._pipeline = [na_safe_cast, prop.get_mapping(new, data)] - new._spacer = _default_spacer - if prop.legend: - new._legend = [True, False], ["True", "False"] - - forward, inverse = _make_identity_transforms() - mpl_scale = new._get_scale(str(data.name), forward, inverse) - - axis = PseudoAxis(mpl_scale) if axis is None else axis - mpl_scale.set_default_locators_and_formatters(axis) - new._matplotlib_scale = mpl_scale - - return new - - def _finalize(self, p: Plot, axis: Axis) -> None: - - # We want values to appear in a True, False order but also want - # True/False to be drawn at 1/0 positions respectively to avoid nasty - # surprises if additional artists are added through the matplotlib API. - # We accomplish this using axis inversion akin to what we do in Nominal. - - ax = axis.axes - name = axis.axis_name - axis.grid(False, which="both") - if name not in p._limits: - nticks = len(axis.get_major_ticks()) - lo, hi = -.5, nticks - .5 - if name == "x": - lo, hi = hi, lo - set_lim = getattr(ax, f"set_{name}lim") - set_lim(lo, hi, auto=None) - - def tick(self, locator: Locator | None = None): - new = copy(self) - new._tick_params = {"locator": locator} - return new - - def label(self, formatter: Formatter | None = None): - new = copy(self) - new._label_params = {"formatter": formatter} - return new - - def _get_locators(self, locator): - if locator is not None: - return locator - return FixedLocator([0, 1]), None - - def _get_formatter(self, locator, formatter): - if formatter is not None: - return formatter - return FuncFormatter(lambda x, _: str(bool(x))) - - -@dataclass -class Nominal(Scale): - """ - A categorical scale without relative importance / magnitude. - """ - # Categorical (convert to strings), un-sortable - - values: tuple | str | list | dict | None = None - order: list | None = None - - _priority: ClassVar[int] = 4 - - def _setup( - self, data: Series, prop: Property, axis: Axis | None = None, - ) -> Scale: - - new = copy(self) - if new._tick_params is None: - new = new.tick() - if new._label_params is None: - new = new.label() - - # TODO flexibility over format() which isn't great for numbers / dates - stringify = np.vectorize(format, otypes=["object"]) - - units_seed = categorical_order(data, new.order) - - # TODO move to Nominal._get_scale? - # TODO this needs some more complicated rethinking about how to pass - # a unit dictionary down to these methods, along with how much we want - # to invest in their API. What is it useful for tick() to do here? - # (Ordinal may be different if we draw that contrast). - # Any customization we do to allow, e.g., label wrapping will probably - # require defining our own Formatter subclass. - # We could also potentially implement auto-wrapping in an Axis subclass - # (see Axis.draw ... it already is computing the bboxes). 
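# What na_safe_cast above does for nullable-boolean data: truthiness becomes
# 0/1 floats while missing entries stay NaN (a plain astype(float) would
# choke on pd.NA).
import numpy as np
import pandas as pd

x = pd.Series([True, False, None], dtype="boolean")
use = x.notna().to_numpy()
out = np.full(len(x), np.nan, dtype=float)
out[use] = x[use].astype(bool).astype(float)
print(out)  # [ 1.  0. nan]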
- # major_locator, minor_locator = new._get_locators(**new._tick_params) - # major_formatter = new._get_formatter(major_locator, **new._label_params) - - class CatScale(mpl.scale.LinearScale): - def set_default_locators_and_formatters(self, axis): - ... - # axis.set_major_locator(major_locator) - # if minor_locator is not None: - # axis.set_minor_locator(minor_locator) - # axis.set_major_formatter(major_formatter) - - mpl_scale = CatScale(data.name) - if axis is None: - axis = PseudoAxis(mpl_scale) - - # TODO Currently just used in non-Coordinate contexts, but should - # we use this to (A) set the padding we want for categorial plots - # and (B) allow the values parameter for a Coordinate to set xlim/ylim - axis.set_view_interval(0, len(units_seed) - 1) - - new._matplotlib_scale = mpl_scale - - # TODO array cast necessary to handle float/int mixture, which we need - # to solve in a more systematic way probably - # (i.e. if we have [1, 2.5], do we want [1.0, 2.5]? Unclear) - axis.update_units(stringify(np.array(units_seed))) - - # TODO define this more centrally - def convert_units(x): - # TODO only do this with explicit order? - # (But also category dtype?) - # TODO isin fails when units_seed mixes numbers and strings (numpy error?) - # but np.isin also does not seem any faster? (Maybe not broadcasting in C) - # keep = x.isin(units_seed) - keep = np.array([x_ in units_seed for x_ in x], bool) - out = np.full(len(x), np.nan) - out[keep] = axis.convert_units(stringify(x[keep])) - return out - - new._pipeline = [convert_units, prop.get_mapping(new, data)] - new._spacer = _default_spacer - - if prop.legend: - new._legend = units_seed, list(stringify(units_seed)) - - return new - - def _finalize(self, p: Plot, axis: Axis) -> None: - - ax = axis.axes - name = axis.axis_name - axis.grid(False, which="both") - if name not in p._limits: - nticks = len(axis.get_major_ticks()) - lo, hi = -.5, nticks - .5 - if name == "y": - lo, hi = hi, lo - set_lim = getattr(ax, f"set_{name}lim") - set_lim(lo, hi, auto=None) - - def tick(self, locator: Locator | None = None) -> Nominal: - """ - Configure the selection of ticks for the scale's axis or legend. - - .. note:: - This API is under construction and will be enhanced over time. - At the moment, it is probably not very useful. - - Parameters - ---------- - locator : :class:`matplotlib.ticker.Locator` subclass - Pre-configured matplotlib locator; other parameters will not be used. - - Returns - ------- - Copy of self with new tick configuration. - - """ - new = copy(self) - new._tick_params = {"locator": locator} - return new - - def label(self, formatter: Formatter | None = None) -> Nominal: - """ - Configure the selection of labels for the scale's axis or legend. - - .. note:: - This API is under construction and will be enhanced over time. - At the moment, it is probably not very useful. - - Parameters - ---------- - formatter : :class:`matplotlib.ticker.Formatter` subclass - Pre-configured matplotlib formatter; other parameters will not be used. - - Returns - ------- - scale - Copy of self with new tick configuration. 
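# Sketch of the categorical conversion above, with list.index standing in for
# the matplotlib category converter: known levels map to 0..n-1 positions and
# anything outside the level set becomes NaN.
import numpy as np

units_seed = ["a", "b", "c"]
x = np.array(["b", "z", "a"], dtype=object)
keep = np.array([x_ in units_seed for x_ in x], bool)
out = np.full(len(x), np.nan)
out[keep] = [float(units_seed.index(x_)) for x_ in x[keep]]
print(out)  # [ 1. nan  0.]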
- - """ - new = copy(self) - new._label_params = {"formatter": formatter} - return new - - def _get_locators(self, locator): - - if locator is not None: - return locator, None - - locator = mpl.category.StrCategoryLocator({}) - - return locator, None - - def _get_formatter(self, locator, formatter): - - if formatter is not None: - return formatter - - formatter = mpl.category.StrCategoryFormatter({}) - - return formatter - - -@dataclass -class Ordinal(Scale): - # Categorical (convert to strings), sortable, can skip ticklabels - ... - - -@dataclass -class Discrete(Scale): - # Numeric, integral, can skip ticks/ticklabels - ... - - -@dataclass -class ContinuousBase(Scale): - - values: tuple | str | None = None - norm: tuple | None = None - - def _setup( - self, data: Series, prop: Property, axis: Axis | None = None, - ) -> Scale: - - new = copy(self) - if new._tick_params is None: - new = new.tick() - if new._label_params is None: - new = new.label() - - forward, inverse = new._get_transform() - - mpl_scale = new._get_scale(str(data.name), forward, inverse) - - if axis is None: - axis = PseudoAxis(mpl_scale) - axis.update_units(data) - - mpl_scale.set_default_locators_and_formatters(axis) - new._matplotlib_scale = mpl_scale - - normalize: Optional[Callable[[ArrayLike], ArrayLike]] - if prop.normed: - if new.norm is None: - vmin, vmax = data.min(), data.max() - else: - vmin, vmax = new.norm - vmin, vmax = map(float, axis.convert_units((vmin, vmax))) - a = forward(vmin) - b = forward(vmax) - forward(vmin) - - def normalize(x): - return (x - a) / b - - else: - normalize = vmin = vmax = None - - new._pipeline = [ - axis.convert_units, - forward, - normalize, - prop.get_mapping(new, data) - ] - - def spacer(x): - x = x.dropna().unique() - if len(x) < 2: - return np.nan - return np.min(np.diff(np.sort(x))) - new._spacer = spacer - - # TODO How to allow disabling of legend for all uses of property? - # Could add a Scale parameter, or perhaps Scale.suppress()? - # Are there other useful parameters that would be in Scale.legend() - # besides allowing Scale.legend(False)? - if prop.legend: - axis.set_view_interval(vmin, vmax) - locs = axis.major.locator() - locs = locs[(vmin <= locs) & (locs <= vmax)] - # Avoid having an offset / scientific notation in a legend - # as we don't represent that anywhere so it ends up incorrect. - # This could become an option (e.g. Continuous.label(offset=True)) - # in which case we would need to figure out how to show it. 
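# The normalization set up above is affine in transformed space. For example,
# with a log10 transform and norm=(1, 1000), forward-transformed data land on
# [0, 1] like this (the values are illustrative):
import numpy as np

forward = np.log10
vmin, vmax = 1.0, 1000.0
a = forward(vmin)
b = forward(vmax) - forward(vmin)
x = forward(np.array([1.0, 10.0, 1000.0]))  # the pipeline applies forward first
print((x - a) / b)  # [0.         0.33333333 1.        ]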
- if hasattr(axis.major.formatter, "set_useOffset"): - axis.major.formatter.set_useOffset(False) - if hasattr(axis.major.formatter, "set_scientific"): - axis.major.formatter.set_scientific(False) - labels = axis.major.formatter.format_ticks(locs) - new._legend = list(locs), list(labels) - - return new - - def _get_transform(self): - - arg = self.trans - - def get_param(method, default): - if arg == method: - return default - return float(arg[len(method):]) - - if arg is None: - return _make_identity_transforms() - elif isinstance(arg, tuple): - return arg - elif isinstance(arg, str): - if arg == "ln": - return _make_log_transforms() - elif arg == "logit": - base = get_param("logit", 10) - return _make_logit_transforms(base) - elif arg.startswith("log"): - base = get_param("log", 10) - return _make_log_transforms(base) - elif arg.startswith("symlog"): - c = get_param("symlog", 1) - return _make_symlog_transforms(c) - elif arg.startswith("pow"): - exp = get_param("pow", 2) - return _make_power_transforms(exp) - elif arg == "sqrt": - return _make_sqrt_transforms() - else: - raise ValueError(f"Unknown value provided for trans: {arg!r}") - - -@dataclass -class Continuous(ContinuousBase): - """ - A numeric scale supporting norms and functional transforms. - """ - values: tuple | str | None = None - trans: str | TransFuncs | None = None - - # TODO Add this to deal with outliers? - # outside: Literal["keep", "drop", "clip"] = "keep" - - _priority: ClassVar[int] = 1 - - def tick( - self, - locator: Locator | None = None, *, - at: Sequence[float] | None = None, - upto: int | None = None, - count: int | None = None, - every: float | None = None, - between: tuple[float, float] | None = None, - minor: int | None = None, - ) -> Continuous: - """ - Configure the selection of ticks for the scale's axis or legend. - - Parameters - ---------- - locator : :class:`matplotlib.ticker.Locator` subclass - Pre-configured matplotlib locator; other parameters will not be used. - at : sequence of floats - Place ticks at these specific locations (in data units). - upto : int - Choose "nice" locations for ticks, but do not exceed this number. - count : int - Choose exactly this number of ticks, bounded by `between` or axis limits. - every : float - Choose locations at this interval of separation (in data units). - between : pair of floats - Bound upper / lower ticks when using `every` or `count`. - minor : int - Number of unlabeled ticks to draw between labeled "major" ticks. - - Returns - ------- - scale - Copy of self with new tick configuration. - - """ - # Input checks - if locator is not None and not isinstance(locator, Locator): - raise TypeError( - f"Tick locator must be an instance of {Locator!r}, " - f"not {type(locator)!r}." - ) - log_base, symlog_thresh = self._parse_for_log_params(self.trans) - if log_base or symlog_thresh: - if count is not None and between is None: - raise RuntimeError("`count` requires `between` with log transform.") - if every is not None: - raise RuntimeError("`every` not supported with log transform.") - - new = copy(self) - new._tick_params = { - "locator": locator, - "at": at, - "upto": upto, - "count": count, - "every": every, - "between": between, - "minor": minor, - } - return new - - def label( - self, - formatter: Formatter | None = None, *, - like: str | Callable | None = None, - base: int | None | Default = default, - unit: str | None = None, - ) -> Continuous: - """ - Configure the appearance of tick labels for the scale's axis or legend. 
- - Parameters - ---------- - formatter : :class:`matplotlib.ticker.Formatter` subclass - Pre-configured formatter to use; other parameters will be ignored. - like : str or callable - Either a format pattern (e.g., `".2f"`), a format string with fields named - `x` and/or `pos` (e.g., `"${x:.2f}"`), or a callable with a signature like - `f(x: float, pos: int) -> str`. In the latter variants, `x` is passed as the - tick value and `pos` is passed as the tick index. - base : number - Use log formatter (with scientific notation) having this value as the base. - Set to `None` to override the default formatter with a log transform. - unit : str or (str, str) tuple - Use SI prefixes with these units (e.g., with `unit="g"`, a tick value - of 5000 will appear as `5 kg`). When a tuple, the first element gives the - separator between the number and unit. - - Returns - ------- - scale - Copy of self with new label configuration. - - """ - # Input checks - if formatter is not None and not isinstance(formatter, Formatter): - raise TypeError( - f"Label formatter must be an instance of {Formatter!r}, " - f"not {type(formatter)!r}" - ) - if like is not None and not (isinstance(like, str) or callable(like)): - msg = f"`like` must be a string or callable, not {type(like).__name__}." - raise TypeError(msg) - - new = copy(self) - new._label_params = { - "formatter": formatter, - "like": like, - "base": base, - "unit": unit, - } - return new - - def _parse_for_log_params( - self, trans: str | TransFuncs | None - ) -> tuple[float | None, float | None]: - - log_base = symlog_thresh = None - if isinstance(trans, str): - m = re.match(r"^log(\d*)", trans) - if m is not None: - log_base = float(m[1] or 10) - m = re.match(r"symlog(\d*)", trans) - if m is not None: - symlog_thresh = float(m[1] or 1) - return log_base, symlog_thresh - - def _get_locators(self, locator, at, upto, count, every, between, minor): - - log_base, symlog_thresh = self._parse_for_log_params(self.trans) - - if locator is not None: - major_locator = locator - - elif upto is not None: - if log_base: - major_locator = LogLocator(base=log_base, numticks=upto) - else: - major_locator = MaxNLocator(upto, steps=[1, 1.5, 2, 2.5, 3, 5, 10]) - - elif count is not None: - if between is None: - # This is rarely useful (unless you are setting limits) - major_locator = LinearLocator(count) - else: - if log_base or symlog_thresh: - forward, inverse = self._get_transform() - lo, hi = forward(between) - ticks = inverse(np.linspace(lo, hi, num=count)) - else: - ticks = np.linspace(*between, num=count) - major_locator = FixedLocator(ticks) - - elif every is not None: - if between is None: - major_locator = MultipleLocator(every) - else: - lo, hi = between - ticks = np.arange(lo, hi + every, every) - major_locator = FixedLocator(ticks) - - elif at is not None: - major_locator = FixedLocator(at) - - else: - if log_base: - major_locator = LogLocator(log_base) - elif symlog_thresh: - major_locator = SymmetricalLogLocator(linthresh=symlog_thresh, base=10) - else: - major_locator = AutoLocator() - - if minor is None: - minor_locator = LogLocator(log_base, subs=None) if log_base else None - else: - if log_base: - subs = np.linspace(0, log_base, minor + 2)[1:-1] - minor_locator = LogLocator(log_base, subs=subs) - else: - minor_locator = AutoMinorLocator(minor + 1) - - return major_locator, minor_locator - - def _get_formatter(self, locator, formatter, like, base, unit): - - log_base, symlog_thresh = self._parse_for_log_params(self.trans) - if base is default: - if 
symlog_thresh: - log_base = 10 - base = log_base - - if formatter is not None: - return formatter - - if like is not None: - if isinstance(like, str): - if "{x" in like or "{pos" in like: - fmt = like - else: - fmt = f"{{x:{like}}}" - formatter = StrMethodFormatter(fmt) - else: - formatter = FuncFormatter(like) - - elif base is not None: - # We could add other log options if necessary - formatter = LogFormatterSciNotation(base) - - elif unit is not None: - if isinstance(unit, tuple): - sep, unit = unit - elif not unit: - sep = "" - else: - sep = " " - formatter = EngFormatter(unit, sep=sep) - - else: - formatter = ScalarFormatter() - - return formatter - - -@dataclass -class Temporal(ContinuousBase): - """ - A scale for date/time data. - """ - # TODO date: bool? - # For when we only care about the time component, would affect - # default formatter and norm conversion. Should also happen in - # Property.default_scale. The alternative was having distinct - # Calendric / Temporal scales, but that feels a bit fussy, and it - # would get in the way of using first-letter shorthands because - # Calendric and Continuous would collide. Still, we haven't implemented - # those yet, and having a clear distinction betewen date(time) / time - # may be more useful. - - trans = None - - _priority: ClassVar[int] = 2 - - def tick( - self, locator: Locator | None = None, *, - upto: int | None = None, - ) -> Temporal: - """ - Configure the selection of ticks for the scale's axis or legend. - - .. note:: - This API is under construction and will be enhanced over time. - - Parameters - ---------- - locator : :class:`matplotlib.ticker.Locator` subclass - Pre-configured matplotlib locator; other parameters will not be used. - upto : int - Choose "nice" locations for ticks, but do not exceed this number. - - Returns - ------- - scale - Copy of self with new tick configuration. - - """ - if locator is not None and not isinstance(locator, Locator): - err = ( - f"Tick locator must be an instance of {Locator!r}, " - f"not {type(locator)!r}." - ) - raise TypeError(err) - - new = copy(self) - new._tick_params = {"locator": locator, "upto": upto} - return new - - def label( - self, - formatter: Formatter | None = None, *, - concise: bool = False, - ) -> Temporal: - """ - Configure the appearance of tick labels for the scale's axis or legend. - - .. note:: - This API is under construction and will be enhanced over time. - - Parameters - ---------- - formatter : :class:`matplotlib.ticker.Formatter` subclass - Pre-configured formatter to use; other parameters will be ignored. - concise : bool - If True, use :class:`matplotlib.dates.ConciseDateFormatter` to make - the tick labels as compact as possible. - - Returns - ------- - scale - Copy of self with new label configuration. - - """ - new = copy(self) - new._label_params = {"formatter": formatter, "concise": concise} - return new - - def _get_locators(self, locator, upto): - - if locator is not None: - major_locator = locator - elif upto is not None: - major_locator = AutoDateLocator(minticks=2, maxticks=upto) - - else: - major_locator = AutoDateLocator(minticks=2, maxticks=6) - minor_locator = None - - return major_locator, minor_locator - - def _get_formatter(self, locator, formatter, concise): - - if formatter is not None: - return formatter - - if concise: - # TODO ideally we would have concise coordinate ticks, - # but full semantic ticks. Is that possible? 
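# How the `like` parameter above turns into a matplotlib formatter: a bare
# format spec is wrapped into a "{x:...}" template, while strings that already
# reference x or pos pass through unchanged.
from matplotlib.ticker import StrMethodFormatter

like = ".2f"
fmt = like if ("{x" in like or "{pos" in like) else f"{{x:{like}}}"
print(StrMethodFormatter(fmt)(3.14159, 0))        # '3.14'
print(StrMethodFormatter("${x:.2f}")(3.14159, 0))  # '$3.14'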
- formatter = ConciseDateFormatter(locator) - else: - formatter = AutoDateFormatter(locator) - - return formatter - - -# ----------------------------------------------------------------------------------- # - - -# TODO Have this separate from Temporal or have Temporal(date=True) or similar? -# class Calendric(Scale): - -# TODO Needed? Or handle this at layer (in stat or as param, eg binning=) -# class Binned(Scale): - -# TODO any need for color-specific scales? -# class Sequential(Continuous): -# class Diverging(Continuous): -# class Qualitative(Nominal): - - -# ----------------------------------------------------------------------------------- # - - -class PseudoAxis: - """ - Internal class implementing minimal interface equivalent to matplotlib Axis. - - Coordinate variables are typically scaled by attaching the Axis object from - the figure where the plot will end up. Matplotlib has no similar concept of - and axis for the other mappable variables (color, etc.), but to simplify the - code, this object acts like an Axis and can be used to scale other variables. - - """ - axis_name = "" # Matplotlib requirement but not actually used - - def __init__(self, scale): - - self.converter = None - self.units = None - self.scale = scale - self.major = mpl.axis.Ticker() - self.minor = mpl.axis.Ticker() - - # It appears that this needs to be initialized this way on matplotlib 3.1, - # but not later versions. It is unclear whether there are any issues with it. - self._data_interval = None, None - - scale.set_default_locators_and_formatters(self) - # self.set_default_intervals() Is this ever needed? - - def set_view_interval(self, vmin, vmax): - self._view_interval = vmin, vmax - - def get_view_interval(self): - return self._view_interval - - # TODO do we want to distinguish view/data intervals? e.g. for a legend - # we probably want to represent the full range of the data values, but - # still norm the colormap. If so, we'll need to track data range separately - # from the norm, which we currently don't do. - - def set_data_interval(self, vmin, vmax): - self._data_interval = vmin, vmax - - def get_data_interval(self): - return self._data_interval - - def get_tick_space(self): - # TODO how to do this in a configurable / auto way? - # Would be cool to have legend density adapt to figure size, etc. - return 5 - - def set_major_locator(self, locator): - self.major.locator = locator - locator.set_axis(self) - - def set_major_formatter(self, formatter): - self.major.formatter = formatter - formatter.set_axis(self) - - def set_minor_locator(self, locator): - self.minor.locator = locator - locator.set_axis(self) - - def set_minor_formatter(self, formatter): - self.minor.formatter = formatter - formatter.set_axis(self) - - def set_units(self, units): - self.units = units - - def update_units(self, x): - """Pass units to the internal converter, potentially updating its mapping.""" - self.converter = mpl.units.registry.get_converter(x) - if self.converter is not None: - self.converter.default_units(x, self) - - info = self.converter.axisinfo(self.units, self) - - if info is None: - return - if info.majloc is not None: - self.set_major_locator(info.majloc) - if info.majfmt is not None: - self.set_major_formatter(info.majfmt) - - # This is in matplotlib method; do we need this? 
- # self.set_default_intervals() - - def convert_units(self, x): - """Return a numeric representation of the input data.""" - if np.issubdtype(np.asarray(x).dtype, np.number): - return x - elif self.converter is None: - return x - return self.converter.convert(x, self.units, self) - - def get_scale(self): - # Note that matplotlib actually returns a string here! - # (e.g., with a log scale, axis.get_scale() returns "log") - # Currently we just hit it with minor ticks where it checks for - # scale == "log". I'm not sure how you'd actually use log-scale - # minor "ticks" in a legend context, so this is fine.... - return self.scale - - def get_majorticklocs(self): - return self.major.locator() - - -# ------------------------------------------------------------------------------------ # -# Transform function creation - - -def _make_identity_transforms() -> TransFuncs: - - def identity(x): - return x - - return identity, identity - - -def _make_logit_transforms(base: float | None = None) -> TransFuncs: - - log, exp = _make_log_transforms(base) - - def logit(x): - with np.errstate(invalid="ignore", divide="ignore"): - return log(x) - log(1 - x) - - def expit(x): - with np.errstate(invalid="ignore", divide="ignore"): - return exp(x) / (1 + exp(x)) - - return logit, expit - - -def _make_log_transforms(base: float | None = None) -> TransFuncs: - - fs: TransFuncs - if base is None: - fs = np.log, np.exp - elif base == 2: - fs = np.log2, partial(np.power, 2) - elif base == 10: - fs = np.log10, partial(np.power, 10) - else: - def forward(x): - return np.log(x) / np.log(base) - fs = forward, partial(np.power, base) - - def log(x: ArrayLike) -> ArrayLike: - with np.errstate(invalid="ignore", divide="ignore"): - return fs[0](x) - - def exp(x: ArrayLike) -> ArrayLike: - with np.errstate(invalid="ignore", divide="ignore"): - return fs[1](x) - - return log, exp - - -def _make_symlog_transforms(c: float = 1, base: float = 10) -> TransFuncs: - - # From https://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001 - - # Note: currently not using base because we only get - # one parameter from the string, and are using c (this is consistent with d3) - - log, exp = _make_log_transforms(base) - - def symlog(x): - with np.errstate(invalid="ignore", divide="ignore"): - return np.sign(x) * log(1 + np.abs(np.divide(x, c))) - - def symexp(x): - with np.errstate(invalid="ignore", divide="ignore"): - return np.sign(x) * c * (exp(np.abs(x)) - 1) - - return symlog, symexp - - -def _make_sqrt_transforms() -> TransFuncs: - - def sqrt(x): - return np.sign(x) * np.sqrt(np.abs(x)) - - def square(x): - return np.sign(x) * np.square(x) - - return sqrt, square - - -def _make_power_transforms(exp: float) -> TransFuncs: - - def forward(x): - return np.sign(x) * np.power(np.abs(x), exp) - - def inverse(x): - return np.sign(x) * np.power(np.abs(x), 1 / exp) - - return forward, inverse - - -def _default_spacer(x: Series) -> float: - return 1 diff --git a/seaborn/_core/subplots.py b/seaborn/_core/subplots.py deleted file mode 100644 index 287f441670881f0967f674cdec6d607af58029ef..0000000000000000000000000000000000000000 --- a/seaborn/_core/subplots.py +++ /dev/null @@ -1,263 +0,0 @@ -from __future__ import annotations -from collections.abc import Generator - -import numpy as np -import matplotlib as mpl -import matplotlib.pyplot as plt - -from matplotlib.axes import Axes -from matplotlib.figure import Figure -from typing import TYPE_CHECKING -if TYPE_CHECKING: # TODO move to seaborn._core.typing? 
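# Numeric round-trip check for the symlog pair above. The module builds its
# log leg from _make_log_transforms (base 10 by default); the equivalent
# closed form with natural log is used here for a dependency-free check.
import numpy as np

def symlog(x, c=1):
    return np.sign(x) * np.log1p(np.abs(x / c))

def symexp(x, c=1):
    return np.sign(x) * c * np.expm1(np.abs(x))

x = np.array([-10.0, -1.0, 0.0, 1.0, 10.0])
print(np.allclose(symexp(symlog(x)), x))  # True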
-    from seaborn._core.plot import FacetSpec, PairSpec
-    from matplotlib.figure import SubFigure
-
-
-class Subplots:
-    """
-    Interface for creating and using matplotlib subplots based on seaborn parameters.
-
-    Parameters
-    ----------
-    subplot_spec : dict
-        Keyword args for :meth:`matplotlib.figure.Figure.subplots`.
-    facet_spec : dict
-        Parameters that control subplot faceting.
-    pair_spec : dict
-        Parameters that control subplot pairing.
-
-    """
-    def __init__(
-        self,
-        subplot_spec: dict,  # TODO define as TypedDict
-        facet_spec: FacetSpec,
-        pair_spec: PairSpec,
-    ):
-
-        self.subplot_spec = subplot_spec
-
-        self._check_dimension_uniqueness(facet_spec, pair_spec)
-        self._determine_grid_dimensions(facet_spec, pair_spec)
-        self._handle_wrapping(facet_spec, pair_spec)
-        self._determine_axis_sharing(pair_spec)
-
-    def _check_dimension_uniqueness(
-        self, facet_spec: FacetSpec, pair_spec: PairSpec
-    ) -> None:
-        """Reject specs that pair and facet on (or wrap to) same figure dimension."""
-        err = None
-
-        facet_vars = facet_spec.get("variables", {})
-
-        if facet_spec.get("wrap") and {"col", "row"} <= set(facet_vars):
-            err = "Cannot wrap facets when specifying both `col` and `row`."
-        elif (
-            pair_spec.get("wrap")
-            and pair_spec.get("cross", True)
-            and len(pair_spec.get("structure", {}).get("x", [])) > 1
-            and len(pair_spec.get("structure", {}).get("y", [])) > 1
-        ):
-            err = "Cannot wrap subplots when pairing on both `x` and `y`."
-
-        collisions = {"x": ["columns", "rows"], "y": ["rows", "columns"]}
-        for pair_axis, (multi_dim, wrap_dim) in collisions.items():
-            if pair_axis not in pair_spec.get("structure", {}):
-                continue
-            elif multi_dim[:3] in facet_vars:
-                err = f"Cannot facet the {multi_dim} while pairing on `{pair_axis}`."
-            elif wrap_dim[:3] in facet_vars and facet_spec.get("wrap"):
-                err = f"Cannot wrap the {wrap_dim} while pairing on `{pair_axis}`."
-            elif wrap_dim[:3] in facet_vars and pair_spec.get("wrap"):
-                err = f"Cannot wrap the {multi_dim} while faceting the {wrap_dim}."
-
-        if err is not None:
-            raise RuntimeError(err)  # TODO what err class? Define PlotSpecError?
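# A spec that trips the first check above (a sketch; the dict mirrors the
# FacetSpec structure only loosely, and the variable names are hypothetical):
facet_spec = {"variables": {"col": "day", "row": "sex"}, "wrap": 3}
facet_vars = facet_spec.get("variables", {})
wraps_both = bool(facet_spec.get("wrap")) and {"col", "row"} <= set(facet_vars)
print(wraps_both)  # True -> RuntimeError("Cannot wrap facets when ...")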
- - def _determine_grid_dimensions( - self, facet_spec: FacetSpec, pair_spec: PairSpec - ) -> None: - """Parse faceting and pairing information to define figure structure.""" - self.grid_dimensions: dict[str, list] = {} - for dim, axis in zip(["col", "row"], ["x", "y"]): - - facet_vars = facet_spec.get("variables", {}) - if dim in facet_vars: - self.grid_dimensions[dim] = facet_spec["structure"][dim] - elif axis in pair_spec.get("structure", {}): - self.grid_dimensions[dim] = [ - None for _ in pair_spec.get("structure", {})[axis] - ] - else: - self.grid_dimensions[dim] = [None] - - self.subplot_spec[f"n{dim}s"] = len(self.grid_dimensions[dim]) - - if not pair_spec.get("cross", True): - self.subplot_spec["nrows"] = 1 - - self.n_subplots = self.subplot_spec["ncols"] * self.subplot_spec["nrows"] - - def _handle_wrapping( - self, facet_spec: FacetSpec, pair_spec: PairSpec - ) -> None: - """Update figure structure parameters based on facet/pair wrapping.""" - self.wrap = wrap = facet_spec.get("wrap") or pair_spec.get("wrap") - if not wrap: - return - - wrap_dim = "row" if self.subplot_spec["nrows"] > 1 else "col" - flow_dim = {"row": "col", "col": "row"}[wrap_dim] - n_subplots = self.subplot_spec[f"n{wrap_dim}s"] - flow = int(np.ceil(n_subplots / wrap)) - - if wrap < self.subplot_spec[f"n{wrap_dim}s"]: - self.subplot_spec[f"n{wrap_dim}s"] = wrap - self.subplot_spec[f"n{flow_dim}s"] = flow - self.n_subplots = n_subplots - self.wrap_dim = wrap_dim - - def _determine_axis_sharing(self, pair_spec: PairSpec) -> None: - """Update subplot spec with default or specified axis sharing parameters.""" - axis_to_dim = {"x": "col", "y": "row"} - key: str - val: str | bool - for axis in "xy": - key = f"share{axis}" - # Always use user-specified value, if present - if key not in self.subplot_spec: - if axis in pair_spec.get("structure", {}): - # Paired axes are shared along one dimension by default - if self.wrap is None and pair_spec.get("cross", True): - val = axis_to_dim[axis] - else: - val = False - else: - # This will pick up faceted plots, as well as single subplot - # figures, where the value doesn't really matter - val = True - self.subplot_spec[key] = val - - def init_figure( - self, - pair_spec: PairSpec, - pyplot: bool = False, - figure_kws: dict | None = None, - target: Axes | Figure | SubFigure | None = None, - ) -> Figure: - """Initialize matplotlib objects and add seaborn-relevant metadata.""" - # TODO reduce need to pass pair_spec here? 
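# The wrapping arithmetic above in isolation: wrapping 7 facet levels at
# wrap=3 keeps 3 along the wrapped dimension and flows ceil(7/3)=3 along the
# other, leaving two empty slots to be removed later in init_figure.
import numpy as np

n_subplots, wrap = 7, 3
flow = int(np.ceil(n_subplots / wrap))
print(wrap, flow, wrap * flow - n_subplots)  # 3 3 2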
- - if figure_kws is None: - figure_kws = {} - - if isinstance(target, mpl.axes.Axes): - - if max(self.subplot_spec["nrows"], self.subplot_spec["ncols"]) > 1: - err = " ".join([ - "Cannot create multiple subplots after calling `Plot.on` with", - f"a {mpl.axes.Axes} object.", - f" You may want to use a {mpl.figure.SubFigure} instead.", - ]) - raise RuntimeError(err) - - self._subplot_list = [{ - "ax": target, - "left": True, - "right": True, - "top": True, - "bottom": True, - "col": None, - "row": None, - "x": "x", - "y": "y", - }] - self._figure = target.figure - return self._figure - - elif isinstance(target, mpl.figure.SubFigure): - figure = target.figure - elif isinstance(target, mpl.figure.Figure): - figure = target - else: - if pyplot: - figure = plt.figure(**figure_kws) - else: - figure = mpl.figure.Figure(**figure_kws) - target = figure - self._figure = figure - - axs = target.subplots(**self.subplot_spec, squeeze=False) - - if self.wrap: - # Remove unused Axes and flatten the rest into a (2D) vector - axs_flat = axs.ravel({"col": "C", "row": "F"}[self.wrap_dim]) - axs, extra = np.split(axs_flat, [self.n_subplots]) - for ax in extra: - ax.remove() - if self.wrap_dim == "col": - axs = axs[np.newaxis, :] - else: - axs = axs[:, np.newaxis] - - # Get i, j coordinates for each Axes object - # Note that i, j are with respect to faceting/pairing, - # not the subplot grid itself, (which only matters in the case of wrapping). - iter_axs: np.ndenumerate | zip - if not pair_spec.get("cross", True): - indices = np.arange(self.n_subplots) - iter_axs = zip(zip(indices, indices), axs.flat) - else: - iter_axs = np.ndenumerate(axs) - - self._subplot_list = [] - for (i, j), ax in iter_axs: - - info = {"ax": ax} - - nrows, ncols = self.subplot_spec["nrows"], self.subplot_spec["ncols"] - if not self.wrap: - info["left"] = j % ncols == 0 - info["right"] = (j + 1) % ncols == 0 - info["top"] = i == 0 - info["bottom"] = i == nrows - 1 - elif self.wrap_dim == "col": - info["left"] = j % ncols == 0 - info["right"] = ((j + 1) % ncols == 0) or ((j + 1) == self.n_subplots) - info["top"] = j < ncols - info["bottom"] = j >= (self.n_subplots - ncols) - elif self.wrap_dim == "row": - info["left"] = i < nrows - info["right"] = i >= self.n_subplots - nrows - info["top"] = i % nrows == 0 - info["bottom"] = ((i + 1) % nrows == 0) or ((i + 1) == self.n_subplots) - - if not pair_spec.get("cross", True): - info["top"] = j < ncols - info["bottom"] = j >= self.n_subplots - ncols - - for dim in ["row", "col"]: - idx = {"row": i, "col": j}[dim] - info[dim] = self.grid_dimensions[dim][idx] - - for axis in "xy": - - idx = {"x": j, "y": i}[axis] - if axis in pair_spec.get("structure", {}): - key = f"{axis}{idx}" - else: - key = axis - info[axis] = key - - self._subplot_list.append(info) - - return figure - - def __iter__(self) -> Generator[dict, None, None]: # TODO TypedDict? 
- """Yield each subplot dictionary with Axes object and metadata.""" - yield from self._subplot_list - - def __len__(self) -> int: - """Return the number of subplots in this figure.""" - return len(self._subplot_list) diff --git a/seaborn/_core/typing.py b/seaborn/_core/typing.py deleted file mode 100644 index 9bdf8a6ef88dd112fa842f3aff026413ab906dbb..0000000000000000000000000000000000000000 --- a/seaborn/_core/typing.py +++ /dev/null @@ -1,49 +0,0 @@ -from __future__ import annotations - -from collections.abc import Iterable, Mapping -from datetime import date, datetime, timedelta -from typing import Any, Optional, Union, Tuple, List, Dict - -from numpy import ndarray # TODO use ArrayLike? -from pandas import Series, Index, Timestamp, Timedelta -from matplotlib.colors import Colormap, Normalize - - -ColumnName = Union[ - str, bytes, date, datetime, timedelta, bool, complex, Timestamp, Timedelta -] -Vector = Union[Series, Index, ndarray] - -VariableSpec = Union[ColumnName, Vector, None] -VariableSpecList = Union[List[VariableSpec], Index, None] - -# A DataSource can be an object implementing __dataframe__, or a Mapping -# (and is optional in all contexts where it is used). -# I don't think there's an abc for "has __dataframe__", so we type as object -# but keep the (slightly odd) Union alias for better user-facing annotations. -DataSource = Union[object, Mapping, None] - -OrderSpec = Union[Iterable, None] # TODO technically str is iterable -NormSpec = Union[Tuple[Optional[float], Optional[float]], Normalize, None] - -# TODO for discrete mappings, it would be ideal to use a parameterized type -# as the dict values / list entries should be of specific type(s) for each method -PaletteSpec = Union[str, list, dict, Colormap, None] -DiscreteValueSpec = Union[dict, list, None] -ContinuousValueSpec = Union[ - Tuple[float, float], List[float], Dict[Any, float], None, -] - - -class Default: - def __repr__(self): - return "<default>" - - -class Deprecated: - def __repr__(self): - return "<deprecated>" - - -default = Default() -deprecated = Deprecated() diff --git a/seaborn/_docstrings.py b/seaborn/_docstrings.py deleted file mode 100644 index 2ab210b6ffbf63f21ebee9a4a3d59dcbc94fcb57..0000000000000000000000000000000000000000 --- a/seaborn/_docstrings.py +++ /dev/null @@ -1,198 +0,0 @@ -import re -import pydoc -from .external.docscrape import NumpyDocString - - -class DocstringComponents: - - regexp = re.compile(r"\n((\n|.)+)\n\s*", re.MULTILINE) - - def __init__(self, comp_dict, strip_whitespace=True): - """Read entries from a dict, optionally stripping outer whitespace.""" - if strip_whitespace: - entries = {} - for key, val in comp_dict.items(): - m = re.match(self.regexp, val) - if m is None: - entries[key] = val - else: - entries[key] = m.group(1) - else: - entries = comp_dict.copy() - - self.entries = entries - - def __getattr__(self, attr): - """Provide dot access to entries for clean raw docstrings.""" - if attr in self.entries: - return self.entries[attr] - else: - try: - return self.__getattribute__(attr) - except AttributeError as err: - # If Python is run with -OO, it will strip docstrings and our lookup - # from self.entries will fail. We check for __debug__, which is actually - # set to False by -O (it is True for normal execution). - # But we only want to see an error when building the docs; - # not something users should see, so this slight inconsistency is fine. 
- if __debug__: - raise err - else: - pass - - @classmethod - def from_nested_components(cls, **kwargs): - """Add multiple sub-sets of components.""" - return cls(kwargs, strip_whitespace=False) - - @classmethod - def from_function_params(cls, func): - """Use the numpydoc parser to extract components from existing func.""" - params = NumpyDocString(pydoc.getdoc(func))["Parameters"] - comp_dict = {} - for p in params: - name = p.name - type = p.type - desc = "\n ".join(p.desc) - comp_dict[name] = f"{name} : {type}\n {desc}" - - return cls(comp_dict) - - -# TODO is "vector" the best term here? We mean to imply 1D data with a variety -# of types? - -# TODO now that we can parse numpydoc style strings, do we need to define dicts -# of docstring components, or just write out a docstring? - - -_core_params = dict( - data=""" -data : :class:`pandas.DataFrame`, :class:`numpy.ndarray`, mapping, or sequence - Input data structure. Either a long-form collection of vectors that can be - assigned to named variables or a wide-form dataset that will be internally - reshaped. - """, # TODO add link to user guide narrative when exists - xy=""" -x, y : vectors or keys in ``data`` - Variables that specify positions on the x and y axes. - """, - hue=""" -hue : vector or key in ``data`` - Semantic variable that is mapped to determine the color of plot elements. - """, - palette=""" -palette : string, list, dict, or :class:`matplotlib.colors.Colormap` - Method for choosing the colors to use when mapping the ``hue`` semantic. - String values are passed to :func:`color_palette`. List or dict values - imply categorical mapping, while a colormap object implies numeric mapping. - """, # noqa: E501 - hue_order=""" -hue_order : vector of strings - Specify the order of processing and plotting for categorical levels of the - ``hue`` semantic. - """, - hue_norm=""" -hue_norm : tuple or :class:`matplotlib.colors.Normalize` - Either a pair of values that set the normalization range in data units - or an object that will map from data units into a [0, 1] interval. Usage - implies numeric mapping. - """, - color=""" -color : :mod:`matplotlib color <matplotlib.colors>` - Single color specification for when hue mapping is not used. Otherwise, the - plot will try to hook into the matplotlib property cycle. - """, - ax=""" -ax : :class:`matplotlib.axes.Axes` - Pre-existing axes for the plot. Otherwise, call :func:`matplotlib.pyplot.gca` - internally. - """, # noqa: E501 -) - - -_core_returns = dict( - ax=""" -:class:`matplotlib.axes.Axes` - The matplotlib axes containing the plot. - """, - facetgrid=""" -:class:`FacetGrid` - An object managing one or more subplots that correspond to conditional data - subsets with convenient methods for batch-setting of axes attributes. - """, - jointgrid=""" -:class:`JointGrid` - An object managing multiple subplots that correspond to joint and marginal axes - for plotting a bivariate relationship or distribution. - """, - pairgrid=""" -:class:`PairGrid` - An object managing multiple subplots that correspond to joint and marginal axes - for pairwise combinations of multiple variables in a dataset. - """, -) - - -_seealso_blurbs = dict( - - # Relational plots - scatterplot=""" -scatterplot : Plot data using points. - """, - lineplot=""" -lineplot : Plot data using lines. - """, - - # Distribution plots - displot=""" -displot : Figure-level interface to distribution plot functions. - """, - histplot=""" -histplot : Plot a histogram of binned counts with optional normalization or smoothing. 
- """,
- kdeplot="""
-kdeplot : Plot univariate or bivariate distributions using kernel density estimation.
- """,
- ecdfplot="""
-ecdfplot : Plot empirical cumulative distribution functions.
- """,
- rugplot="""
-rugplot : Plot a tick at each observation value along the x and/or y axes.
- """,
-
- # Categorical plots
- stripplot="""
-stripplot : Plot a categorical scatter with jitter.
- """,
- swarmplot="""
-swarmplot : Plot a categorical scatter with non-overlapping points.
- """,
- violinplot="""
-violinplot : Draw an enhanced boxplot using kernel density estimation.
- """,
- pointplot="""
-pointplot : Plot point estimates and CIs using markers and lines.
- """,
-
- # Multiples
- jointplot="""
-jointplot : Draw a bivariate plot with univariate marginal distributions.
- """,
- pairplot="""
-pairplot : Draw multiple bivariate plots with univariate marginal distributions.
- """,
- jointgrid="""
-JointGrid : Set up a figure with joint and marginal views on bivariate data.
- """,
- pairgrid="""
-PairGrid : Set up a figure with joint and marginal views on multiple variables.
- """,
-)
-
-
-_core_docs = dict(
- params=DocstringComponents(_core_params),
- returns=DocstringComponents(_core_returns),
- seealso=DocstringComponents(_seealso_blurbs),
-)
diff --git a/seaborn/_marks/__init__.py b/seaborn/_marks/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/seaborn/_marks/area.py b/seaborn/_marks/area.py
deleted file mode 100644
index 7514a6d13b7a373ff3c89ccbe06abec77442c0f2..0000000000000000000000000000000000000000
--- a/seaborn/_marks/area.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from __future__ import annotations
-from collections import defaultdict
-from dataclasses import dataclass
-
-import numpy as np
-import matplotlib as mpl
-
-from seaborn._marks.base import (
- Mark,
- Mappable,
- MappableBool,
- MappableFloat,
- MappableColor,
- MappableStyle,
- resolve_properties,
- resolve_color,
- document_properties,
-)
-
-
-class AreaBase:
-
- def _plot(self, split_gen, scales, orient):
-
- patches = defaultdict(list)
-
- for keys, data, ax in split_gen():
-
- kws = {}
- data = self._standardize_coordinate_parameters(data, orient)
- resolved = resolve_properties(self, keys, scales)
- verts = self._get_verts(data, orient)
- ax.update_datalim(verts)
-
- # TODO should really move this logic into resolve_color
- fc = resolve_color(self, keys, "", scales)
- if not resolved["fill"]:
- fc = mpl.colors.to_rgba(fc, 0)
-
- kws["facecolor"] = fc
- kws["edgecolor"] = resolve_color(self, keys, "edge", scales)
- kws["linewidth"] = resolved["edgewidth"]
- kws["linestyle"] = resolved["edgestyle"]
-
- patches[ax].append(mpl.patches.Polygon(verts, **kws))
-
- for ax, ax_patches in patches.items():
-
- for patch in ax_patches:
- self._postprocess_artist(patch, ax, orient)
- ax.add_patch(patch)
-
- def _standardize_coordinate_parameters(self, data, orient):
- return data
-
- def _postprocess_artist(self, artist, ax, orient):
- pass
-
- def _get_verts(self, data, orient):
-
- dv = {"x": "y", "y": "x"}[orient]
- data = data.sort_values(orient, kind="mergesort")
- verts = np.concatenate([
- data[[orient, f"{dv}min"]].to_numpy(),
- data[[orient, f"{dv}max"]].to_numpy()[::-1],
- ])
- if orient == "y":
- verts = verts[:, ::-1]
- return verts
-
- def _legend_artist(self, variables, value, scales):
-
- keys = {v: value for v in variables}
- resolved = resolve_properties(self, keys, scales)
-
- fc = resolve_color(self, keys, "", scales)
- if not resolved["fill"]:
- fc = mpl.colors.to_rgba(fc, 0)
-
- return mpl.patches.Patch(
- facecolor=fc,
- edgecolor=resolve_color(self, keys, "edge", scales),
- linewidth=resolved["edgewidth"],
- linestyle=resolved["edgestyle"],
- **self.artist_kws,
- )
-
-
-@document_properties
-@dataclass
-class Area(AreaBase, Mark):
- """
- A fill mark drawn from a baseline to data values.
-
- See also
- --------
- Band : A fill mark representing an interval between values.
-
- Examples
- --------
- .. include:: ../docstrings/objects.Area.rst
-
- """
- color: MappableColor = Mappable("C0", )
- alpha: MappableFloat = Mappable(.2, )
- fill: MappableBool = Mappable(True, )
- edgecolor: MappableColor = Mappable(depend="color")
- edgealpha: MappableFloat = Mappable(1, )
- edgewidth: MappableFloat = Mappable(rc="patch.linewidth", )
- edgestyle: MappableStyle = Mappable("-", )
-
- # TODO should this be settable / mappable?
- baseline: MappableFloat = Mappable(0, grouping=False)
-
- def _standardize_coordinate_parameters(self, data, orient):
- dv = {"x": "y", "y": "x"}[orient]
- return data.rename(columns={"baseline": f"{dv}min", dv: f"{dv}max"})
-
- def _postprocess_artist(self, artist, ax, orient):
-
- # TODO copying a lot of code from Bar, let's abstract this
- # See comments there; I am not going to repeat them here
-
- artist.set_linewidth(artist.get_linewidth() * 2)
-
- linestyle = artist.get_linestyle()
- if linestyle[1]:
- linestyle = (linestyle[0], tuple(x / 2 for x in linestyle[1]))
- artist.set_linestyle(linestyle)
-
- artist.set_clip_path(artist.get_path(), artist.get_transform() + ax.transData)
- if self.artist_kws.get("clip_on", True):
- artist.set_clip_box(ax.bbox)
-
- val_idx = ["y", "x"].index(orient)
- artist.sticky_edges[val_idx][:] = (0, np.inf)
-
-
-@document_properties
-@dataclass
-class Band(AreaBase, Mark):
- """
- A fill mark representing an interval between values.
-
- See also
- --------
- Area : A fill mark drawn from a baseline to data values.
-
- Examples
- --------
- .. include:: ../docstrings/objects.Band.rst
-
- """
- color: MappableColor = Mappable("C0", )
- alpha: MappableFloat = Mappable(.2, )
- fill: MappableBool = Mappable(True, )
- edgecolor: MappableColor = Mappable(depend="color", )
- edgealpha: MappableFloat = Mappable(1, )
- edgewidth: MappableFloat = Mappable(0, )
- edgestyle: MappableStyle = Mappable("-", )
-
- def _standardize_coordinate_parameters(self, data, orient):
- # dv = {"x": "y", "y": "x"}[orient]
- # TODO assert that all(ymax >= ymin)?
- # TODO what if only one exists?
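- # A sketch of the aggregation below: with orient="x" and only a "y"
- # column present, rows sharing an x value collapse to interval bounds:
- #
- #     x  y             x  ymin  ymax
- #     0  1     --->    0  1     3
- #     0  3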
- other = {"x": "y", "y": "x"}[orient] - if not set(data.columns) & {f"{other}min", f"{other}max"}: - agg = {f"{other}min": (other, "min"), f"{other}max": (other, "max")} - data = data.groupby(orient).agg(**agg).reset_index() - return data diff --git a/seaborn/_marks/bar.py b/seaborn/_marks/bar.py deleted file mode 100644 index 2aed6830a6becc256352a7adcac1f10238d84b98..0000000000000000000000000000000000000000 --- a/seaborn/_marks/bar.py +++ /dev/null @@ -1,252 +0,0 @@ -from __future__ import annotations -from collections import defaultdict -from dataclasses import dataclass - -import numpy as np -import matplotlib as mpl - -from seaborn._marks.base import ( - Mark, - Mappable, - MappableBool, - MappableColor, - MappableFloat, - MappableStyle, - resolve_properties, - resolve_color, - document_properties -) - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from typing import Any - from matplotlib.artist import Artist - from seaborn._core.scales import Scale - - -class BarBase(Mark): - - def _make_patches(self, data, scales, orient): - - transform = scales[orient]._matplotlib_scale.get_transform() - forward = transform.transform - reverse = transform.inverted().transform - - other = {"x": "y", "y": "x"}[orient] - - pos = reverse(forward(data[orient]) - data["width"] / 2) - width = reverse(forward(data[orient]) + data["width"] / 2) - pos - - val = (data[other] - data["baseline"]).to_numpy() - base = data["baseline"].to_numpy() - - kws = self._resolve_properties(data, scales) - if orient == "x": - kws.update(x=pos, y=base, w=width, h=val) - else: - kws.update(x=base, y=pos, w=val, h=width) - - kws.pop("width", None) - kws.pop("baseline", None) - - val_dim = {"x": "h", "y": "w"}[orient] - bars, vals = [], [] - - for i in range(len(data)): - - row = {k: v[i] for k, v in kws.items()} - - # Skip bars with no value. It's possible we'll want to make this - # an option (i.e so you have an artist for animating or annotating), - # but let's keep things simple for now. - if not np.nan_to_num(row[val_dim]): - continue - - bar = mpl.patches.Rectangle( - xy=(row["x"], row["y"]), - width=row["w"], - height=row["h"], - facecolor=row["facecolor"], - edgecolor=row["edgecolor"], - linestyle=row["edgestyle"], - linewidth=row["edgewidth"], - **self.artist_kws, - ) - bars.append(bar) - vals.append(row[val_dim]) - - return bars, vals - - def _resolve_properties(self, data, scales): - - resolved = resolve_properties(self, data, scales) - - resolved["facecolor"] = resolve_color(self, data, "", scales) - resolved["edgecolor"] = resolve_color(self, data, "edge", scales) - - fc = resolved["facecolor"] - if isinstance(fc, tuple): - resolved["facecolor"] = fc[0], fc[1], fc[2], fc[3] * resolved["fill"] - else: - fc[:, 3] = fc[:, 3] * resolved["fill"] # TODO Is inplace mod a problem? - resolved["facecolor"] = fc - - return resolved - - def _legend_artist( - self, variables: list[str], value: Any, scales: dict[str, Scale], - ) -> Artist: - # TODO return some sensible default? - key = {v: value for v in variables} - key = self._resolve_properties(key, scales) - artist = mpl.patches.Patch( - facecolor=key["facecolor"], - edgecolor=key["edgecolor"], - linewidth=key["edgewidth"], - linestyle=key["edgestyle"], - ) - return artist - - -@document_properties -@dataclass -class Bar(BarBase): - """ - A bar mark drawn between baseline and data values. - - See also - -------- - Bars : A faster bar mark with defaults more suitable for histograms. - - Examples - -------- - .. 
include:: ../docstrings/objects.Bar.rst
-
- """
- color: MappableColor = Mappable("C0", grouping=False)
- alpha: MappableFloat = Mappable(.7, grouping=False)
- fill: MappableBool = Mappable(True, grouping=False)
- edgecolor: MappableColor = Mappable(depend="color", grouping=False)
- edgealpha: MappableFloat = Mappable(1, grouping=False)
- edgewidth: MappableFloat = Mappable(rc="patch.linewidth", grouping=False)
- edgestyle: MappableStyle = Mappable("-", grouping=False)
- # pattern: MappableString = Mappable(None) # TODO no Property yet
-
- width: MappableFloat = Mappable(.8, grouping=False)
- baseline: MappableFloat = Mappable(0, grouping=False) # TODO *is* this mappable?
-
- def _plot(self, split_gen, scales, orient):
-
- val_idx = ["y", "x"].index(orient)
-
- for _, data, ax in split_gen():
-
- bars, vals = self._make_patches(data, scales, orient)
-
- for bar in bars:
-
- # Because we are clipping the artist (see below), the edges end up
- # looking half as wide as they actually are. I don't love this clumsy
- # workaround, which is going to cause surprises if you work with the
- # artists directly. We may need to revisit after feedback.
- bar.set_linewidth(bar.get_linewidth() * 2)
- linestyle = bar.get_linestyle()
- if linestyle[1]:
- linestyle = (linestyle[0], tuple(x / 2 for x in linestyle[1]))
- bar.set_linestyle(linestyle)
-
- # This is a bit of a hack to handle the fact that the edge lines are
- # centered on the actual extents of the bar, and overlap when bars are
- # stacked or dodged. We may discover that this causes problems and needs
- # to be revisited at some point. Also it should be faster to clip with
- # a bbox than a path, but I can't work out how to get the intersection
- # with the axes bbox.
- bar.set_clip_path(bar.get_path(), bar.get_transform() + ax.transData)
- if self.artist_kws.get("clip_on", True):
- # It seems the above hack undoes the default axes clipping
- bar.set_clip_box(ax.bbox)
- bar.sticky_edges[val_idx][:] = (0, np.inf)
- ax.add_patch(bar)
-
- # Add a container which is useful for, e.g. Axes.bar_label
- orientation = {"x": "vertical", "y": "horizontal"}[orient]
- container_kws = dict(datavalues=vals, orientation=orientation)
- container = mpl.container.BarContainer(bars, **container_kws)
- ax.add_container(container)
-
-
-@document_properties
-@dataclass
-class Bars(BarBase):
- """
- A faster bar mark with defaults more suitable for histograms.
-
- See also
- --------
- Bar : A bar mark drawn between baseline and data values.
-
- Examples
- --------
- .. include:: ../docstrings/objects.Bars.rst
-
- """
- color: MappableColor = Mappable("C0", grouping=False)
- alpha: MappableFloat = Mappable(.7, grouping=False)
- fill: MappableBool = Mappable(True, grouping=False)
- edgecolor: MappableColor = Mappable(rc="patch.edgecolor", grouping=False)
- edgealpha: MappableFloat = Mappable(1, grouping=False)
- edgewidth: MappableFloat = Mappable(auto=True, grouping=False)
- edgestyle: MappableStyle = Mappable("-", grouping=False)
- # pattern: MappableString = Mappable(None) # TODO no Property yet
-
- width: MappableFloat = Mappable(1, grouping=False)
- baseline: MappableFloat = Mappable(0, grouping=False) # TODO *is* this mappable?
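- # A usage sketch with the objects interface (assuming the public names
- # so.Plot, so.Bars, and so.Hist; `df` is any tidy DataFrame):
- #
- #     import seaborn.objects as so
- #     so.Plot(df, x="value").add(so.Bars(), so.Hist()).show()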
- - def _plot(self, split_gen, scales, orient): - - ori_idx = ["x", "y"].index(orient) - val_idx = ["y", "x"].index(orient) - - patches = defaultdict(list) - for _, data, ax in split_gen(): - bars, _ = self._make_patches(data, scales, orient) - patches[ax].extend(bars) - - collections = {} - for ax, ax_patches in patches.items(): - - col = mpl.collections.PatchCollection(ax_patches, match_original=True) - col.sticky_edges[val_idx][:] = (0, np.inf) - ax.add_collection(col, autolim=False) - collections[ax] = col - - # Workaround for matplotlib autoscaling bug - # https://github.com/matplotlib/matplotlib/issues/11898 - # https://github.com/matplotlib/matplotlib/issues/23129 - xys = np.vstack([path.vertices for path in col.get_paths()]) - ax.update_datalim(xys) - - if "edgewidth" not in scales and isinstance(self.edgewidth, Mappable): - - for ax in collections: - ax.autoscale_view() - - def get_dimensions(collection): - edges, widths = [], [] - for verts in (path.vertices for path in collection.get_paths()): - edges.append(min(verts[:, ori_idx])) - widths.append(np.ptp(verts[:, ori_idx])) - return np.array(edges), np.array(widths) - - min_width = np.inf - for ax, col in collections.items(): - edges, widths = get_dimensions(col) - points = 72 / ax.figure.dpi * abs( - ax.transData.transform([edges + widths] * 2) - - ax.transData.transform([edges] * 2) - ) - min_width = min(min_width, min(points[:, ori_idx])) - - linewidth = min(.1 * min_width, mpl.rcParams["patch.linewidth"]) - for _, col in collections.items(): - col.set_linewidth(linewidth) diff --git a/seaborn/_marks/base.py b/seaborn/_marks/base.py deleted file mode 100644 index ac8fdf4aa547b5aa2fd1dcad424f4f78548dc7b8..0000000000000000000000000000000000000000 --- a/seaborn/_marks/base.py +++ /dev/null @@ -1,317 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass, fields, field -import textwrap -from typing import Any, Callable, Union -from collections.abc import Generator - -import numpy as np -import pandas as pd -import matplotlib as mpl - -from numpy import ndarray -from pandas import DataFrame -from matplotlib.artist import Artist - -from seaborn._core.scales import Scale -from seaborn._core.properties import ( - PROPERTIES, - Property, - RGBATuple, - DashPattern, - DashPatternWithOffset, -) -from seaborn._core.exceptions import PlotSpecError - - -class Mappable: - def __init__( - self, - val: Any = None, - depend: str | None = None, - rc: str | None = None, - auto: bool = False, - grouping: bool = True, - ): - """ - Property that can be mapped from data or set directly, with flexible defaults. - - Parameters - ---------- - val : Any - Use this value as the default. - depend : str - Use the value of this feature as the default. - rc : str - Use the value of this rcParam as the default. - auto : bool - The default value will depend on other parameters at compile time. - grouping : bool - If True, use the mapped variable to define groups. 
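-
- Examples
- --------
- Sketches of the three default sources (illustrative, not exhaustive):
- ``Mappable("C0")`` falls back to the literal value,
- ``Mappable(rc="lines.linewidth")`` reads the rcParam, and
- ``Mappable(depend="color")`` defers to another property's resolved value.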
- - """ - if depend is not None: - assert depend in PROPERTIES - if rc is not None: - assert rc in mpl.rcParams - - self._val = val - self._rc = rc - self._depend = depend - self._auto = auto - self._grouping = grouping - - def __repr__(self): - """Nice formatting for when object appears in Mark init signature.""" - if self._val is not None: - s = f"<{repr(self._val)}>" - elif self._depend is not None: - s = f"<depend:{self._depend}>" - elif self._rc is not None: - s = f"<rc:{self._rc}>" - elif self._auto: - s = "<auto>" - else: - s = "<undefined>" - return s - - @property - def depend(self) -> Any: - """Return the name of the feature to source a default value from.""" - return self._depend - - @property - def grouping(self) -> bool: - return self._grouping - - @property - def default(self) -> Any: - """Get the default value for this feature, or access the relevant rcParam.""" - if self._val is not None: - return self._val - elif self._rc is not None: - return mpl.rcParams.get(self._rc) - - -# TODO where is the right place to put this kind of type aliasing? - -MappableBool = Union[bool, Mappable] -MappableString = Union[str, Mappable] -MappableFloat = Union[float, Mappable] -MappableColor = Union[str, tuple, Mappable] -MappableStyle = Union[str, DashPattern, DashPatternWithOffset, Mappable] - - -@dataclass -class Mark: - """Base class for objects that visually represent data.""" - - artist_kws: dict = field(default_factory=dict) - - @property - def _mappable_props(self): - return { - f.name: getattr(self, f.name) for f in fields(self) - if isinstance(f.default, Mappable) - } - - @property - def _grouping_props(self): - # TODO does it make sense to have variation within a Mark's - # properties about whether they are grouping? - return [ - f.name for f in fields(self) - if isinstance(f.default, Mappable) and f.default.grouping - ] - - # TODO make this method private? Would extender every need to call directly? - def _resolve( - self, - data: DataFrame | dict[str, Any], - name: str, - scales: dict[str, Scale] | None = None, - ) -> Any: - """Obtain default, specified, or mapped value for a named feature. - - Parameters - ---------- - data : DataFrame or dict with scalar values - Container with data values for features that will be semantically mapped. - name : string - Identity of the feature / semantic. - scales: dict - Mapping from variable to corresponding scale object. - - Returns - ------- - value or array of values - Outer return type depends on whether `data` is a dict (implying that - we want a single value) or DataFrame (implying that we want an array - of values with matching length). - - """ - feature = self._mappable_props[name] - prop = PROPERTIES.get(name, Property(name)) - directly_specified = not isinstance(feature, Mappable) - return_multiple = isinstance(data, pd.DataFrame) - return_array = return_multiple and not name.endswith("style") - - # Special case width because it needs to be resolved and added to the dataframe - # during layer prep (so the Move operations use it properly). - # TODO how does width *scaling* work, e.g. for violin width by count? - if name == "width": - directly_specified = directly_specified and name not in data - - if directly_specified: - feature = prop.standardize(feature) - if return_multiple: - feature = [feature] * len(data) - if return_array: - feature = np.array(feature) - return feature - - if name in data: - if scales is None or name not in scales: - # TODO Might this obviate the identity scale? Just don't add a scale? 
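- # (A note on this branch: with no registered scale the raw column
- # passes through unchanged, e.g. color strings that are already
- # valid under an identity scale.)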
- feature = data[name] - else: - scale = scales[name] - value = data[name] - try: - feature = scale(value) - except Exception as err: - raise PlotSpecError._during("Scaling operation", name) from err - - if return_array: - feature = np.asarray(feature) - return feature - - if feature.depend is not None: - # TODO add source_func or similar to transform the source value? - # e.g. set linewidth as a proportion of pointsize? - return self._resolve(data, feature.depend, scales) - - default = prop.standardize(feature.default) - if return_multiple: - default = [default] * len(data) - if return_array: - default = np.array(default) - return default - - def _infer_orient(self, scales: dict) -> str: # TODO type scales - - # TODO The original version of this (in seaborn._base) did more checking. - # Paring that down here for the prototype to see what restrictions make sense. - - # TODO rethink this to map from scale type to "DV priority" and use that? - # e.g. Nominal > Discrete > Continuous - - x = 0 if "x" not in scales else scales["x"]._priority - y = 0 if "y" not in scales else scales["y"]._priority - - if y > x: - return "y" - else: - return "x" - - def _plot( - self, - split_generator: Callable[[], Generator], - scales: dict[str, Scale], - orient: str, - ) -> None: - """Main interface for creating a plot.""" - raise NotImplementedError() - - def _legend_artist( - self, variables: list[str], value: Any, scales: dict[str, Scale], - ) -> Artist | None: - - return None - - -def resolve_properties( - mark: Mark, data: DataFrame, scales: dict[str, Scale] -) -> dict[str, Any]: - - props = { - name: mark._resolve(data, name, scales) for name in mark._mappable_props - } - return props - - -def resolve_color( - mark: Mark, - data: DataFrame | dict, - prefix: str = "", - scales: dict[str, Scale] | None = None, -) -> RGBATuple | ndarray: - """ - Obtain a default, specified, or mapped value for a color feature. - - This method exists separately to support the relationship between a - color and its corresponding alpha. We want to respect alpha values that - are passed in specified (or mapped) color values but also make use of a - separate `alpha` variable, which can be mapped. This approach may also - be extended to support mapping of specific color channels (i.e. - luminance, chroma) in the future. - - Parameters - ---------- - mark : - Mark with the color property. - data : - Container with data values for features that will be semantically mapped. - prefix : - Support "color", "fillcolor", etc. - - """ - color = mark._resolve(data, f"{prefix}color", scales) - - if f"{prefix}alpha" in mark._mappable_props: - alpha = mark._resolve(data, f"{prefix}alpha", scales) - else: - alpha = mark._resolve(data, "alpha", scales) - - def visible(x, axis=None): - """Detect "invisible" colors to set alpha appropriately.""" - # TODO First clause only needed to handle non-rgba arrays, - # which we are trying to handle upstream - return np.array(x).dtype.kind != "f" or np.isfinite(x).all(axis) - - # Second check here catches vectors of strings with identity scale - # It could probably be handled better upstream. 
This is a tricky problem - if np.ndim(color) < 2 and all(isinstance(x, float) for x in color): - if len(color) == 4: - return mpl.colors.to_rgba(color) - alpha = alpha if visible(color) else np.nan - return mpl.colors.to_rgba(color, alpha) - else: - if np.ndim(color) == 2 and color.shape[1] == 4: - return mpl.colors.to_rgba_array(color) - alpha = np.where(visible(color, axis=1), alpha, np.nan) - return mpl.colors.to_rgba_array(color, alpha) - - # TODO should we be implementing fill here too? - # (i.e. set fillalpha to 0 when fill=False) - - -def document_properties(mark): - - properties = [f.name for f in fields(mark) if isinstance(f.default, Mappable)] - text = [ - "", - " This mark defines the following properties:", - textwrap.fill( - ", ".join([f"|{p}|" for p in properties]), - width=78, initial_indent=" " * 8, subsequent_indent=" " * 8, - ), - ] - - docstring_lines = mark.__doc__.split("\n") - new_docstring = "\n".join([ - *docstring_lines[:2], - *text, - *docstring_lines[2:], - ]) - mark.__doc__ = new_docstring - return mark diff --git a/seaborn/_marks/dot.py b/seaborn/_marks/dot.py deleted file mode 100644 index beef412dec2030d791b986aeb0261f5c0ba69766..0000000000000000000000000000000000000000 --- a/seaborn/_marks/dot.py +++ /dev/null @@ -1,200 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass - -import numpy as np -import matplotlib as mpl - -from seaborn._marks.base import ( - Mark, - Mappable, - MappableBool, - MappableFloat, - MappableString, - MappableColor, - MappableStyle, - resolve_properties, - resolve_color, - document_properties, -) - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from typing import Any - from matplotlib.artist import Artist - from seaborn._core.scales import Scale - - -class DotBase(Mark): - - def _resolve_paths(self, data): - - paths = [] - path_cache = {} - marker = data["marker"] - - def get_transformed_path(m): - return m.get_path().transformed(m.get_transform()) - - if isinstance(marker, mpl.markers.MarkerStyle): - return get_transformed_path(marker) - - for m in marker: - if m not in path_cache: - path_cache[m] = get_transformed_path(m) - paths.append(path_cache[m]) - return paths - - def _resolve_properties(self, data, scales): - - resolved = resolve_properties(self, data, scales) - resolved["path"] = self._resolve_paths(resolved) - resolved["size"] = resolved["pointsize"] ** 2 - - if isinstance(data, dict): # Properties for single dot - filled_marker = resolved["marker"].is_filled() - else: - filled_marker = [m.is_filled() for m in resolved["marker"]] - - resolved["fill"] = resolved["fill"] * filled_marker - - return resolved - - def _plot(self, split_gen, scales, orient): - - # TODO Not backcompat with allowed (but nonfunctional) univariate plots - # (That should be solved upstream by defaulting to "" for unset x/y?) - # (Be mindful of xmin/xmax, etc!) 
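- # (Each split of the data becomes a single PathCollection below;
- # `offsets` carries the x/y positions while the resolved per-point
- # arrays set path, size, face/edge color, and line width/style.)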
- - for _, data, ax in split_gen(): - - offsets = np.column_stack([data["x"], data["y"]]) - data = self._resolve_properties(data, scales) - - points = mpl.collections.PathCollection( - offsets=offsets, - paths=data["path"], - sizes=data["size"], - facecolors=data["facecolor"], - edgecolors=data["edgecolor"], - linewidths=data["linewidth"], - linestyles=data["edgestyle"], - transOffset=ax.transData, - transform=mpl.transforms.IdentityTransform(), - **self.artist_kws, - ) - ax.add_collection(points) - - def _legend_artist( - self, variables: list[str], value: Any, scales: dict[str, Scale], - ) -> Artist: - - key = {v: value for v in variables} - res = self._resolve_properties(key, scales) - - return mpl.collections.PathCollection( - paths=[res["path"]], - sizes=[res["size"]], - facecolors=[res["facecolor"]], - edgecolors=[res["edgecolor"]], - linewidths=[res["linewidth"]], - linestyles=[res["edgestyle"]], - transform=mpl.transforms.IdentityTransform(), - **self.artist_kws, - ) - - -@document_properties -@dataclass -class Dot(DotBase): - """ - A mark suitable for dot plots or less-dense scatterplots. - - See also - -------- - Dots : A dot mark defined by strokes to better handle overplotting. - - Examples - -------- - .. include:: ../docstrings/objects.Dot.rst - - """ - marker: MappableString = Mappable("o", grouping=False) - pointsize: MappableFloat = Mappable(6, grouping=False) # TODO rcParam? - stroke: MappableFloat = Mappable(.75, grouping=False) # TODO rcParam? - color: MappableColor = Mappable("C0", grouping=False) - alpha: MappableFloat = Mappable(1, grouping=False) - fill: MappableBool = Mappable(True, grouping=False) - edgecolor: MappableColor = Mappable(depend="color", grouping=False) - edgealpha: MappableFloat = Mappable(depend="alpha", grouping=False) - edgewidth: MappableFloat = Mappable(.5, grouping=False) # TODO rcParam? - edgestyle: MappableStyle = Mappable("-", grouping=False) - - def _resolve_properties(self, data, scales): - - resolved = super()._resolve_properties(data, scales) - filled = resolved["fill"] - - main_stroke = resolved["stroke"] - edge_stroke = resolved["edgewidth"] - resolved["linewidth"] = np.where(filled, edge_stroke, main_stroke) - - main_color = resolve_color(self, data, "", scales) - edge_color = resolve_color(self, data, "edge", scales) - - if not np.isscalar(filled): - # Expand dims to use in np.where with rgba arrays - filled = filled[:, None] - resolved["edgecolor"] = np.where(filled, edge_color, main_color) - - filled = np.squeeze(filled) - if isinstance(main_color, tuple): - # TODO handle this in resolve_color - main_color = tuple([*main_color[:3], main_color[3] * filled]) - else: - main_color = np.c_[main_color[:, :3], main_color[:, 3] * filled] - resolved["facecolor"] = main_color - - return resolved - - -@document_properties -@dataclass -class Dots(DotBase): - """ - A dot mark defined by strokes to better handle overplotting. - - See also - -------- - Dot : A mark suitable for dot plots or less-dense scatterplots. - - Examples - -------- - .. include:: ../docstrings/objects.Dots.rst - - """ - # TODO retype marker as MappableMarker - marker: MappableString = Mappable(rc="scatter.marker", grouping=False) - pointsize: MappableFloat = Mappable(4, grouping=False) # TODO rcParam? - stroke: MappableFloat = Mappable(.75, grouping=False) # TODO rcParam? - color: MappableColor = Mappable("C0", grouping=False) - alpha: MappableFloat = Mappable(1, grouping=False) # TODO auto alpha? 
- fill: MappableBool = Mappable(True, grouping=False) - fillcolor: MappableColor = Mappable(depend="color", grouping=False) - fillalpha: MappableFloat = Mappable(.2, grouping=False) - - def _resolve_properties(self, data, scales): - - resolved = super()._resolve_properties(data, scales) - resolved["linewidth"] = resolved.pop("stroke") - resolved["facecolor"] = resolve_color(self, data, "fill", scales) - resolved["edgecolor"] = resolve_color(self, data, "", scales) - resolved.setdefault("edgestyle", (0, None)) - - fc = resolved["facecolor"] - if isinstance(fc, tuple): - resolved["facecolor"] = fc[0], fc[1], fc[2], fc[3] * resolved["fill"] - else: - fc[:, 3] = fc[:, 3] * resolved["fill"] # TODO Is inplace mod a problem? - resolved["facecolor"] = fc - - return resolved diff --git a/seaborn/_marks/line.py b/seaborn/_marks/line.py deleted file mode 100644 index a517f1b8b79483c5bc0374322d73d6affe2bdbda..0000000000000000000000000000000000000000 --- a/seaborn/_marks/line.py +++ /dev/null @@ -1,285 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass -from typing import ClassVar - -import numpy as np -import matplotlib as mpl - -from seaborn._marks.base import ( - Mark, - Mappable, - MappableFloat, - MappableString, - MappableColor, - resolve_properties, - resolve_color, - document_properties, -) - - -@document_properties -@dataclass -class Path(Mark): - """ - A mark connecting data points in the order they appear. - - See also - -------- - Line : A mark connecting data points with sorting along the orientation axis. - Paths : A faster but less-flexible mark for drawing many paths. - - Examples - -------- - .. include:: ../docstrings/objects.Path.rst - - """ - color: MappableColor = Mappable("C0") - alpha: MappableFloat = Mappable(1) - linewidth: MappableFloat = Mappable(rc="lines.linewidth") - linestyle: MappableString = Mappable(rc="lines.linestyle") - marker: MappableString = Mappable(rc="lines.marker") - pointsize: MappableFloat = Mappable(rc="lines.markersize") - fillcolor: MappableColor = Mappable(depend="color") - edgecolor: MappableColor = Mappable(depend="color") - edgewidth: MappableFloat = Mappable(rc="lines.markeredgewidth") - - _sort: ClassVar[bool] = False - - def _plot(self, split_gen, scales, orient): - - for keys, data, ax in split_gen(keep_na=not self._sort): - - vals = resolve_properties(self, keys, scales) - vals["color"] = resolve_color(self, keys, scales=scales) - vals["fillcolor"] = resolve_color(self, keys, prefix="fill", scales=scales) - vals["edgecolor"] = resolve_color(self, keys, prefix="edge", scales=scales) - - if self._sort: - data = data.sort_values(orient, kind="mergesort") - - artist_kws = self.artist_kws.copy() - self._handle_capstyle(artist_kws, vals) - - line = mpl.lines.Line2D( - data["x"].to_numpy(), - data["y"].to_numpy(), - color=vals["color"], - linewidth=vals["linewidth"], - linestyle=vals["linestyle"], - marker=vals["marker"], - markersize=vals["pointsize"], - markerfacecolor=vals["fillcolor"], - markeredgecolor=vals["edgecolor"], - markeredgewidth=vals["edgewidth"], - **artist_kws, - ) - ax.add_line(line) - - def _legend_artist(self, variables, value, scales): - - keys = {v: value for v in variables} - vals = resolve_properties(self, keys, scales) - vals["color"] = resolve_color(self, keys, scales=scales) - vals["fillcolor"] = resolve_color(self, keys, prefix="fill", scales=scales) - vals["edgecolor"] = resolve_color(self, keys, prefix="edge", scales=scales) - - artist_kws = self.artist_kws.copy() - 
self._handle_capstyle(artist_kws, vals)
-
- return mpl.lines.Line2D(
- [], [],
- color=vals["color"],
- linewidth=vals["linewidth"],
- linestyle=vals["linestyle"],
- marker=vals["marker"],
- markersize=vals["pointsize"],
- markerfacecolor=vals["fillcolor"],
- markeredgecolor=vals["edgecolor"],
- markeredgewidth=vals["edgewidth"],
- **artist_kws,
- )
-
- def _handle_capstyle(self, kws, vals):
-
- # Workaround for this matplotlib issue:
- # https://github.com/matplotlib/matplotlib/issues/23437
- if vals["linestyle"][1] is None:
- capstyle = kws.get("solid_capstyle", mpl.rcParams["lines.solid_capstyle"])
- kws["dash_capstyle"] = capstyle
-
-
-@document_properties
-@dataclass
-class Line(Path):
- """
- A mark connecting data points with sorting along the orientation axis.
-
- See also
- --------
- Path : A mark connecting data points in the order they appear.
- Lines : A faster but less-flexible mark for drawing many lines.
-
- Examples
- --------
- .. include:: ../docstrings/objects.Line.rst
-
- """
- _sort: ClassVar[bool] = True
-
-
-@document_properties
-@dataclass
-class Paths(Mark):
- """
- A faster but less-flexible mark for drawing many paths.
-
- See also
- --------
- Path : A mark connecting data points in the order they appear.
-
- Examples
- --------
- .. include:: ../docstrings/objects.Paths.rst
-
- """
- color: MappableColor = Mappable("C0")
- alpha: MappableFloat = Mappable(1)
- linewidth: MappableFloat = Mappable(rc="lines.linewidth")
- linestyle: MappableString = Mappable(rc="lines.linestyle")
-
- _sort: ClassVar[bool] = False
-
- def __post_init__(self):
-
- # LineCollection artists have a capstyle property but don't source its value
- # from the rc, so we do that manually here. Unfortunately, because we add
- # only one LineCollection, we have to use the same capstyle for all lines
- # even when they are dashed. It's a slight inconsistency, but looks fine IMO.
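- # (A sketch of the effect: with rcParams["lines.solid_capstyle"] set
- # to "round", the setdefault below is equivalent to passing
- # capstyle="round"; a user-supplied artist_kws["capstyle"] still wins.)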
- self.artist_kws.setdefault("capstyle", mpl.rcParams["lines.solid_capstyle"]) - - def _plot(self, split_gen, scales, orient): - - line_data = {} - for keys, data, ax in split_gen(keep_na=not self._sort): - - if ax not in line_data: - line_data[ax] = { - "segments": [], - "colors": [], - "linewidths": [], - "linestyles": [], - } - - segments = self._setup_segments(data, orient) - line_data[ax]["segments"].extend(segments) - n = len(segments) - - vals = resolve_properties(self, keys, scales) - vals["color"] = resolve_color(self, keys, scales=scales) - - line_data[ax]["colors"].extend([vals["color"]] * n) - line_data[ax]["linewidths"].extend([vals["linewidth"]] * n) - line_data[ax]["linestyles"].extend([vals["linestyle"]] * n) - - for ax, ax_data in line_data.items(): - lines = mpl.collections.LineCollection(**ax_data, **self.artist_kws) - # Handle datalim update manually - # https://github.com/matplotlib/matplotlib/issues/23129 - ax.add_collection(lines, autolim=False) - if ax_data["segments"]: - xy = np.concatenate(ax_data["segments"]) - ax.update_datalim(xy) - - def _legend_artist(self, variables, value, scales): - - key = resolve_properties(self, {v: value for v in variables}, scales) - - artist_kws = self.artist_kws.copy() - capstyle = artist_kws.pop("capstyle") - artist_kws["solid_capstyle"] = capstyle - artist_kws["dash_capstyle"] = capstyle - - return mpl.lines.Line2D( - [], [], - color=key["color"], - linewidth=key["linewidth"], - linestyle=key["linestyle"], - **artist_kws, - ) - - def _setup_segments(self, data, orient): - - if self._sort: - data = data.sort_values(orient, kind="mergesort") - - # Column stack to avoid block consolidation - xy = np.column_stack([data["x"], data["y"]]) - - return [xy] - - -@document_properties -@dataclass -class Lines(Paths): - """ - A faster but less-flexible mark for drawing many lines. - - See also - -------- - Line : A mark connecting data points with sorting along the orientation axis. - - Examples - -------- - .. include:: ../docstrings/objects.Lines.rst - - """ - _sort: ClassVar[bool] = True - - -@document_properties -@dataclass -class Range(Paths): - """ - An oriented line mark drawn between min/max values. - - Examples - -------- - .. include:: ../docstrings/objects.Range.rst - - """ - def _setup_segments(self, data, orient): - - # TODO better checks on what variables we have - # TODO what if only one exist? - val = {"x": "y", "y": "x"}[orient] - if not set(data.columns) & {f"{val}min", f"{val}max"}: - agg = {f"{val}min": (val, "min"), f"{val}max": (val, "max")} - data = data.groupby(orient).agg(**agg).reset_index() - - cols = [orient, f"{val}min", f"{val}max"] - data = data[cols].melt(orient, value_name=val)[["x", "y"]] - segments = [d.to_numpy() for _, d in data.groupby(orient)] - return segments - - -@document_properties -@dataclass -class Dash(Paths): - """ - A line mark drawn as an oriented segment for each datapoint. - - Examples - -------- - .. 
include:: ../docstrings/objects.Dash.rst - - """ - width: MappableFloat = Mappable(.8, grouping=False) - - def _setup_segments(self, data, orient): - - ori = ["x", "y"].index(orient) - xys = data[["x", "y"]].to_numpy().astype(float) - segments = np.stack([xys, xys], axis=1) - segments[:, 0, ori] -= data["width"] / 2 - segments[:, 1, ori] += data["width"] / 2 - return segments diff --git a/seaborn/_marks/text.py b/seaborn/_marks/text.py deleted file mode 100644 index 58d757c1acefc3c2ef6ecb8bec01c152ea08729d..0000000000000000000000000000000000000000 --- a/seaborn/_marks/text.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import annotations -from collections import defaultdict -from dataclasses import dataclass - -import numpy as np -import matplotlib as mpl -from matplotlib.transforms import ScaledTranslation - -from seaborn._marks.base import ( - Mark, - Mappable, - MappableFloat, - MappableString, - MappableColor, - resolve_properties, - resolve_color, - document_properties, -) - - -@document_properties -@dataclass -class Text(Mark): - """ - A textual mark to annotate or represent data values. - - Examples - -------- - .. include:: ../docstrings/objects.Text.rst - - """ - text: MappableString = Mappable("") - color: MappableColor = Mappable("k") - alpha: MappableFloat = Mappable(1) - fontsize: MappableFloat = Mappable(rc="font.size") - halign: MappableString = Mappable("center") - valign: MappableString = Mappable("center_baseline") - offset: MappableFloat = Mappable(4) - - def _plot(self, split_gen, scales, orient): - - ax_data = defaultdict(list) - - for keys, data, ax in split_gen(): - - vals = resolve_properties(self, keys, scales) - color = resolve_color(self, keys, "", scales) - - halign = vals["halign"] - valign = vals["valign"] - fontsize = vals["fontsize"] - offset = vals["offset"] / 72 - - offset_trans = ScaledTranslation( - {"right": -offset, "left": +offset}.get(halign, 0), - {"top": -offset, "bottom": +offset, "baseline": +offset}.get(valign, 0), - ax.figure.dpi_scale_trans, - ) - - for row in data.to_dict("records"): - artist = mpl.text.Text( - x=row["x"], - y=row["y"], - text=str(row.get("text", vals["text"])), - color=color, - fontsize=fontsize, - horizontalalignment=halign, - verticalalignment=valign, - transform=ax.transData + offset_trans, - **self.artist_kws, - ) - ax.add_artist(artist) - ax_data[ax].append([row["x"], row["y"]]) - - for ax, ax_vals in ax_data.items(): - ax.update_datalim(np.array(ax_vals)) diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py deleted file mode 100644 index 40346b02697d2fe58887ac8d00048a16b5b5c6e3..0000000000000000000000000000000000000000 --- a/seaborn/_statistics.py +++ /dev/null @@ -1,698 +0,0 @@ -"""Statistical transformations for visualization. - -This module is currently private, but is being written to eventually form part -of the public API. - -The classes should behave roughly in the style of scikit-learn. - -- All data-independent parameters should be passed to the class constructor. -- Each class should implement a default transformation that is exposed through - __call__. These are currently written for vector arguments, but I think - consuming a whole `plot_data` DataFrame and return it with transformed - variables would make more sense. -- Some class have data-dependent preprocessing that should be cached and used - multiple times (think defining histogram bins off all data and then counting - observations within each bin multiple times per data subsets). 
These currently - have unique names, but it would be good to have a common name. Not quite - `fit`, but something similar. -- Alternatively, the transform interface could take some information about grouping - variables and do a groupby internally. -- Some classes should define alternate transforms that might make the most sense - with a different function. For example, KDE usually evaluates the distribution - on a regular grid, but it would be useful for it to transform at the actual - datapoints. Then again, this could be controlled by a parameter at the time of - class instantiation. - -""" -from numbers import Number -from statistics import NormalDist -import numpy as np -import pandas as pd -try: - from scipy.stats import gaussian_kde - _no_scipy = False -except ImportError: - from .external.kde import gaussian_kde - _no_scipy = True - -from .algorithms import bootstrap -from .utils import _check_argument - - -class KDE: - """Univariate and bivariate kernel density estimator.""" - def __init__( - self, *, - bw_method=None, - bw_adjust=1, - gridsize=200, - cut=3, - clip=None, - cumulative=False, - ): - """Initialize the estimator with its parameters. - - Parameters - ---------- - bw_method : string, scalar, or callable, optional - Method for determining the smoothing bandwidth to use; passed to - :class:`scipy.stats.gaussian_kde`. - bw_adjust : number, optional - Factor that multiplicatively scales the value chosen using - ``bw_method``. Increasing will make the curve smoother. See Notes. - gridsize : int, optional - Number of points on each dimension of the evaluation grid. - cut : number, optional - Factor, multiplied by the smoothing bandwidth, that determines how - far the evaluation grid extends past the extreme datapoints. When - set to 0, truncate the curve at the data limits. - clip : pair of numbers or None, or a pair of such pairs - Do not evaluate the density outside of these limits. - cumulative : bool, optional - If True, estimate a cumulative distribution function. Requires scipy. 
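-
- Examples
- --------
- A minimal sketch of the call contract (``x``, ``x1``, ``x2`` are 1D
- arrays; see ``__call__`` below)::
-
- kde = KDE(bw_adjust=.5, cut=0)
- density, support = kde(x) # univariate
- density, (grid1, grid2) = kde(x1, x2) # bivariate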
-
- """
- if clip is None:
- clip = None, None
-
- self.bw_method = bw_method
- self.bw_adjust = bw_adjust
- self.gridsize = gridsize
- self.cut = cut
- self.clip = clip
- self.cumulative = cumulative
-
- if cumulative and _no_scipy:
- raise RuntimeError("Cumulative KDE evaluation requires scipy")
-
- self.support = None
-
- def _define_support_grid(self, x, bw, cut, clip, gridsize):
- """Create the grid of evaluation points for the vector x."""
- clip_lo = -np.inf if clip[0] is None else clip[0]
- clip_hi = +np.inf if clip[1] is None else clip[1]
- gridmin = max(x.min() - bw * cut, clip_lo)
- gridmax = min(x.max() + bw * cut, clip_hi)
- return np.linspace(gridmin, gridmax, gridsize)
-
- def _define_support_univariate(self, x, weights):
- """Create a 1D grid of evaluation points."""
- kde = self._fit(x, weights)
- bw = np.sqrt(kde.covariance.squeeze())
- grid = self._define_support_grid(
- x, bw, self.cut, self.clip, self.gridsize
- )
- return grid
-
- def _define_support_bivariate(self, x1, x2, weights):
- """Create a 2D grid of evaluation points."""
- clip = self.clip
- if clip[0] is None or np.isscalar(clip[0]):
- clip = (clip, clip)
-
- kde = self._fit([x1, x2], weights)
- bw = np.sqrt(np.diag(kde.covariance).squeeze())
-
- grid1 = self._define_support_grid(
- x1, bw[0], self.cut, clip[0], self.gridsize
- )
- grid2 = self._define_support_grid(
- x2, bw[1], self.cut, clip[1], self.gridsize
- )
-
- return grid1, grid2
-
- def define_support(self, x1, x2=None, weights=None, cache=True):
- """Create the evaluation grid for a given data set."""
- if x2 is None:
- support = self._define_support_univariate(x1, weights)
- else:
- support = self._define_support_bivariate(x1, x2, weights)
-
- if cache:
- self.support = support
-
- return support
-
- def _fit(self, fit_data, weights=None):
- """Fit the scipy kde while adding bw_adjust logic and version check."""
- fit_kws = {"bw_method": self.bw_method}
- if weights is not None:
- fit_kws["weights"] = weights
-
- kde = gaussian_kde(fit_data, **fit_kws)
- kde.set_bandwidth(kde.factor * self.bw_adjust)
-
- return kde
-
- def _eval_univariate(self, x, weights=None):
- """Fit and evaluate a univariate KDE on univariate data."""
- support = self.support
- if support is None:
- support = self.define_support(x, cache=False)
-
- kde = self._fit(x, weights)
-
- if self.cumulative:
- s_0 = support[0]
- density = np.array([
- kde.integrate_box_1d(s_0, s_i) for s_i in support
- ])
- else:
- density = kde(support)
-
- return density, support
-
- def _eval_bivariate(self, x1, x2, weights=None):
- """Fit and evaluate a bivariate KDE on bivariate data."""
- support = self.support
- if support is None:
- support = self.define_support(x1, x2, cache=False)
-
- kde = self._fit([x1, x2], weights)
-
- if self.cumulative:
-
- grid1, grid2 = support
- density = np.zeros((grid1.size, grid2.size))
- p0 = grid1.min(), grid2.min()
- for i, xi in enumerate(grid1):
- for j, xj in enumerate(grid2):
- density[i, j] = kde.integrate_box(p0, (xi, xj))
-
- else:
-
- xx1, xx2 = np.meshgrid(*support)
- density = kde([xx1.ravel(), xx2.ravel()]).reshape(xx1.shape)
-
- return density, support
-
- def __call__(self, x1, x2=None, weights=None):
- """Fit and evaluate on univariate or bivariate data."""
- if x2 is None:
- return self._eval_univariate(x1, weights)
- else:
- return self._eval_bivariate(x1, x2, weights)
-
-
-# Note: we no longer use this for univariate histograms in histplot,
-# preferring _stats.Hist. We'll deprecate this once we have a bivariate Stat class.
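-# A usage sketch of the call contract (mirrors KDE above; `x` is a 1D
-# array): `est = Histogram(stat="density", bins=20)` then
-# `hist, bin_edges = est(x)`.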
-class Histogram: - """Univariate and bivariate histogram estimator.""" - def __init__( - self, - stat="count", - bins="auto", - binwidth=None, - binrange=None, - discrete=False, - cumulative=False, - ): - """Initialize the estimator with its parameters. - - Parameters - ---------- - stat : str - Aggregate statistic to compute in each bin. - - - `count`: show the number of observations in each bin - - `frequency`: show the number of observations divided by the bin width - - `probability` or `proportion`: normalize such that bar heights sum to 1 - - `percent`: normalize such that bar heights sum to 100 - - `density`: normalize such that the total area of the histogram equals 1 - - bins : str, number, vector, or a pair of such values - Generic bin parameter that can be the name of a reference rule, - the number of bins, or the breaks of the bins. - Passed to :func:`numpy.histogram_bin_edges`. - binwidth : number or pair of numbers - Width of each bin, overrides ``bins`` but can be used with - ``binrange``. - binrange : pair of numbers or a pair of pairs - Lowest and highest value for bin edges; can be used either - with ``bins`` or ``binwidth``. Defaults to data extremes. - discrete : bool or pair of bools - If True, set ``binwidth`` and ``binrange`` such that bin - edges cover integer values in the dataset. - cumulative : bool - If True, return the cumulative statistic. - - """ - stat_choices = [ - "count", "frequency", "density", "probability", "proportion", "percent", - ] - _check_argument("stat", stat_choices, stat) - - self.stat = stat - self.bins = bins - self.binwidth = binwidth - self.binrange = binrange - self.discrete = discrete - self.cumulative = cumulative - - self.bin_kws = None - - def _define_bin_edges(self, x, weights, bins, binwidth, binrange, discrete): - """Inner function that takes bin parameters as arguments.""" - if binrange is None: - start, stop = x.min(), x.max() - else: - start, stop = binrange - - if discrete: - bin_edges = np.arange(start - .5, stop + 1.5) - elif binwidth is not None: - step = binwidth - bin_edges = np.arange(start, stop + step, step) - # Handle roundoff error (maybe there is a less clumsy way?) 
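- # (e.g., a sketch of the failure mode: np.arange(0, 1 + .1, .1) can
- # stop short of 1.0 from floating-point accumulation, so the check
- # below appends one more edge whenever `stop` is not covered.)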
- if bin_edges.max() < stop or len(bin_edges) < 2: - bin_edges = np.append(bin_edges, bin_edges.max() + step) - else: - bin_edges = np.histogram_bin_edges( - x, bins, binrange, weights, - ) - return bin_edges - - def define_bin_params(self, x1, x2=None, weights=None, cache=True): - """Given data, return numpy.histogram parameters to define bins.""" - if x2 is None: - - bin_edges = self._define_bin_edges( - x1, weights, self.bins, self.binwidth, self.binrange, self.discrete, - ) - - if isinstance(self.bins, (str, Number)): - n_bins = len(bin_edges) - 1 - bin_range = bin_edges.min(), bin_edges.max() - bin_kws = dict(bins=n_bins, range=bin_range) - else: - bin_kws = dict(bins=bin_edges) - - else: - - bin_edges = [] - for i, x in enumerate([x1, x2]): - - # Resolve out whether bin parameters are shared - # or specific to each variable - - bins = self.bins - if not bins or isinstance(bins, (str, Number)): - pass - elif isinstance(bins[i], str): - bins = bins[i] - elif len(bins) == 2: - bins = bins[i] - - binwidth = self.binwidth - if binwidth is None: - pass - elif not isinstance(binwidth, Number): - binwidth = binwidth[i] - - binrange = self.binrange - if binrange is None: - pass - elif not isinstance(binrange[0], Number): - binrange = binrange[i] - - discrete = self.discrete - if not isinstance(discrete, bool): - discrete = discrete[i] - - # Define the bins for this variable - - bin_edges.append(self._define_bin_edges( - x, weights, bins, binwidth, binrange, discrete, - )) - - bin_kws = dict(bins=tuple(bin_edges)) - - if cache: - self.bin_kws = bin_kws - - return bin_kws - - def _eval_bivariate(self, x1, x2, weights): - """Inner function for histogram of two variables.""" - bin_kws = self.bin_kws - if bin_kws is None: - bin_kws = self.define_bin_params(x1, x2, cache=False) - - density = self.stat == "density" - - hist, *bin_edges = np.histogram2d( - x1, x2, **bin_kws, weights=weights, density=density - ) - - area = np.outer( - np.diff(bin_edges[0]), - np.diff(bin_edges[1]), - ) - - if self.stat == "probability" or self.stat == "proportion": - hist = hist.astype(float) / hist.sum() - elif self.stat == "percent": - hist = hist.astype(float) / hist.sum() * 100 - elif self.stat == "frequency": - hist = hist.astype(float) / area - - if self.cumulative: - if self.stat in ["density", "frequency"]: - hist = (hist * area).cumsum(axis=0).cumsum(axis=1) - else: - hist = hist.cumsum(axis=0).cumsum(axis=1) - - return hist, bin_edges - - def _eval_univariate(self, x, weights): - """Inner function for histogram of one variable.""" - bin_kws = self.bin_kws - if bin_kws is None: - bin_kws = self.define_bin_params(x, weights=weights, cache=False) - - density = self.stat == "density" - hist, bin_edges = np.histogram( - x, **bin_kws, weights=weights, density=density, - ) - - if self.stat == "probability" or self.stat == "proportion": - hist = hist.astype(float) / hist.sum() - elif self.stat == "percent": - hist = hist.astype(float) / hist.sum() * 100 - elif self.stat == "frequency": - hist = hist.astype(float) / np.diff(bin_edges) - - if self.cumulative: - if self.stat in ["density", "frequency"]: - hist = (hist * np.diff(bin_edges)).cumsum() - else: - hist = hist.cumsum() - - return hist, bin_edges - - def __call__(self, x1, x2=None, weights=None): - """Count the occurrences in each bin, maybe normalize.""" - if x2 is None: - return self._eval_univariate(x1, weights) - else: - return self._eval_bivariate(x1, x2, weights) - - -class ECDF: - """Univariate empirical cumulative distribution estimator.""" - def 
__init__(self, stat="proportion", complementary=False): - """Initialize the class with its parameters - - Parameters - ---------- - stat : {{"proportion", "percent", "count"}} - Distribution statistic to compute. - complementary : bool - If True, use the complementary CDF (1 - CDF) - - """ - _check_argument("stat", ["count", "percent", "proportion"], stat) - self.stat = stat - self.complementary = complementary - - def _eval_bivariate(self, x1, x2, weights): - """Inner function for ECDF of two variables.""" - raise NotImplementedError("Bivariate ECDF is not implemented") - - def _eval_univariate(self, x, weights): - """Inner function for ECDF of one variable.""" - sorter = x.argsort() - x = x[sorter] - weights = weights[sorter] - y = weights.cumsum() - - if self.stat in ["percent", "proportion"]: - y = y / y.max() - if self.stat == "percent": - y = y * 100 - - x = np.r_[-np.inf, x] - y = np.r_[0, y] - - if self.complementary: - y = y.max() - y - - return y, x - - def __call__(self, x1, x2=None, weights=None): - """Return proportion or count of observations below each sorted datapoint.""" - x1 = np.asarray(x1) - if weights is None: - weights = np.ones_like(x1) - else: - weights = np.asarray(weights) - - if x2 is None: - return self._eval_univariate(x1, weights) - else: - return self._eval_bivariate(x1, x2, weights) - - -class EstimateAggregator: - - def __init__(self, estimator, errorbar=None, **boot_kws): - """ - Data aggregator that produces an estimate and error bar interval. - - Parameters - ---------- - estimator : callable or string - Function (or method name) that maps a vector to a scalar. - errorbar : string, (string, number) tuple, or callable - Name of errorbar method (either "ci", "pi", "se", or "sd"), or a tuple - with a method name and a level parameter, or a function that maps from a - vector to a (min, max) interval, or None to hide errorbar. See the - :doc:`errorbar tutorial </tutorial/error_bars>` for more information. - boot_kws - Additional keywords are passed to bootstrap when error_method is "ci". 
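-
- Examples
- --------
- A sketch of the aggregation contract (returns a Series with the
- estimate and its interval bounds)::
-
- agg = EstimateAggregator("mean", ("ci", 95))
- res = agg(df, "y") # gives res["y"], res["ymin"], res["ymax"]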
- - """ - self.estimator = estimator - - method, level = _validate_errorbar_arg(errorbar) - self.error_method = method - self.error_level = level - - self.boot_kws = boot_kws - - def __call__(self, data, var): - """Aggregate over `var` column of `data` with estimate and error interval.""" - vals = data[var] - if callable(self.estimator): - # You would think we could pass to vals.agg, and yet: - # https://github.com/mwaskom/seaborn/issues/2943 - estimate = self.estimator(vals) - else: - estimate = vals.agg(self.estimator) - - # Options that produce no error bars - if self.error_method is None: - err_min = err_max = np.nan - elif len(data) <= 1: - err_min = err_max = np.nan - - # Generic errorbars from user-supplied function - elif callable(self.error_method): - err_min, err_max = self.error_method(vals) - - # Parametric options - elif self.error_method == "sd": - half_interval = vals.std() * self.error_level - err_min, err_max = estimate - half_interval, estimate + half_interval - elif self.error_method == "se": - half_interval = vals.sem() * self.error_level - err_min, err_max = estimate - half_interval, estimate + half_interval - - # Nonparametric options - elif self.error_method == "pi": - err_min, err_max = _percentile_interval(vals, self.error_level) - elif self.error_method == "ci": - units = data.get("units", None) - boots = bootstrap(vals, units=units, func=self.estimator, **self.boot_kws) - err_min, err_max = _percentile_interval(boots, self.error_level) - - return pd.Series({var: estimate, f"{var}min": err_min, f"{var}max": err_max}) - - -class WeightedAggregator: - - def __init__(self, estimator, errorbar=None, **boot_kws): - """ - Data aggregator that produces a weighted estimate and error bar interval. - - Parameters - ---------- - estimator : string - Function (or method name) that maps a vector to a scalar. Currently - supports only "mean". - errorbar : string or (string, number) tuple - Name of errorbar method or a tuple with a method name and a level parameter. - Currently the only supported method is "ci". - boot_kws - Additional keywords are passed to bootstrap when error_method is "ci". - - """ - if estimator != "mean": - # Note that, while other weighted estimators may make sense (e.g. median), - # I'm not aware of an implementation in our dependencies. We can add one - # in seaborn later, if there is sufficient interest. For now, limit to mean. - raise ValueError(f"Weighted estimator must be 'mean', not {estimator!r}.") - self.estimator = estimator - - method, level = _validate_errorbar_arg(errorbar) - if method is not None and method != "ci": - # As with the estimator, weighted 'sd' or 'pi' error bars may make sense. - # But we'll keep things simple for now and limit to (bootstrap) CI. 
- raise ValueError(f"Error bar method must be 'ci', not {method!r}.") - self.error_method = method - self.error_level = level - - self.boot_kws = boot_kws - - def __call__(self, data, var): - """Aggregate over `var` column of `data` with estimate and error interval.""" - vals = data[var] - weights = data["weight"] - - estimate = np.average(vals, weights=weights) - - if self.error_method == "ci" and len(data) > 1: - - def error_func(x, w): - return np.average(x, weights=w) - - boots = bootstrap(vals, weights, func=error_func, **self.boot_kws) - err_min, err_max = _percentile_interval(boots, self.error_level) - - else: - err_min = err_max = np.nan - - return pd.Series({var: estimate, f"{var}min": err_min, f"{var}max": err_max}) - - -class LetterValues: - - def __init__(self, k_depth, outlier_prop, trust_alpha): - """ - Compute percentiles of a distribution using various tail stopping rules. - - Parameters - ---------- - k_depth: "tukey", "proportion", "trustworthy", or "full" - Stopping rule for choosing tail percentiled to show: - - - tukey: Show a similar number of outliers as in a conventional boxplot. - - proportion: Show approximately `outlier_prop` outliers. - - trust_alpha: Use `trust_alpha` level for most extreme tail percentile. - - outlier_prop: float - Parameter for `k_depth="proportion"` setting the expected outlier rate. - trust_alpha: float - Parameter for `k_depth="trustworthy"` setting the confidence threshold. - - Notes - ----- - Based on the proposal in this paper: - https://vita.had.co.nz/papers/letter-value-plot.pdf - - """ - k_options = ["tukey", "proportion", "trustworthy", "full"] - if isinstance(k_depth, str): - _check_argument("k_depth", k_options, k_depth) - elif not isinstance(k_depth, int): - err = ( - "The `k_depth` parameter must be either an integer or string " - f"(one of {k_options}), not {k_depth!r}." - ) - raise TypeError(err) - - self.k_depth = k_depth - self.outlier_prop = outlier_prop - self.trust_alpha = trust_alpha - - def _compute_k(self, n): - - # Select the depth, i.e. 
- - def _compute_k(self, n): - - # Select the depth, i.e. number of boxes to draw, based on the method - if self.k_depth == "full": - # extend boxes to 100% of the data - k = int(np.log2(n)) + 1 - elif self.k_depth == "tukey": - # This results in 5-8 points in each tail - k = int(np.log2(n)) - 3 - elif self.k_depth == "proportion": - k = int(np.log2(n)) - int(np.log2(n * self.outlier_prop)) + 1 - elif self.k_depth == "trustworthy": - normal_quantile_func = np.vectorize(NormalDist().inv_cdf) - point_conf = 2 * normal_quantile_func(1 - self.trust_alpha / 2) ** 2 - k = int(np.log2(n / point_conf)) + 1 - else: - # Allow having k directly specified as input - k = int(self.k_depth) - - return max(k, 1) - - def __call__(self, x): - """Evaluate the letter values.""" - k = self._compute_k(len(x)) - exp = np.arange(k + 1, 1, -1), np.arange(2, k + 2) - levels = k + 1 - np.concatenate([exp[0], exp[1][1:]]) - percentiles = 100 * np.concatenate([0.5 ** exp[0], 1 - 0.5 ** exp[1]]) - if self.k_depth == "full": - percentiles[0] = 0 - percentiles[-1] = 100 - values = np.percentile(x, percentiles) - fliers = np.asarray(x[(x < values.min()) | (x > values.max())]) - median = np.percentile(x, 50) - - return { - "k": k, - "levels": levels, - "percs": percentiles, - "values": values, - "fliers": fliers, - "median": median, - } - - -def _percentile_interval(data, width): - """Return a percentile interval from data of a given width.""" - edge = (100 - width) / 2 - percentiles = edge, 100 - edge - return np.nanpercentile(data, percentiles) - - -def _validate_errorbar_arg(arg): - """Check type and value of errorbar argument and assign default level.""" - DEFAULT_LEVELS = { - "ci": 95, - "pi": 95, - "se": 1, - "sd": 1, - } - - usage = "`errorbar` must be a callable, string, or (string, number) tuple" - - if arg is None: - return None, None - elif callable(arg): - return arg, None - elif isinstance(arg, str): - method = arg - level = DEFAULT_LEVELS.get(method, None) - else: - try: - method, level = arg - except (ValueError, TypeError) as err: - raise err.__class__(usage) from err - - _check_argument("errorbar", list(DEFAULT_LEVELS), method) - if level is not None and not isinstance(level, Number): - raise TypeError(usage) - - return method, level
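These private helpers compose into the aggregator classes above. A minimal sketch of how `EstimateAggregator` is driven, assuming the `seaborn._statistics` module path from this revision (private API, shown for illustration only):

```python
import numpy as np
import pandas as pd
from seaborn._statistics import EstimateAggregator  # private module at this revision

rng = np.random.default_rng(0)
df = pd.DataFrame({"y": rng.normal(10, 2, size=100)})

# Mean with a 95% bootstrap confidence interval (see __call__ above);
# n_boot and seed are forwarded to bootstrap() via **boot_kws
agg = EstimateAggregator("mean", errorbar=("ci", 95), n_boot=500, seed=0)
res = agg(df, "y")
print(res["y"], res["ymin"], res["ymax"])  # estimate and interval bounds
```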
diff --git a/seaborn/_stats/__init__.py b/seaborn/_stats/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/seaborn/_stats/aggregation.py b/seaborn/_stats/aggregation.py deleted file mode 100644 index 7e7d60212a49383bbdf8f78bcb297d7f1bdbc561..0000000000000000000000000000000000000000 --- a/seaborn/_stats/aggregation.py +++ /dev/null @@ -1,130 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass -from typing import ClassVar, Callable - -import pandas as pd -from pandas import DataFrame - -from seaborn._core.scales import Scale -from seaborn._core.groupby import GroupBy -from seaborn._stats.base import Stat -from seaborn._statistics import ( - EstimateAggregator, - WeightedAggregator, -) -from seaborn._core.typing import Vector - - -@dataclass -class Agg(Stat): - """ - Aggregate data along the value axis using given method. - - Parameters - ---------- - func : str or callable - Name of a :class:`pandas.Series` method or a vector -> scalar function. - - See Also - -------- - objects.Est : Aggregation with error bars. - - Examples - -------- - .. include:: ../docstrings/objects.Agg.rst - - """ - func: str | Callable[[Vector], float] = "mean" - - group_by_orient: ClassVar[bool] = True - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - var = {"x": "y", "y": "x"}.get(orient) - res = ( - groupby - .agg(data, {var: self.func}) - .dropna(subset=[var]) - .reset_index(drop=True) - ) - return res - - -@dataclass -class Est(Stat): - """ - Calculate a point estimate and error bar interval. - - For more information about the various `errorbar` choices, see the - :doc:`errorbar tutorial </tutorial/error_bars>`. - - Additional variables: - - - **weight**: When passed to a layer that uses this stat, a weighted estimate - will be computed. Note that use of weights currently limits the choice of - function and error bar method to `"mean"` and `"ci"`, respectively. - - Parameters - ---------- - func : str or callable - Name of a :class:`numpy.ndarray` method or a vector -> scalar function. - errorbar : str, (str, float) tuple, or callable - Name of errorbar method (one of "ci", "pi", "se" or "sd"), or a tuple - with a method name and a level parameter, or a function that maps from a - vector to a (min, max) interval. - n_boot : int - Number of bootstrap samples to draw for "ci" errorbars. - seed : int - Seed for the PRNG used to draw bootstrap samples. - - Examples - -------- - .. include:: ../docstrings/objects.Est.rst - - """ - func: str | Callable[[Vector], float] = "mean" - errorbar: str | tuple[str, float] = ("ci", 95) - n_boot: int = 1000 - seed: int | None = None - - group_by_orient: ClassVar[bool] = True - - def _process( - self, data: DataFrame, var: str, estimator: EstimateAggregator - ) -> DataFrame: - # Needed because GroupBy.apply assumes func is DataFrame -> DataFrame - # which we could probably make more general to allow Series return - res = estimator(data, var) - return pd.DataFrame([res]) - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - boot_kws = {"n_boot": self.n_boot, "seed": self.seed} - if "weight" in data: - engine = WeightedAggregator(self.func, self.errorbar, **boot_kws) - else: - engine = EstimateAggregator(self.func, self.errorbar, **boot_kws) - - var = {"x": "y", "y": "x"}[orient] - res = ( - groupby - .apply(data, self._process, var, engine) - .dropna(subset=[var]) - .reset_index(drop=True) - ) - - res = res.fillna({f"{var}min": res[var], f"{var}max": res[var]}) - - return res - - -@dataclass -class Rolling(Stat): - ... - - def __call__(self, data, groupby, orient, scales): - ... diff --git a/seaborn/_stats/base.py b/seaborn/_stats/base.py deleted file mode 100644 index b80b228165406f2103f00ce9bb0143bf16c02002..0000000000000000000000000000000000000000 --- a/seaborn/_stats/base.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Base module for statistical transformations.""" -from __future__ import annotations -from collections.abc import Iterable -from dataclasses import dataclass -from typing import ClassVar, Any -import warnings - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from pandas import DataFrame - from seaborn._core.groupby import GroupBy - from seaborn._core.scales import Scale - - -@dataclass -class Stat: - """Base class for objects that apply statistical transformations.""" - - # The class supports a partial-function application pattern. The object is - # initialized with desired parameters and the result is a callable that - # accepts and returns dataframes.
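A minimal sketch of that pattern with a hypothetical subclass (`Shift` is invented for illustration and was never part of seaborn; it assumes the `Stat` base class defined in this hunk):

```python
from dataclasses import dataclass
from pandas import DataFrame
from seaborn._stats.base import Stat  # the base class defined in this hunk

@dataclass
class Shift(Stat):
    """Hypothetical stat: offset the value axis by a constant."""
    amount: float = 0

    def __call__(self, data, groupby, orient, scales) -> DataFrame:
        # Parameters were fixed at construction time; __call__ only maps
        # an input dataframe to an output dataframe, adding no state.
        value = {"x": "y", "y": "x"}[orient]
        return data.assign(**{value: data[value] + self.amount})
```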
- - # The statistical transformation logic should not add any state to the instance - # beyond what is defined with the initialization parameters. - - # Subclasses can declare whether the orient dimension should be used in grouping - # TODO consider whether this should be a parameter. Motivating example: - # use the same KDE class violin plots and univariate density estimation. - # In the former case, we would expect separate densities for each unique - # value on the orient axis, but we would not in the latter case. - group_by_orient: ClassVar[bool] = False - - def _check_param_one_of(self, param: str, options: Iterable[Any]) -> None: - """Raise when parameter value is not one of a specified set.""" - value = getattr(self, param) - if value not in options: - *most, last = options - option_str = ", ".join(f"{x!r}" for x in most[:-1]) + f" or {last!r}" - err = " ".join([ - f"The `{param}` parameter for `{self.__class__.__name__}` must be", - f"one of {option_str}; not {value!r}.", - ]) - raise ValueError(err) - - def _check_grouping_vars( - self, param: str, data_vars: list[str], stacklevel: int = 2, - ) -> None: - """Warn if vars are named in parameter without being present in the data.""" - param_vars = getattr(self, param) - undefined = set(param_vars) - set(data_vars) - if undefined: - param = f"{self.__class__.__name__}.{param}" - names = ", ".join(f"{x!r}" for x in undefined) - msg = f"Undefined variable(s) passed for {param}: {names}." - warnings.warn(msg, stacklevel=stacklevel) - - def __call__( - self, - data: DataFrame, - groupby: GroupBy, - orient: str, - scales: dict[str, Scale], - ) -> DataFrame: - """Apply statistical transform to data subgroups and return combined result.""" - return data diff --git a/seaborn/_stats/counting.py b/seaborn/_stats/counting.py deleted file mode 100644 index 0c2fb7d4998ac6fcfd39bf79686c113f5481bb65..0000000000000000000000000000000000000000 --- a/seaborn/_stats/counting.py +++ /dev/null @@ -1,232 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass -from typing import ClassVar - -import numpy as np -import pandas as pd -from pandas import DataFrame - -from seaborn._core.groupby import GroupBy -from seaborn._core.scales import Scale -from seaborn._stats.base import Stat - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from numpy.typing import ArrayLike - - -@dataclass -class Count(Stat): - """ - Count distinct observations within groups. - - See Also - -------- - Hist : A more fully-featured transform including binning and/or normalization. - - Examples - -------- - .. include:: ../docstrings/objects.Count.rst - - """ - group_by_orient: ClassVar[bool] = True - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - var = {"x": "y", "y": "x"}[orient] - res = ( - groupby - .agg(data.assign(**{var: data[orient]}), {var: len}) - .dropna(subset=["x", "y"]) - .reset_index(drop=True) - ) - return res - - -@dataclass -class Hist(Stat): - """ - Bin observations, count them, and optionally normalize or cumulate. 
- - Parameters - ---------- - stat : str - Aggregate statistic to compute in each bin: - - - `count`: the number of observations - - `density`: normalize so that the total area of the histogram equals 1 - - `percent`: normalize so that bar heights sum to 100 - - `probability` or `proportion`: normalize so that bar heights sum to 1 - - `frequency`: divide the number of observations by the bin width - - bins : str, int, or ArrayLike - Generic parameter that can be the name of a reference rule, the number - of bins, or the bin breaks. Passed to :func:`numpy.histogram_bin_edges`. - binwidth : float - Width of each bin; overrides `bins` but can be used with `binrange`. - Note that if `binwidth` does not evenly divide the bin range, the actual - bin width used will be only approximately equal to the parameter value. - binrange : (min, max) - Lowest and highest value for bin edges; can be used with either - `bins` (when a number) or `binwidth`. Defaults to data extremes. - common_norm : bool or list of variables - When not `False`, the normalization is applied across groups. Use - `True` to normalize across all groups, or pass variable name(s) that - define normalization groups. - common_bins : bool or list of variables - When not `False`, the same bins are used for all groups. Use `True` to - share bins across all groups, or pass variable name(s) to share within. - cumulative : bool - If True, cumulate the bin values. - discrete : bool - If True, set `binwidth` and `binrange` so that bins have unit width and - are centered on integer values. - - Notes - ----- - The choice of bins for computing and plotting a histogram can exert - substantial influence on the insights that one is able to draw from the - visualization. If the bins are too large, they may erase important features. - On the other hand, bins that are too small may be dominated by random - variability, obscuring the shape of the true underlying distribution. The - default bin size is determined using a reference rule that depends on the - sample size and variance. This works well in many cases (i.e., with - "well-behaved" data) but it fails in others. It is always a good idea to try - different bin sizes to be sure that you are not missing something important. - This function allows you to specify bins in several different ways, such as - by setting the total number of bins to use, the width of each bin, or the - specific locations where the bins should break. - - Examples - -------- - .. include:: ../docstrings/objects.Hist.rst - - """ - stat: str = "count" - bins: str | int | ArrayLike = "auto" - binwidth: float | None = None - binrange: tuple[float, float] | None = None - common_norm: bool | list[str] = True - common_bins: bool | list[str] = True - cumulative: bool = False - discrete: bool = False - - def __post_init__(self): - - stat_options = [ - "count", "density", "percent", "probability", "proportion", "frequency" - ] - self._check_param_one_of("stat", stat_options) - - def _define_bin_edges(self, vals, weight, bins, binwidth, binrange, discrete): - """Inner function that takes bin parameters as arguments.""" - vals = vals.replace(-np.inf, np.nan).replace(np.inf, np.nan).dropna() - - if binrange is None: - start, stop = vals.min(), vals.max() - else: - start, stop = binrange - - if discrete: - bin_edges = np.arange(start - .5, stop + 1.5) - else: - if binwidth is not None: - bins = int(round((stop - start) / binwidth)) - bin_edges = np.histogram_bin_edges(vals, bins, binrange, weight) - - # TODO warning or cap on too many bins?
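As the `binwidth` parameter description above notes, the requested width is rounded to a whole number of bins, so the realized width is only approximate; a quick check of that behavior with plain numpy:

```python
import numpy as np

# binwidth=0.3 over the range [0, 1]: (1 - 0) / 0.3 rounds to 3 bins,
# so the realized bin width is 1/3, not the requested 0.3
x = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
bins = int(round((x.max() - x.min()) / 0.3))  # -> 3, as computed above
edges = np.histogram_bin_edges(x, bins, (x.min(), x.max()))
print(np.diff(edges))  # [0.3333... 0.3333... 0.3333...]
```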
- - return bin_edges - - def _define_bin_params(self, data, orient, scale_type): - """Given data, return numpy.histogram parameters to define bins.""" - vals = data[orient] - weights = data.get("weight", None) - - # TODO We'll want this for ordinal / discrete scales too - # (Do we need discrete as a parameter or just infer from scale?) - discrete = self.discrete or scale_type == "nominal" - - bin_edges = self._define_bin_edges( - vals, weights, self.bins, self.binwidth, self.binrange, discrete, - ) - - if isinstance(self.bins, (str, int)): - n_bins = len(bin_edges) - 1 - bin_range = bin_edges.min(), bin_edges.max() - bin_kws = dict(bins=n_bins, range=bin_range) - else: - bin_kws = dict(bins=bin_edges) - - return bin_kws - - def _get_bins_and_eval(self, data, orient, groupby, scale_type): - - bin_kws = self._define_bin_params(data, orient, scale_type) - return groupby.apply(data, self._eval, orient, bin_kws) - - def _eval(self, data, orient, bin_kws): - - vals = data[orient] - weights = data.get("weight", None) - - density = self.stat == "density" - hist, edges = np.histogram(vals, **bin_kws, weights=weights, density=density) - - width = np.diff(edges) - center = edges[:-1] + width / 2 - - return pd.DataFrame({orient: center, "count": hist, "space": width}) - - def _normalize(self, data): - - hist = data["count"] - if self.stat == "probability" or self.stat == "proportion": - hist = hist.astype(float) / hist.sum() - elif self.stat == "percent": - hist = hist.astype(float) / hist.sum() * 100 - elif self.stat == "frequency": - hist = hist.astype(float) / data["space"] - - if self.cumulative: - if self.stat in ["density", "frequency"]: - hist = (hist * data["space"]).cumsum() - else: - hist = hist.cumsum() - - return data.assign(**{self.stat: hist}) - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - scale_type = scales[orient].__class__.__name__.lower() - grouping_vars = [str(v) for v in data if v in groupby.order] - if not grouping_vars or self.common_bins is True: - bin_kws = self._define_bin_params(data, orient, scale_type) - data = groupby.apply(data, self._eval, orient, bin_kws) - else: - if self.common_bins is False: - bin_groupby = GroupBy(grouping_vars) - else: - bin_groupby = GroupBy(self.common_bins) - self._check_grouping_vars("common_bins", grouping_vars) - - data = bin_groupby.apply( - data, self._get_bins_and_eval, orient, groupby, scale_type, - ) - - if not grouping_vars or self.common_norm is True: - data = self._normalize(data) - else: - if self.common_norm is False: - norm_groupby = GroupBy(grouping_vars) - else: - norm_groupby = GroupBy(self.common_norm) - self._check_grouping_vars("common_norm", grouping_vars) - data = norm_groupby.apply(data, self._normalize) - - other = {"x": "y", "y": "x"}[orient] - return data.assign(**{other: data[self.stat]}) diff --git a/seaborn/_stats/density.py b/seaborn/_stats/density.py deleted file mode 100644 index e461387651556a28ded23d6583d78e8fff8e38b3..0000000000000000000000000000000000000000 --- a/seaborn/_stats/density.py +++ /dev/null @@ -1,214 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass -from typing import Any, Callable - -import numpy as np -from numpy import ndarray -import pandas as pd -from pandas import DataFrame -try: - from scipy.stats import gaussian_kde - _no_scipy = False -except ImportError: - from seaborn.external.kde import gaussian_kde - _no_scipy = True - -from seaborn._core.groupby import GroupBy -from 
seaborn._core.scales import Scale -from seaborn._stats.base import Stat - - -@dataclass -class KDE(Stat): - """ - Compute a univariate kernel density estimate. - - Parameters - ---------- - bw_adjust : float - Factor that multiplicatively scales the value chosen using - `bw_method`. Increasing will make the curve smoother. See Notes. - bw_method : string, scalar, or callable - Method for determining the smoothing bandwidth to use. Passed directly - to :class:`scipy.stats.gaussian_kde`; see there for options. - common_norm : bool or list of variables - If `True`, normalize so that the areas of all curves sum to 1. - If `False`, normalize each curve independently. If a list, defines - variable(s) to group by and normalize within. - common_grid : bool or list of variables - If `True`, all curves will share the same evaluation grid. - If `False`, each evaluation grid is independent. If a list, defines - variable(s) to group by and share a grid within. - gridsize : int or None - Number of points in the evaluation grid. If None, the density is - evaluated at the original datapoints. - cut : float - Factor, multiplied by the kernel bandwidth, that determines how far - the evaluation grid extends past the extreme datapoints. When set to 0, - the curve is truncated at the data limits. - cumulative : bool - If True, estimate a cumulative distribution function. Requires scipy. - - Notes - ----- - The *bandwidth*, or standard deviation of the smoothing kernel, is an - important parameter. Much like histogram bin width, using the wrong - bandwidth can produce a distorted representation. Over-smoothing can erase - true features, while under-smoothing can create false ones. The default - uses a rule-of-thumb that works best for distributions that are roughly - bell-shaped. It is a good idea to check the default by varying `bw_adjust`. - - Because the smoothing is performed with a Gaussian kernel, the estimated - density curve can extend to values that may not make sense. For example, the - curve may be drawn over negative values when the data are naturally - positive. The `cut` parameter can be used to control the evaluation range, - but datasets that have many observations close to a natural boundary may be - better served by a different method. - - Similar distortions may arise when a dataset is naturally discrete or "spiky" - (containing many repeated observations of the same value). KDEs will always - produce a smooth curve, which could be misleading. - - The units on the density axis are a common source of confusion. While kernel - density estimation produces a probability distribution, the height of the curve - at each point gives a density, not a probability. A probability can be obtained - only by integrating the density across a range. The curve is normalized so - that the integral over all possible values is 1, meaning that the scale of - the density axis depends on the data values. - - If scipy is installed, its cython-accelerated implementation will be used. - - Examples - -------- - ..
include:: ../docstrings/objects.KDE.rst - - """ - bw_adjust: float = 1 - bw_method: str | float | Callable[[gaussian_kde], float] = "scott" - common_norm: bool | list[str] = True - common_grid: bool | list[str] = True - gridsize: int | None = 200 - cut: float = 3 - cumulative: bool = False - - def __post_init__(self): - - if self.cumulative and _no_scipy: - raise RuntimeError("Cumulative KDE evaluation requires scipy") - - def _check_var_list_or_boolean(self, param: str, grouping_vars: Any) -> None: - """Do input checks on grouping parameters.""" - value = getattr(self, param) - if not ( - isinstance(value, bool) - or (isinstance(value, list) and all(isinstance(v, str) for v in value)) - ): - param_name = f"{self.__class__.__name__}.{param}" - raise TypeError(f"{param_name} must be a boolean or list of strings.") - self._check_grouping_vars(param, grouping_vars, stacklevel=3) - - def _fit(self, data: DataFrame, orient: str) -> gaussian_kde: - """Fit and return a KDE object.""" - # TODO need to handle singular data - - fit_kws: dict[str, Any] = {"bw_method": self.bw_method} - if "weight" in data: - fit_kws["weights"] = data["weight"] - kde = gaussian_kde(data[orient], **fit_kws) - kde.set_bandwidth(kde.factor * self.bw_adjust) - - return kde - - def _get_support(self, data: DataFrame, orient: str) -> ndarray: - """Define the grid that the KDE will be evaluated on.""" - if self.gridsize is None: - return data[orient].to_numpy() - - kde = self._fit(data, orient) - bw = np.sqrt(kde.covariance.squeeze()) - gridmin = data[orient].min() - bw * self.cut - gridmax = data[orient].max() + bw * self.cut - return np.linspace(gridmin, gridmax, self.gridsize) - - def _fit_and_evaluate( - self, data: DataFrame, orient: str, support: ndarray - ) -> DataFrame: - """Transform single group by fitting a KDE and evaluating on a support grid.""" - empty = pd.DataFrame(columns=[orient, "weight", "density"], dtype=float) - if len(data) < 2: - return empty - try: - kde = self._fit(data, orient) - except np.linalg.LinAlgError: - return empty - - if self.cumulative: - s_0 = support[0] - density = np.array([kde.integrate_box_1d(s_0, s_i) for s_i in support]) - else: - density = kde(support) - - weight = data["weight"].sum() - return pd.DataFrame({orient: support, "weight": weight, "density": density}) - - def _transform( - self, data: DataFrame, orient: str, grouping_vars: list[str] - ) -> DataFrame: - """Transform multiple groups by fitting KDEs and evaluating.""" - empty = pd.DataFrame(columns=[*data.columns, "density"], dtype=float) - if len(data) < 2: - return empty - try: - support = self._get_support(data, orient) - except np.linalg.LinAlgError: - return empty - - grouping_vars = [x for x in grouping_vars if data[x].nunique() > 1] - if not grouping_vars: - return self._fit_and_evaluate(data, orient, support) - groupby = GroupBy(grouping_vars) - return groupby.apply(data, self._fit_and_evaluate, orient, support) - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - if "weight" not in data: - data = data.assign(weight=1) - data = data.dropna(subset=[orient, "weight"]) - - # Transform each group separately - grouping_vars = [str(v) for v in data if v in groupby.order] - if not grouping_vars or self.common_grid is True: - res = self._transform(data, orient, grouping_vars) - else: - if self.common_grid is False: - grid_vars = grouping_vars - else: - self._check_var_list_or_boolean("common_grid", grouping_vars) - grid_vars = [v for v in 
self.common_grid if v in grouping_vars] - - res = ( - GroupBy(grid_vars) - .apply(data, self._transform, orient, grouping_vars) - ) - - # Normalize, potentially within groups - if not grouping_vars or self.common_norm is True: - res = res.assign(group_weight=data["weight"].sum()) - else: - if self.common_norm is False: - norm_vars = grouping_vars - else: - self._check_var_list_or_boolean("common_norm", grouping_vars) - norm_vars = [v for v in self.common_norm if v in grouping_vars] - - res = res.join( - data.groupby(norm_vars)["weight"].sum().rename("group_weight"), - on=norm_vars, - ) - - res["density"] *= res.eval("weight / group_weight") - value = {"x": "y", "y": "x"}[orient] - res[value] = res["density"] - return res.drop(["weight", "group_weight"], axis=1) diff --git a/seaborn/_stats/order.py b/seaborn/_stats/order.py deleted file mode 100644 index c37c0985238efde6386e61055fca2d2f3ff2cc10..0000000000000000000000000000000000000000 --- a/seaborn/_stats/order.py +++ /dev/null @@ -1,78 +0,0 @@ - -from __future__ import annotations -from dataclasses import dataclass -from typing import ClassVar, cast -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal # type: ignore - -import numpy as np -from pandas import DataFrame - -from seaborn._core.scales import Scale -from seaborn._core.groupby import GroupBy -from seaborn._stats.base import Stat -from seaborn.utils import _version_predates - - -# From https://github.com/numpy/numpy/blob/main/numpy/lib/function_base.pyi -_MethodKind = Literal[ - "inverted_cdf", - "averaged_inverted_cdf", - "closest_observation", - "interpolated_inverted_cdf", - "hazen", - "weibull", - "linear", - "median_unbiased", - "normal_unbiased", - "lower", - "higher", - "midpoint", - "nearest", -] - - -@dataclass -class Perc(Stat): - """ - Replace observations with percentile values. - - Parameters - ---------- - k : list of numbers or int - If a list of numbers, this gives the percentiles (in [0, 100]) to compute. - If an integer, compute `k` evenly-spaced percentiles between 0 and 100. - For example, `k=5` computes the 0, 25, 50, 75, and 100th percentiles. - method : str - Method for interpolating percentiles between observed datapoints. - See :func:`numpy.percentile` for valid options and more information. - - Examples - -------- - .. 
include:: ../docstrings/objects.Perc.rst - - """ - k: int | list[float] = 5 - method: str = "linear" - - group_by_orient: ClassVar[bool] = True - - def _percentile(self, data: DataFrame, var: str) -> DataFrame: - - k = list(np.linspace(0, 100, self.k)) if isinstance(self.k, int) else self.k - method = cast(_MethodKind, self.method) - values = data[var].dropna() - if _version_predates(np, "1.22"): - res = np.percentile(values, k, interpolation=method) # type: ignore - else: - res = np.percentile(data[var].dropna(), k, method=method) - return DataFrame({var: res, "percentile": k}) - - def __call__( - self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], - ) -> DataFrame: - - var = {"x": "y", "y": "x"}[orient] - return groupby.apply(data, self._percentile, var) diff --git a/seaborn/_stats/regression.py b/seaborn/_stats/regression.py deleted file mode 100644 index 9ec81a4e5c6ae4eca0baad56b23a5cc1e21a9399..0000000000000000000000000000000000000000 --- a/seaborn/_stats/regression.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass - -import numpy as np -import pandas as pd - -from seaborn._stats.base import Stat - - -@dataclass -class PolyFit(Stat): - """ - Fit a polynomial of the given order and resample data onto the predicted curve. - """ - # This is a provisional class that is useful for building out functionality. - # It may or may not change substantially in form or disappear as we think - # through the organization of the stats subpackage. - - order: int = 2 - gridsize: int = 100 - - def _fit_predict(self, data): - - x = data["x"] - y = data["y"] - if x.nunique() <= self.order: - # TODO warn? - xx = yy = [] - else: - p = np.polyfit(x, y, self.order) - xx = np.linspace(x.min(), x.max(), self.gridsize) - yy = np.polyval(p, xx) - - return pd.DataFrame(dict(x=xx, y=yy)) - - # TODO we should have a way of identifying the method that will be applied - # and then only define __call__ on a base-class of stats with this pattern - - def __call__(self, data, groupby, orient, scales): - - return ( - groupby - .apply(data.dropna(subset=["x", "y"]), self._fit_predict) - ) - - -@dataclass -class OLSFit(Stat): - - ...
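The core of `PolyFit._fit_predict` is an ordinary numpy polynomial fit resampled onto an even grid; a standalone sketch of that logic (the `poly_fit_predict` helper name is ours, not seaborn's):

```python
import numpy as np
import pandas as pd

def poly_fit_predict(data, order=2, gridsize=100):
    """Fit a polynomial to x/y and evaluate it on an even grid."""
    x, y = data["x"], data["y"]
    if x.nunique() <= order:
        # Too few distinct x values to constrain the fit
        return pd.DataFrame({"x": [], "y": []})
    coefs = np.polyfit(x, y, order)
    xx = np.linspace(x.min(), x.max(), gridsize)
    return pd.DataFrame({"x": xx, "y": np.polyval(coefs, xx)})

df = pd.DataFrame({"x": np.arange(10.0), "y": np.arange(10.0) ** 2})
print(poly_fit_predict(df).head())  # points on the fitted parabola
```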
diff --git a/seaborn/_testing.py b/seaborn/_testing.py deleted file mode 100644 index c6f821cbe26f44a720cc8863fe6a863d61a275dd..0000000000000000000000000000000000000000 --- a/seaborn/_testing.py +++ /dev/null @@ -1,90 +0,0 @@ -import numpy as np -import matplotlib as mpl -from matplotlib.colors import to_rgb, to_rgba -from numpy.testing import assert_array_equal - - -USE_PROPS = [ - "alpha", - "edgecolor", - "facecolor", - "fill", - "hatch", - "height", - "linestyle", - "linewidth", - "paths", - "xy", - "xydata", - "sizes", - "zorder", -] - - -def assert_artists_equal(list1, list2): - - assert len(list1) == len(list2) - for a1, a2 in zip(list1, list2): - assert a1.__class__ == a2.__class__ - prop1 = a1.properties() - prop2 = a2.properties() - for key in USE_PROPS: - if key not in prop1: - continue - v1 = prop1[key] - v2 = prop2[key] - if key == "paths": - for p1, p2 in zip(v1, v2): - assert_array_equal(p1.vertices, p2.vertices) - assert_array_equal(p1.codes, p2.codes) - elif key == "color": - v1 = mpl.colors.to_rgba(v1) - v2 = mpl.colors.to_rgba(v2) - assert v1 == v2 - elif isinstance(v1, np.ndarray): - assert_array_equal(v1, v2) - else: - assert v1 == v2 - - -def assert_legends_equal(leg1, leg2): - - assert leg1.get_title().get_text() == leg2.get_title().get_text() - for t1, t2 in zip(leg1.get_texts(), leg2.get_texts()): - assert t1.get_text() == t2.get_text() - - assert_artists_equal( - leg1.get_patches(), leg2.get_patches(), - ) - assert_artists_equal( - leg1.get_lines(), leg2.get_lines(), - ) - - -def assert_plots_equal(ax1, ax2, labels=True): - - assert_artists_equal(ax1.patches, ax2.patches) - assert_artists_equal(ax1.lines, ax2.lines) - assert_artists_equal(ax1.collections, ax2.collections) - - if labels: - assert ax1.get_xlabel() == ax2.get_xlabel() - assert ax1.get_ylabel() == ax2.get_ylabel() - - -def assert_colors_equal(a, b, check_alpha=True): - - def handle_array(x): - - if isinstance(x, np.ndarray): - if x.ndim > 1: - x = np.unique(x, axis=0).squeeze() - if x.ndim > 1: - raise ValueError("Color arrays must be 1 dimensional") - return x - - a = handle_array(a) - b = handle_array(b) - - f = to_rgba if check_alpha else to_rgb - assert f(a) == f(b) diff --git a/seaborn/algorithms.py b/seaborn/algorithms.py deleted file mode 100644 index 2e34b9dd9cdffb5d82f56674fac4896de91a4d0a..0000000000000000000000000000000000000000 --- a/seaborn/algorithms.py +++ /dev/null @@ -1,120 +0,0 @@ -"""Algorithms to support fitting routines in seaborn plotting functions.""" -import numpy as np -import warnings - - -def bootstrap(*args, **kwargs): - """Resample one or more arrays with replacement and store aggregate values. - - Positional arguments are a sequence of arrays to bootstrap along the first - axis and pass to a summary function. - - Keyword arguments: - n_boot : int, default=10000 - Number of iterations - axis : int, default=None - Will pass axis to ``func`` as a keyword argument. - units : array, default=None - Array of sampling unit IDs. When used the bootstrap resamples units - and then observations within units instead of individual - datapoints. - func : string or callable, default="mean" - Function to call on the args that are passed in. If string, uses as - name of function in the numpy namespace. If nans are present in the - data, will try to use nan-aware version of named function. - seed : Generator | SeedSequence | RandomState | int | None - Seed for the random number generator; useful if you want - reproducible resamples. 
- - Returns - ------- - boot_dist: array - array of bootstrapped statistic values - - """ - # Ensure list of arrays are same length - if len(np.unique(list(map(len, args)))) > 1: - raise ValueError("All input arrays must have the same length") - n = len(args[0]) - - # Default keyword arguments - n_boot = kwargs.get("n_boot", 10000) - func = kwargs.get("func", "mean") - axis = kwargs.get("axis", None) - units = kwargs.get("units", None) - random_seed = kwargs.get("random_seed", None) - if random_seed is not None: - msg = "`random_seed` has been renamed to `seed` and will be removed" - warnings.warn(msg) - seed = kwargs.get("seed", random_seed) - if axis is None: - func_kwargs = dict() - else: - func_kwargs = dict(axis=axis) - - # Initialize the resampler - if isinstance(seed, np.random.RandomState): - rng = seed - else: - rng = np.random.default_rng(seed) - - # Coerce to arrays - args = list(map(np.asarray, args)) - if units is not None: - units = np.asarray(units) - - if isinstance(func, str): - - # Allow named numpy functions - f = getattr(np, func) - - # Try to use nan-aware version of function if necessary - missing_data = np.isnan(np.sum(np.column_stack(args))) - - if missing_data and not func.startswith("nan"): - nanf = getattr(np, f"nan{func}", None) - if nanf is None: - msg = f"Data contain nans but no nan-aware version of `{func}` found" - warnings.warn(msg, UserWarning) - else: - f = nanf - - else: - f = func - - # Handle numpy changes - try: - integers = rng.integers - except AttributeError: - integers = rng.randint - - # Do the bootstrap - if units is not None: - return _structured_bootstrap(args, n_boot, units, f, - func_kwargs, integers) - - boot_dist = [] - for i in range(int(n_boot)): - resampler = integers(0, n, n, dtype=np.intp) # intp is indexing dtype - sample = [a.take(resampler, axis=0) for a in args] - boot_dist.append(f(*sample, **func_kwargs)) - return np.array(boot_dist) - - -def _structured_bootstrap(args, n_boot, units, func, func_kwargs, integers): - """Resample units instead of datapoints.""" - unique_units = np.unique(units) - n_units = len(unique_units) - - args = [[a[units == unit] for unit in unique_units] for a in args] - - boot_dist = [] - for i in range(int(n_boot)): - resampler = integers(0, n_units, n_units, dtype=np.intp) - sample = [[a[i] for i in resampler] for a in args] - lengths = map(len, sample[0]) - resampler = [integers(0, n, n, dtype=np.intp) for n in lengths] - sample = [[c.take(r, axis=0) for c, r in zip(a, resampler)] for a in sample] - sample = list(map(np.concatenate, sample)) - boot_dist.append(func(*sample, **func_kwargs)) - return np.array(boot_dist) diff --git a/seaborn/axisgrid.py b/seaborn/axisgrid.py deleted file mode 100644 index 17d333bc89b79d3b43310493456176e9c69fab4e..0000000000000000000000000000000000000000 --- a/seaborn/axisgrid.py +++ /dev/null @@ -1,2401 +0,0 @@ -from __future__ import annotations -from itertools import product -from inspect import signature -import warnings -from textwrap import dedent - -import numpy as np -import pandas as pd -import matplotlib as mpl -import matplotlib.pyplot as plt - -from ._base import VectorPlotter, variable_type, categorical_order -from ._core.data import handle_data_source -from ._compat import share_axis, get_legend_handles -from . 
import utils -from .utils import ( - adjust_legend_subtitles, - set_hls_values, - _check_argument, - _draw_figure, - _disable_autolayout -) -from .palettes import color_palette, blend_palette -from ._docstrings import ( - DocstringComponents, - _core_docs, -) - -__all__ = ["FacetGrid", "PairGrid", "JointGrid", "pairplot", "jointplot"] - - -_param_docs = DocstringComponents.from_nested_components( - core=_core_docs["params"], -) - - -class _BaseGrid: - """Base class for grids of subplots.""" - - def set(self, **kwargs): - """Set attributes on each subplot Axes.""" - for ax in self.axes.flat: - if ax is not None: # Handle removed axes - ax.set(**kwargs) - return self - - @property - def fig(self): - """DEPRECATED: prefer the `figure` property.""" - # Grid.figure is preferred because it matches the Axes attribute name. - # But as the maintenance burden of having this property is minimal, - # let's be slow about formally deprecating it. For now just note its deprecation - # in the docstring; add a warning in version 0.13, and eventually remove it. - return self._figure - - @property - def figure(self): - """Access the :class:`matplotlib.figure.Figure` object underlying the grid.""" - return self._figure - - def apply(self, func, *args, **kwargs): - """ - Pass the grid to a user-supplied function and return self. - - The `func` must accept an object of this type for its first - positional argument. Additional arguments are passed through. - The return value of `func` is ignored; this method returns self. - See the `pipe` method if you want the return value. - - Added in v0.12.0. - - """ - func(self, *args, **kwargs) - return self - - def pipe(self, func, *args, **kwargs): - """ - Pass the grid to a user-supplied function and return its value. - - The `func` must accept an object of this type for its first - positional argument. Additional arguments are passed through. - The return value of `func` becomes the return value of this method. - See the `apply` method if you want to return self instead. - - Added in v0.12.0. - - """ - return func(self, *args, **kwargs) - - def savefig(self, *args, **kwargs): - """ - Save an image of the plot. - - This wraps :meth:`matplotlib.figure.Figure.savefig`, using bbox_inches="tight" - by default. Parameters are passed through to the matplotlib function. - - """ - kwargs = kwargs.copy() - kwargs.setdefault("bbox_inches", "tight") - self.figure.savefig(*args, **kwargs) - - -class Grid(_BaseGrid): - """A grid that can have multiple subplots and an external legend.""" - _margin_titles = False - _legend_out = True - - def __init__(self): - - self._tight_layout_rect = [0, 0, 1, 1] - self._tight_layout_pad = None - - # This attribute is set externally and is a hack to handle newer functions that - # don't add proxy artists onto the Axes. We need an overall cleaner approach. - self._extract_legend_handles = False - - def tight_layout(self, *args, **kwargs): - """Call fig.tight_layout within rect that excludes the legend.""" - kwargs = kwargs.copy() - kwargs.setdefault("rect", self._tight_layout_rect) - if self._tight_layout_pad is not None: - kwargs.setdefault("pad", self._tight_layout_pad) - self._figure.tight_layout(*args, **kwargs) - return self - - def add_legend(self, legend_data=None, title=None, label_order=None, - adjust_subtitles=False, **kwargs): - """Draw a legend, maybe placing it outside axes and resizing the figure.
- - Parameters - ---------- - legend_data : dict - Dictionary mapping label names (or two-element tuples where the - second element is a label name) to matplotlib artist handles. The - default reads from ``self._legend_data``. - title : string - Title for the legend. The default reads from ``self._hue_var``. - label_order : list of labels - The order that the legend entries should appear in. The default - reads from ``self.hue_names``. - adjust_subtitles : bool - If True, modify entries with invisible artists to left-align - the labels and set the font size to that of a title. - kwargs : key, value pairings - Other keyword arguments are passed to the underlying legend methods - on the Figure or Axes object. - - Returns - ------- - self : Grid instance - Returns self for easy chaining. - - """ - # Find the data for the legend - if legend_data is None: - legend_data = self._legend_data - if label_order is None: - if self.hue_names is None: - label_order = list(legend_data.keys()) - else: - label_order = list(map(utils.to_utf8, self.hue_names)) - - blank_handle = mpl.patches.Patch(alpha=0, linewidth=0) - handles = [legend_data.get(lab, blank_handle) for lab in label_order] - title = self._hue_var if title is None else title - title_size = mpl.rcParams["legend.title_fontsize"] - - # Unpack nested labels from a hierarchical legend - labels = [] - for entry in label_order: - if isinstance(entry, tuple): - _, label = entry - else: - label = entry - labels.append(label) - - # Set default legend kwargs - kwargs.setdefault("scatterpoints", 1) - - if self._legend_out: - - kwargs.setdefault("frameon", False) - kwargs.setdefault("loc", "center right") - - # Draw a full-figure legend outside the grid - figlegend = self._figure.legend(handles, labels, **kwargs) - - self._legend = figlegend - figlegend.set_title(title, prop={"size": title_size}) - - if adjust_subtitles: - adjust_legend_subtitles(figlegend) - - # Draw the plot to set the bounding boxes correctly - _draw_figure(self._figure) - - # Calculate and set the new width of the figure so the legend fits - legend_width = figlegend.get_window_extent().width / self._figure.dpi - fig_width, fig_height = self._figure.get_size_inches() - self._figure.set_size_inches(fig_width + legend_width, fig_height) - - # Draw the plot again to get the new transformations - _draw_figure(self._figure) - - # Now calculate how much space we need on the right side - legend_width = figlegend.get_window_extent().width / self._figure.dpi - space_needed = legend_width / (fig_width + legend_width) - margin = .04 if self._margin_titles else .01 - self._space_needed = margin + space_needed - right = 1 - self._space_needed - - # Place the subplot axes to give space for the legend - self._figure.subplots_adjust(right=right) - self._tight_layout_rect[2] = right - - else: - # Draw a legend in the first axis - ax = self.axes.flat[0] - kwargs.setdefault("loc", "best") - - leg = ax.legend(handles, labels, **kwargs) - leg.set_title(title, prop={"size": title_size}) - self._legend = leg - - if adjust_subtitles: - adjust_legend_subtitles(leg) - - return self - - def _update_legend_data(self, ax): - """Extract the legend data from an axes object and save it.""" - data = {} - - # Get data directly from the legend, which is necessary - # for newer functions that don't add labeled proxy artists - if ax.legend_ is not None and self._extract_legend_handles: - handles = get_legend_handles(ax.legend_) - labels = [t.get_text() for t in ax.legend_.texts] - data.update({label: handle for handle, 
label in zip(handles, labels)}) - - handles, labels = ax.get_legend_handles_labels() - data.update({label: handle for handle, label in zip(handles, labels)}) - - self._legend_data.update(data) - - # Now clear the legend - ax.legend_ = None - - def _get_palette(self, data, hue, hue_order, palette): - """Get a list of colors for the hue variable.""" - if hue is None: - palette = color_palette(n_colors=1) - - else: - hue_names = categorical_order(data[hue], hue_order) - n_colors = len(hue_names) - - # By default use either the current color palette or HUSL - if palette is None: - current_palette = utils.get_color_cycle() - if n_colors > len(current_palette): - colors = color_palette("husl", n_colors) - else: - colors = color_palette(n_colors=n_colors) - - # Allow for palette to map from hue variable names - elif isinstance(palette, dict): - color_names = [palette[h] for h in hue_names] - colors = color_palette(color_names, n_colors) - - # Otherwise act as if we just got a list of colors - else: - colors = color_palette(palette, n_colors) - - palette = color_palette(colors, n_colors) - - return palette - - @property - def legend(self): - """The :class:`matplotlib.legend.Legend` object, if present.""" - try: - return self._legend - except AttributeError: - return None - - def tick_params(self, axis='both', **kwargs): - """Modify the ticks, tick labels, and gridlines. - - Parameters - ---------- - axis : {'x', 'y', 'both'} - The axis on which to apply the formatting. - kwargs : keyword arguments - Additional keyword arguments to pass to - :meth:`matplotlib.axes.Axes.tick_params`. - - Returns - ------- - self : Grid instance - Returns self for easy chaining. - - """ - for ax in self.figure.axes: - ax.tick_params(axis=axis, **kwargs) - return self - - -_facet_docs = dict( - - data=dedent("""\ - data : DataFrame - Tidy ("long-form") dataframe where each column is a variable and each - row is an observation.\ - """), - rowcol=dedent("""\ - row, col : vectors or keys in ``data`` - Variables that define subsets to plot on different facets.\ - """), - rowcol_order=dedent("""\ - {row,col}_order : vector of strings - Specify the order in which levels of the ``row`` and/or ``col`` variables - appear in the grid of subplots.\ - """), - col_wrap=dedent("""\ - col_wrap : int - "Wrap" the column variable at this width, so that the column facets - span multiple rows. Incompatible with a ``row`` facet.\ - """), - share_xy=dedent("""\ - share{x,y} : bool, 'col', or 'row' optional - If true, the facets will share y axes across columns and/or x axes - across rows.\ - """), - height=dedent("""\ - height : scalar - Height (in inches) of each facet. See also: ``aspect``.\ - """), - aspect=dedent("""\ - aspect : scalar - Aspect ratio of each facet, so that ``aspect * height`` gives the width - of each facet in inches.\ - """), - palette=dedent("""\ - palette : palette name, list, or dict - Colors to use for the different levels of the ``hue`` variable. Should - be something that can be interpreted by :func:`color_palette`, or a - dictionary mapping hue levels to matplotlib colors.\ - """), - legend_out=dedent("""\ - legend_out : bool - If ``True``, the figure size will be extended, and the legend will be - drawn outside the plot on the center right.\ - """), - margin_titles=dedent("""\ - margin_titles : bool - If ``True``, the titles for the row variable are drawn to the right of - the last column. 
This option is experimental and may not work in all - cases.\ - """), - facet_kws=dedent("""\ - facet_kws : dict - Additional parameters passed to :class:`FacetGrid`. - """), -) - - -class FacetGrid(Grid): - """Multi-plot grid for plotting conditional relationships.""" - - def __init__( - self, data, *, - row=None, col=None, hue=None, col_wrap=None, - sharex=True, sharey=True, height=3, aspect=1, palette=None, - row_order=None, col_order=None, hue_order=None, hue_kws=None, - dropna=False, legend_out=True, despine=True, - margin_titles=False, xlim=None, ylim=None, subplot_kws=None, - gridspec_kws=None, - ): - - super().__init__() - data = handle_data_source(data) - - # Determine the hue facet layer information - hue_var = hue - if hue is None: - hue_names = None - else: - hue_names = categorical_order(data[hue], hue_order) - - colors = self._get_palette(data, hue, hue_order, palette) - - # Set up the lists of names for the row and column facet variables - if row is None: - row_names = [] - else: - row_names = categorical_order(data[row], row_order) - - if col is None: - col_names = [] - else: - col_names = categorical_order(data[col], col_order) - - # Additional dict of kwarg -> list of values for mapping the hue var - hue_kws = hue_kws if hue_kws is not None else {} - - # Make a boolean mask that is True anywhere there is an NA - # value in one of the faceting variables, but only if dropna is True - none_na = np.zeros(len(data), bool) - if dropna: - row_na = none_na if row is None else data[row].isnull() - col_na = none_na if col is None else data[col].isnull() - hue_na = none_na if hue is None else data[hue].isnull() - not_na = ~(row_na | col_na | hue_na) - else: - not_na = ~none_na - - # Compute the grid shape - ncol = 1 if col is None else len(col_names) - nrow = 1 if row is None else len(row_names) - self._n_facets = ncol * nrow - - self._col_wrap = col_wrap - if col_wrap is not None: - if row is not None: - err = "Cannot use `row` and `col_wrap` together." 
- raise ValueError(err) - ncol = col_wrap - nrow = int(np.ceil(len(col_names) / col_wrap)) - self._ncol = ncol - self._nrow = nrow - - # Calculate the base figure size - # This can get stretched later by a legend - # TODO this doesn't account for axis labels - figsize = (ncol * height * aspect, nrow * height) - - # Validate some inputs - if col_wrap is not None: - margin_titles = False - - # Build the subplot keyword dictionary - subplot_kws = {} if subplot_kws is None else subplot_kws.copy() - gridspec_kws = {} if gridspec_kws is None else gridspec_kws.copy() - if xlim is not None: - subplot_kws["xlim"] = xlim - if ylim is not None: - subplot_kws["ylim"] = ylim - - # --- Initialize the subplot grid - - with _disable_autolayout(): - fig = plt.figure(figsize=figsize) - - if col_wrap is None: - - kwargs = dict(squeeze=False, - sharex=sharex, sharey=sharey, - subplot_kw=subplot_kws, - gridspec_kw=gridspec_kws) - - axes = fig.subplots(nrow, ncol, **kwargs) - - if col is None and row is None: - axes_dict = {} - elif col is None: - axes_dict = dict(zip(row_names, axes.flat)) - elif row is None: - axes_dict = dict(zip(col_names, axes.flat)) - else: - facet_product = product(row_names, col_names) - axes_dict = dict(zip(facet_product, axes.flat)) - - else: - - # If wrapping the col variable we need to make the grid ourselves - if gridspec_kws: - warnings.warn("`gridspec_kws` ignored when using `col_wrap`") - - n_axes = len(col_names) - axes = np.empty(n_axes, object) - axes[0] = fig.add_subplot(nrow, ncol, 1, **subplot_kws) - if sharex: - subplot_kws["sharex"] = axes[0] - if sharey: - subplot_kws["sharey"] = axes[0] - for i in range(1, n_axes): - axes[i] = fig.add_subplot(nrow, ncol, i + 1, **subplot_kws) - - axes_dict = dict(zip(col_names, axes)) - - # --- Set up the class attributes - - # Attributes that are part of the public API but accessed through - # a property so that Sphinx adds them to the auto class doc - self._figure = fig - self._axes = axes - self._axes_dict = axes_dict - self._legend = None - - # Public attributes that aren't explicitly documented - # (It's not obvious that having them be public was a good idea) - self.data = data - self.row_names = row_names - self.col_names = col_names - self.hue_names = hue_names - self.hue_kws = hue_kws - - # Next the private variables - self._nrow = nrow - self._row_var = row - self._ncol = ncol - self._col_var = col - - self._margin_titles = margin_titles - self._margin_titles_texts = [] - self._col_wrap = col_wrap - self._hue_var = hue_var - self._colors = colors - self._legend_out = legend_out - self._legend_data = {} - self._x_var = None - self._y_var = None - self._sharex = sharex - self._sharey = sharey - self._dropna = dropna - self._not_na = not_na - - # --- Make the axes look good - - self.set_titles() - self.tight_layout() - - if despine: - self.despine() - - if sharex in [True, 'col']: - for ax in self._not_bottom_axes: - for label in ax.get_xticklabels(): - label.set_visible(False) - ax.xaxis.offsetText.set_visible(False) - ax.xaxis.label.set_visible(False) - - if sharey in [True, 'row']: - for ax in self._not_left_axes: - for label in ax.get_yticklabels(): - label.set_visible(False) - ax.yaxis.offsetText.set_visible(False) - ax.yaxis.label.set_visible(False) - - __init__.__doc__ = dedent("""\ - Initialize the matplotlib figure and FacetGrid object. - - This class maps a dataset onto multiple axes arrayed in a grid of rows - and columns that correspond to *levels* of variables in the dataset. 
- The plots it produces are often called "lattice", "trellis", or - "small-multiple" graphics. - - It can also represent levels of a third variable with the ``hue`` - parameter, which plots different subsets of data in different colors. - This uses color to resolve elements on a third dimension, but only - draws subsets on top of each other and will not tailor the ``hue`` - parameter for the specific visualization the way that axes-level - functions that accept ``hue`` will. - - The basic workflow is to initialize the :class:`FacetGrid` object with - the dataset and the variables that are used to structure the grid. Then - one or more plotting functions can be applied to each subset by calling - :meth:`FacetGrid.map` or :meth:`FacetGrid.map_dataframe`. Finally, the - plot can be tweaked with other methods to do things like change the - axis labels, use different ticks, or add a legend. See the detailed - code examples below for more information. - - .. warning:: - - When using seaborn functions that infer semantic mappings from a - dataset, care must be taken to synchronize those mappings across - facets (e.g., by defining the ``hue`` mapping with a palette dict or - setting the data type of the variables to ``category``). In most cases, - it will be better to use a figure-level function (e.g. :func:`relplot` - or :func:`catplot`) than to use :class:`FacetGrid` directly. - - See the :ref:`tutorial <grid_tutorial>` for more information. - - Parameters - ---------- - {data} - row, col, hue : strings - Variables that define subsets of the data, which will be drawn on - separate facets in the grid. See the ``{{var}}_order`` parameters to - control the order of levels of this variable. - {col_wrap} - {share_xy} - {height} - {aspect} - {palette} - {{row,col,hue}}_order : lists - Order for the levels of the faceting variables. By default, this - will be the order that the levels appear in ``data`` or, if the - variables are pandas categoricals, the category order. - hue_kws : dictionary of param -> list of values mapping - Other keyword arguments to insert into the plotting call to let - other plot attributes vary across levels of the hue variable (e.g. - the markers in a scatterplot). - {legend_out} - despine : boolean - Remove the top and right spines from the plots. - {margin_titles} - {{x, y}}lim: tuples - Limits for each of the axes on each facet (only relevant when - share{{x, y}} is True). - subplot_kws : dict - Dictionary of keyword arguments passed to matplotlib subplot(s) - methods. - gridspec_kws : dict - Dictionary of keyword arguments passed to - :class:`matplotlib.gridspec.GridSpec` - (via :meth:`matplotlib.figure.Figure.subplots`). - Ignored if ``col_wrap`` is not ``None``. - - See Also - -------- - PairGrid : Subplot grid for plotting pairwise relationships - relplot : Combine a relational plot and a :class:`FacetGrid` - displot : Combine a distribution plot and a :class:`FacetGrid` - catplot : Combine a categorical plot and a :class:`FacetGrid` - lmplot : Combine a regression plot and a :class:`FacetGrid` - - Examples - -------- - - .. note:: - - These examples use seaborn functions to demonstrate some of the - advanced features of the class, but in most cases you will want - to use figure-level functions (e.g. :func:`displot`, :func:`relplot`) - to make the plots shown here. - - .. include:: ../docstrings/FacetGrid.rst - - """).format(**_facet_docs) - - def facet_data(self): - """Generator for name indices and data subsets for each facet.
- - Yields - ------ - (i, j, k), data_ijk : tuple of ints, DataFrame - The ints provide an index into the {row, col, hue}_names attribute, - and the dataframe contains a subset of the full data corresponding - to each facet. The generator yields subsets that correspond with - the self.axes.flat iterator, or self.axes[i, j] when `col_wrap` - is None. - - """ - data = self.data - - # Construct masks for the row variable - if self.row_names: - row_masks = [data[self._row_var] == n for n in self.row_names] - else: - row_masks = [np.repeat(True, len(self.data))] - - # Construct masks for the column variable - if self.col_names: - col_masks = [data[self._col_var] == n for n in self.col_names] - else: - col_masks = [np.repeat(True, len(self.data))] - - # Construct masks for the hue variable - if self.hue_names: - hue_masks = [data[self._hue_var] == n for n in self.hue_names] - else: - hue_masks = [np.repeat(True, len(self.data))] - - # Here is the main generator loop - for (i, row), (j, col), (k, hue) in product(enumerate(row_masks), - enumerate(col_masks), - enumerate(hue_masks)): - data_ijk = data[row & col & hue & self._not_na] - yield (i, j, k), data_ijk - - def map(self, func, *args, **kwargs): - """Apply a plotting function to each facet's subset of the data. - - Parameters - ---------- - func : callable - A plotting function that takes data and keyword arguments. It - must plot to the currently active matplotlib Axes and take a - `color` keyword argument. If faceting on the `hue` dimension, - it must also take a `label` keyword argument. - args : strings - Column names in self.data that identify variables with data to - plot. The data for each variable is passed to `func` in the - order the variables are specified in the call. - kwargs : keyword arguments - All keyword arguments are passed to the plotting function. - - Returns - ------- - self : object - Returns self. 
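-
-        Examples
-        --------
-        A minimal sketch of the usual pattern; ``df``, ``"day"``, and
-        ``"total_bill"`` are illustrative names, and ``plt`` is assumed to
-        be ``matplotlib.pyplot``::
-
-            g = FacetGrid(df, col="day")
-            g.map(plt.hist, "total_bill", bins=20)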
- - """ - # If color was a keyword argument, grab it here - kw_color = kwargs.pop("color", None) - - # How we use the function depends on where it comes from - func_module = str(getattr(func, "__module__", "")) - - # Check for categorical plots without order information - if func_module == "seaborn.categorical": - if "order" not in kwargs: - warning = ("Using the {} function without specifying " - "`order` is likely to produce an incorrect " - "plot.".format(func.__name__)) - warnings.warn(warning) - if len(args) == 3 and "hue_order" not in kwargs: - warning = ("Using the {} function without specifying " - "`hue_order` is likely to produce an incorrect " - "plot.".format(func.__name__)) - warnings.warn(warning) - - # Iterate over the data subsets - for (row_i, col_j, hue_k), data_ijk in self.facet_data(): - - # If this subset is null, move on - if not data_ijk.values.size: - continue - - # Get the current axis - modify_state = not func_module.startswith("seaborn") - ax = self.facet_axis(row_i, col_j, modify_state) - - # Decide what color to plot with - kwargs["color"] = self._facet_color(hue_k, kw_color) - - # Insert the other hue aesthetics if appropriate - for kw, val_list in self.hue_kws.items(): - kwargs[kw] = val_list[hue_k] - - # Insert a label in the keyword arguments for the legend - if self._hue_var is not None: - kwargs["label"] = utils.to_utf8(self.hue_names[hue_k]) - - # Get the actual data we are going to plot with - plot_data = data_ijk[list(args)] - if self._dropna: - plot_data = plot_data.dropna() - plot_args = [v for k, v in plot_data.items()] - - # Some matplotlib functions don't handle pandas objects correctly - if func_module.startswith("matplotlib"): - plot_args = [v.values for v in plot_args] - - # Draw the plot - self._facet_plot(func, ax, plot_args, kwargs) - - # Finalize the annotations and layout - self._finalize_grid(args[:2]) - - return self - - def map_dataframe(self, func, *args, **kwargs): - """Like ``.map`` but passes args as strings and inserts data in kwargs. - - This method is suitable for plotting with functions that accept a - long-form DataFrame as a `data` keyword argument and access the - data in that DataFrame using string variable names. - - Parameters - ---------- - func : callable - A plotting function that takes data and keyword arguments. Unlike - the `map` method, a function used here must "understand" Pandas - objects. It also must plot to the currently active matplotlib Axes - and take a `color` keyword argument. If faceting on the `hue` - dimension, it must also take a `label` keyword argument. - args : strings - Column names in self.data that identify variables with data to - plot. The data for each variable is passed to `func` in the - order the variables are specified in the call. - kwargs : keyword arguments - All keyword arguments are passed to the plotting function. - - Returns - ------- - self : object - Returns self. 
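-
-        Examples
-        --------
-        A minimal sketch; ``df`` and the column names are illustrative, and
-        ``sns`` is assumed to be the imported seaborn namespace::
-
-            g = FacetGrid(df, col="day")
-            g.map_dataframe(sns.scatterplot, "total_bill", "tip")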
- - """ - - # If color was a keyword argument, grab it here - kw_color = kwargs.pop("color", None) - - # Iterate over the data subsets - for (row_i, col_j, hue_k), data_ijk in self.facet_data(): - - # If this subset is null, move on - if not data_ijk.values.size: - continue - - # Get the current axis - modify_state = not str(func.__module__).startswith("seaborn") - ax = self.facet_axis(row_i, col_j, modify_state) - - # Decide what color to plot with - kwargs["color"] = self._facet_color(hue_k, kw_color) - - # Insert the other hue aesthetics if appropriate - for kw, val_list in self.hue_kws.items(): - kwargs[kw] = val_list[hue_k] - - # Insert a label in the keyword arguments for the legend - if self._hue_var is not None: - kwargs["label"] = self.hue_names[hue_k] - - # Stick the facet dataframe into the kwargs - if self._dropna: - data_ijk = data_ijk.dropna() - kwargs["data"] = data_ijk - - # Draw the plot - self._facet_plot(func, ax, args, kwargs) - - # For axis labels, prefer to use positional args for backcompat - # but also extract the x/y kwargs and use if no corresponding arg - axis_labels = [kwargs.get("x", None), kwargs.get("y", None)] - for i, val in enumerate(args[:2]): - axis_labels[i] = val - self._finalize_grid(axis_labels) - - return self - - def _facet_color(self, hue_index, kw_color): - - color = self._colors[hue_index] - if kw_color is not None: - return kw_color - elif color is not None: - return color - - def _facet_plot(self, func, ax, plot_args, plot_kwargs): - - # Draw the plot - if str(func.__module__).startswith("seaborn"): - plot_kwargs = plot_kwargs.copy() - semantics = ["x", "y", "hue", "size", "style"] - for key, val in zip(semantics, plot_args): - plot_kwargs[key] = val - plot_args = [] - plot_kwargs["ax"] = ax - func(*plot_args, **plot_kwargs) - - # Sort out the supporting information - self._update_legend_data(ax) - - def _finalize_grid(self, axlabels): - """Finalize the annotations and layout.""" - self.set_axis_labels(*axlabels) - self.tight_layout() - - def facet_axis(self, row_i, col_j, modify_state=True): - """Make the axis identified by these indices active and return it.""" - - # Calculate the actual indices of the axes to plot on - if self._col_wrap is not None: - ax = self.axes.flat[col_j] - else: - ax = self.axes[row_i, col_j] - - # Get a reference to the axes object we want, and make it active - if modify_state: - plt.sca(ax) - return ax - - def despine(self, **kwargs): - """Remove axis spines from the facets.""" - utils.despine(self._figure, **kwargs) - return self - - def set_axis_labels(self, x_var=None, y_var=None, clear_inner=True, **kwargs): - """Set axis labels on the left column and bottom row of the grid.""" - if x_var is not None: - self._x_var = x_var - self.set_xlabels(x_var, clear_inner=clear_inner, **kwargs) - if y_var is not None: - self._y_var = y_var - self.set_ylabels(y_var, clear_inner=clear_inner, **kwargs) - - return self - - def set_xlabels(self, label=None, clear_inner=True, **kwargs): - """Label the x axis on the bottom row of the grid.""" - if label is None: - label = self._x_var - for ax in self._bottom_axes: - ax.set_xlabel(label, **kwargs) - if clear_inner: - for ax in self._not_bottom_axes: - ax.set_xlabel("") - return self - - def set_ylabels(self, label=None, clear_inner=True, **kwargs): - """Label the y axis on the left column of the grid.""" - if label is None: - label = self._y_var - for ax in self._left_axes: - ax.set_ylabel(label, **kwargs) - if clear_inner: - for ax in self._not_left_axes: - ax.set_ylabel("") - 
return self - - def set_xticklabels(self, labels=None, step=None, **kwargs): - """Set x axis tick labels of the grid.""" - for ax in self.axes.flat: - curr_ticks = ax.get_xticks() - ax.set_xticks(curr_ticks) - if labels is None: - curr_labels = [label.get_text() for label in ax.get_xticklabels()] - if step is not None: - xticks = ax.get_xticks()[::step] - curr_labels = curr_labels[::step] - ax.set_xticks(xticks) - ax.set_xticklabels(curr_labels, **kwargs) - else: - ax.set_xticklabels(labels, **kwargs) - return self - - def set_yticklabels(self, labels=None, **kwargs): - """Set y axis tick labels on the left column of the grid.""" - for ax in self.axes.flat: - curr_ticks = ax.get_yticks() - ax.set_yticks(curr_ticks) - if labels is None: - curr_labels = [label.get_text() for label in ax.get_yticklabels()] - ax.set_yticklabels(curr_labels, **kwargs) - else: - ax.set_yticklabels(labels, **kwargs) - return self - - def set_titles(self, template=None, row_template=None, col_template=None, **kwargs): - """Draw titles either above each facet or on the grid margins. - - Parameters - ---------- - template : string - Template for all titles with the formatting keys {col_var} and - {col_name} (if using a `col` faceting variable) and/or {row_var} - and {row_name} (if using a `row` faceting variable). - row_template: - Template for the row variable when titles are drawn on the grid - margins. Must have {row_var} and {row_name} formatting keys. - col_template: - Template for the column variable when titles are drawn on the grid - margins. Must have {col_var} and {col_name} formatting keys. - - Returns - ------- - self: object - Returns self. - - """ - args = dict(row_var=self._row_var, col_var=self._col_var) - kwargs["size"] = kwargs.pop("size", mpl.rcParams["axes.labelsize"]) - - # Establish default templates - if row_template is None: - row_template = "{row_var} = {row_name}" - if col_template is None: - col_template = "{col_var} = {col_name}" - if template is None: - if self._row_var is None: - template = col_template - elif self._col_var is None: - template = row_template - else: - template = " | ".join([row_template, col_template]) - - row_template = utils.to_utf8(row_template) - col_template = utils.to_utf8(col_template) - template = utils.to_utf8(template) - - if self._margin_titles: - - # Remove any existing title texts - for text in self._margin_titles_texts: - text.remove() - self._margin_titles_texts = [] - - if self.row_names is not None: - # Draw the row titles on the right edge of the grid - for i, row_name in enumerate(self.row_names): - ax = self.axes[i, -1] - args.update(dict(row_name=row_name)) - title = row_template.format(**args) - text = ax.annotate( - title, xy=(1.02, .5), xycoords="axes fraction", - rotation=270, ha="left", va="center", - **kwargs - ) - self._margin_titles_texts.append(text) - - if self.col_names is not None: - # Draw the column titles as normal titles - for j, col_name in enumerate(self.col_names): - args.update(dict(col_name=col_name)) - title = col_template.format(**args) - self.axes[0, j].set_title(title, **kwargs) - - return self - - # Otherwise title each facet with all the necessary information - if (self._row_var is not None) and (self._col_var is not None): - for i, row_name in enumerate(self.row_names): - for j, col_name in enumerate(self.col_names): - args.update(dict(row_name=row_name, col_name=col_name)) - title = template.format(**args) - self.axes[i, j].set_title(title, **kwargs) - elif self.row_names is not None and len(self.row_names): - for i, 
row_name in enumerate(self.row_names): - args.update(dict(row_name=row_name)) - title = template.format(**args) - self.axes[i, 0].set_title(title, **kwargs) - elif self.col_names is not None and len(self.col_names): - for i, col_name in enumerate(self.col_names): - args.update(dict(col_name=col_name)) - title = template.format(**args) - # Index the flat array so col_wrap works - self.axes.flat[i].set_title(title, **kwargs) - return self - - def refline(self, *, x=None, y=None, color='.5', linestyle='--', **line_kws): - """Add a reference line(s) to each facet. - - Parameters - ---------- - x, y : numeric - Value(s) to draw the line(s) at. - color : :mod:`matplotlib color <matplotlib.colors>` - Specifies the color of the reference line(s). Pass ``color=None`` to - use ``hue`` mapping. - linestyle : str - Specifies the style of the reference line(s). - line_kws : key, value mappings - Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.axvline` - when ``x`` is not None and :meth:`matplotlib.axes.Axes.axhline` when ``y`` - is not None. - - Returns - ------- - :class:`FacetGrid` instance - Returns ``self`` for easy method chaining. - - """ - line_kws['color'] = color - line_kws['linestyle'] = linestyle - - if x is not None: - self.map(plt.axvline, x=x, **line_kws) - - if y is not None: - self.map(plt.axhline, y=y, **line_kws) - - return self - - # ------ Properties that are part of the public API and documented by Sphinx - - @property - def axes(self): - """An array of the :class:`matplotlib.axes.Axes` objects in the grid.""" - return self._axes - - @property - def ax(self): - """The :class:`matplotlib.axes.Axes` when no faceting variables are assigned.""" - if self.axes.shape == (1, 1): - return self.axes[0, 0] - else: - err = ( - "Use the `.axes` attribute when facet variables are assigned." - ) - raise AttributeError(err) - - @property - def axes_dict(self): - """A mapping of facet names to corresponding :class:`matplotlib.axes.Axes`. - - If only one of ``row`` or ``col`` is assigned, each key is a string - representing a level of that variable. If both facet dimensions are - assigned, each key is a ``({row_level}, {col_level})`` tuple. 
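-
-        For example (``df`` and the ``"a"`` level are illustrative)::
-
-            g = FacetGrid(df, col="letter")
-            ax = g.axes_dict["a"]  # the Axes drawing the letter == "a" facet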
- - """ - return self._axes_dict - - # ------ Private properties, that require some computation to get - - @property - def _inner_axes(self): - """Return a flat array of the inner axes.""" - if self._col_wrap is None: - return self.axes[:-1, 1:].flat - else: - axes = [] - n_empty = self._nrow * self._ncol - self._n_facets - for i, ax in enumerate(self.axes): - append = ( - i % self._ncol - and i < (self._ncol * (self._nrow - 1)) - and i < (self._ncol * (self._nrow - 1) - n_empty) - ) - if append: - axes.append(ax) - return np.array(axes, object).flat - - @property - def _left_axes(self): - """Return a flat array of the left column of axes.""" - if self._col_wrap is None: - return self.axes[:, 0].flat - else: - axes = [] - for i, ax in enumerate(self.axes): - if not i % self._ncol: - axes.append(ax) - return np.array(axes, object).flat - - @property - def _not_left_axes(self): - """Return a flat array of axes that aren't on the left column.""" - if self._col_wrap is None: - return self.axes[:, 1:].flat - else: - axes = [] - for i, ax in enumerate(self.axes): - if i % self._ncol: - axes.append(ax) - return np.array(axes, object).flat - - @property - def _bottom_axes(self): - """Return a flat array of the bottom row of axes.""" - if self._col_wrap is None: - return self.axes[-1, :].flat - else: - axes = [] - n_empty = self._nrow * self._ncol - self._n_facets - for i, ax in enumerate(self.axes): - append = ( - i >= (self._ncol * (self._nrow - 1)) - or i >= (self._ncol * (self._nrow - 1) - n_empty) - ) - if append: - axes.append(ax) - return np.array(axes, object).flat - - @property - def _not_bottom_axes(self): - """Return a flat array of axes that aren't on the bottom row.""" - if self._col_wrap is None: - return self.axes[:-1, :].flat - else: - axes = [] - n_empty = self._nrow * self._ncol - self._n_facets - for i, ax in enumerate(self.axes): - append = ( - i < (self._ncol * (self._nrow - 1)) - and i < (self._ncol * (self._nrow - 1) - n_empty) - ) - if append: - axes.append(ax) - return np.array(axes, object).flat - - -class PairGrid(Grid): - """Subplot grid for plotting pairwise relationships in a dataset. - - This object maps each variable in a dataset onto a column and row in a - grid of multiple axes. Different axes-level plotting functions can be - used to draw bivariate plots in the upper and lower triangles, and the - marginal distribution of each variable can be shown on the diagonal. - - Several different common plots can be generated in a single line using - :func:`pairplot`. Use :class:`PairGrid` when you need more flexibility. - - See the :ref:`tutorial <grid_tutorial>` for more information. - - """ - def __init__( - self, data, *, hue=None, vars=None, x_vars=None, y_vars=None, - hue_order=None, palette=None, hue_kws=None, corner=False, diag_sharey=True, - height=2.5, aspect=1, layout_pad=.5, despine=True, dropna=False, - ): - """Initialize the plot figure and PairGrid object. - - Parameters - ---------- - data : DataFrame - Tidy (long-form) dataframe where each column is a variable and - each row is an observation. - hue : string (variable name) - Variable in ``data`` to map plot aspects to different colors. This - variable will be excluded from the default x and y variables. - vars : list of variable names - Variables within ``data`` to use, otherwise use every column with - a numeric datatype. - {x, y}_vars : lists of variable names - Variables within ``data`` to use separately for the rows and - columns of the figure; i.e. to make a non-square plot. 
- hue_order : list of strings - Order for the levels of the hue variable in the palette - palette : dict or seaborn color palette - Set of colors for mapping the ``hue`` variable. If a dict, keys - should be values in the ``hue`` variable. - hue_kws : dictionary of param -> list of values mapping - Other keyword arguments to insert into the plotting call to let - other plot attributes vary across levels of the hue variable (e.g. - the markers in a scatterplot). - corner : bool - If True, don't add axes to the upper (off-diagonal) triangle of the - grid, making this a "corner" plot. - height : scalar - Height (in inches) of each facet. - aspect : scalar - Aspect * height gives the width (in inches) of each facet. - layout_pad : scalar - Padding between axes; passed to ``fig.tight_layout``. - despine : boolean - Remove the top and right spines from the plots. - dropna : boolean - Drop missing values from the data before plotting. - - See Also - -------- - pairplot : Easily drawing common uses of :class:`PairGrid`. - FacetGrid : Subplot grid for plotting conditional relationships. - - Examples - -------- - - .. include:: ../docstrings/PairGrid.rst - - """ - - super().__init__() - data = handle_data_source(data) - - # Sort out the variables that define the grid - numeric_cols = self._find_numeric_cols(data) - if hue in numeric_cols: - numeric_cols.remove(hue) - if vars is not None: - x_vars = list(vars) - y_vars = list(vars) - if x_vars is None: - x_vars = numeric_cols - if y_vars is None: - y_vars = numeric_cols - - if np.isscalar(x_vars): - x_vars = [x_vars] - if np.isscalar(y_vars): - y_vars = [y_vars] - - self.x_vars = x_vars = list(x_vars) - self.y_vars = y_vars = list(y_vars) - self.square_grid = self.x_vars == self.y_vars - - if not x_vars: - raise ValueError("No variables found for grid columns.") - if not y_vars: - raise ValueError("No variables found for grid rows.") - - # Create the figure and the array of subplots - figsize = len(x_vars) * height * aspect, len(y_vars) * height - - with _disable_autolayout(): - fig = plt.figure(figsize=figsize) - - axes = fig.subplots(len(y_vars), len(x_vars), - sharex="col", sharey="row", - squeeze=False) - - # Possibly remove upper axes to make a corner grid - # Note: setting up the axes is usually the most time-intensive part - # of using the PairGrid. We are foregoing the speed improvement that - # we would get by just not setting up the hidden axes so that we can - # avoid implementing fig.subplots ourselves. But worth thinking about. - self._corner = corner - if corner: - hide_indices = np.triu_indices_from(axes, 1) - for i, j in zip(*hide_indices): - axes[i, j].remove() - axes[i, j] = None - - self._figure = fig - self.axes = axes - self.data = data - - # Save what we are going to do with the diagonal - self.diag_sharey = diag_sharey - self.diag_vars = None - self.diag_axes = None - - self._dropna = dropna - - # Label the axes - self._add_axis_labels() - - # Sort out the hue variable - self._hue_var = hue - if hue is None: - self.hue_names = hue_order = ["_nolegend_"] - self.hue_vals = pd.Series(["_nolegend_"] * len(data), - index=data.index) - else: - # We need hue_order and hue_names because the former is used to control - # the order of drawing and the latter is used to control the order of - # the legend. hue_names can become string-typed while hue_order must - # retain the type of the input data. 
This is messy but results from - # the fact that PairGrid can implement the hue-mapping logic itself - # (and was originally written exclusively that way) but now can delegate - # to the axes-level functions, while always handling legend creation. - # See GH2307 - hue_names = hue_order = categorical_order(data[hue], hue_order) - if dropna: - # Filter NA from the list of unique hue names - hue_names = list(filter(pd.notnull, hue_names)) - self.hue_names = hue_names - self.hue_vals = data[hue] - - # Additional dict of kwarg -> list of values for mapping the hue var - self.hue_kws = hue_kws if hue_kws is not None else {} - - self._orig_palette = palette - self._hue_order = hue_order - self.palette = self._get_palette(data, hue, hue_order, palette) - self._legend_data = {} - - # Make the plot look nice - for ax in axes[:-1, :].flat: - if ax is None: - continue - for label in ax.get_xticklabels(): - label.set_visible(False) - ax.xaxis.offsetText.set_visible(False) - ax.xaxis.label.set_visible(False) - - for ax in axes[:, 1:].flat: - if ax is None: - continue - for label in ax.get_yticklabels(): - label.set_visible(False) - ax.yaxis.offsetText.set_visible(False) - ax.yaxis.label.set_visible(False) - - self._tight_layout_rect = [.01, .01, .99, .99] - self._tight_layout_pad = layout_pad - self._despine = despine - if despine: - utils.despine(fig=fig) - self.tight_layout(pad=layout_pad) - - def map(self, func, **kwargs): - """Plot with the same function in every subplot. - - Parameters - ---------- - func : callable plotting function - Must take x, y arrays as positional arguments and draw onto the - "currently active" matplotlib Axes. Also needs to accept kwargs - called ``color`` and ``label``. - - """ - row_indices, col_indices = np.indices(self.axes.shape) - indices = zip(row_indices.flat, col_indices.flat) - self._map_bivariate(func, indices, **kwargs) - - return self - - def map_lower(self, func, **kwargs): - """Plot with a bivariate function on the lower diagonal subplots. - - Parameters - ---------- - func : callable plotting function - Must take x, y arrays as positional arguments and draw onto the - "currently active" matplotlib Axes. Also needs to accept kwargs - called ``color`` and ``label``. - - """ - indices = zip(*np.tril_indices_from(self.axes, -1)) - self._map_bivariate(func, indices, **kwargs) - return self - - def map_upper(self, func, **kwargs): - """Plot with a bivariate function on the upper diagonal subplots. - - Parameters - ---------- - func : callable plotting function - Must take x, y arrays as positional arguments and draw onto the - "currently active" matplotlib Axes. Also needs to accept kwargs - called ``color`` and ``label``. - - """ - indices = zip(*np.triu_indices_from(self.axes, 1)) - self._map_bivariate(func, indices, **kwargs) - return self - - def map_offdiag(self, func, **kwargs): - """Plot with a bivariate function on the off-diagonal subplots. - - Parameters - ---------- - func : callable plotting function - Must take x, y arrays as positional arguments and draw onto the - "currently active" matplotlib Axes. Also needs to accept kwargs - called ``color`` and ``label``. 
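-
-        A minimal sketch (``df`` is an illustrative all-numeric DataFrame,
-        and ``plt`` is ``matplotlib.pyplot``)::
-
-            g = PairGrid(df)
-            g.map_offdiag(plt.scatter)
-            g.map_diag(plt.hist)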
- - """ - if self.square_grid: - self.map_lower(func, **kwargs) - if not self._corner: - self.map_upper(func, **kwargs) - else: - indices = [] - for i, (y_var) in enumerate(self.y_vars): - for j, (x_var) in enumerate(self.x_vars): - if x_var != y_var: - indices.append((i, j)) - self._map_bivariate(func, indices, **kwargs) - return self - - def map_diag(self, func, **kwargs): - """Plot with a univariate function on each diagonal subplot. - - Parameters - ---------- - func : callable plotting function - Must take an x array as a positional argument and draw onto the - "currently active" matplotlib Axes. Also needs to accept kwargs - called ``color`` and ``label``. - - """ - # Add special diagonal axes for the univariate plot - if self.diag_axes is None: - diag_vars = [] - diag_axes = [] - for i, y_var in enumerate(self.y_vars): - for j, x_var in enumerate(self.x_vars): - if x_var == y_var: - - # Make the density axes - diag_vars.append(x_var) - ax = self.axes[i, j] - diag_ax = ax.twinx() - diag_ax.set_axis_off() - diag_axes.append(diag_ax) - - # Work around matplotlib bug - # https://github.com/matplotlib/matplotlib/issues/15188 - if not plt.rcParams.get("ytick.left", True): - for tick in ax.yaxis.majorTicks: - tick.tick1line.set_visible(False) - - # Remove main y axis from density axes in a corner plot - if self._corner: - ax.yaxis.set_visible(False) - if self._despine: - utils.despine(ax=ax, left=True) - # TODO add optional density ticks (on the right) - # when drawing a corner plot? - - if self.diag_sharey and diag_axes: - for ax in diag_axes[1:]: - share_axis(diag_axes[0], ax, "y") - - self.diag_vars = diag_vars - self.diag_axes = diag_axes - - if "hue" not in signature(func).parameters: - return self._map_diag_iter_hue(func, **kwargs) - - # Loop over diagonal variables and axes, making one plot in each - for var, ax in zip(self.diag_vars, self.diag_axes): - - plot_kwargs = kwargs.copy() - if str(func.__module__).startswith("seaborn"): - plot_kwargs["ax"] = ax - else: - plt.sca(ax) - - vector = self.data[var] - if self._hue_var is not None: - hue = self.data[self._hue_var] - else: - hue = None - - if self._dropna: - not_na = vector.notna() - if hue is not None: - not_na &= hue.notna() - vector = vector[not_na] - if hue is not None: - hue = hue[not_na] - - plot_kwargs.setdefault("hue", hue) - plot_kwargs.setdefault("hue_order", self._hue_order) - plot_kwargs.setdefault("palette", self._orig_palette) - func(x=vector, **plot_kwargs) - ax.legend_ = None - - self._add_axis_labels() - return self - - def _map_diag_iter_hue(self, func, **kwargs): - """Put marginal plot on each diagonal axes, iterating over hue.""" - # Plot on each of the diagonal axes - fixed_color = kwargs.pop("color", None) - - for var, ax in zip(self.diag_vars, self.diag_axes): - hue_grouped = self.data[var].groupby(self.hue_vals, observed=True) - - plot_kwargs = kwargs.copy() - if str(func.__module__).startswith("seaborn"): - plot_kwargs["ax"] = ax - else: - plt.sca(ax) - - for k, label_k in enumerate(self._hue_order): - - # Attempt to get data for this level, allowing for empty - try: - data_k = hue_grouped.get_group(label_k) - except KeyError: - data_k = pd.Series([], dtype=float) - - if fixed_color is None: - color = self.palette[k] - else: - color = fixed_color - - if self._dropna: - data_k = utils.remove_na(data_k) - - if str(func.__module__).startswith("seaborn"): - func(x=data_k, label=label_k, color=color, **plot_kwargs) - else: - func(data_k, label=label_k, color=color, **plot_kwargs) - - self._add_axis_labels() - 
- return self - - def _map_bivariate(self, func, indices, **kwargs): - """Draw a bivariate plot on the indicated axes.""" - # This is a hack to handle the fact that new distribution plots don't add - # their artists onto the axes. This is probably superior in general, but - # we'll need a better way to handle it in the axisgrid functions. - from .distributions import histplot, kdeplot - if func is histplot or func is kdeplot: - self._extract_legend_handles = True - - kws = kwargs.copy() # Use copy as we insert other kwargs - for i, j in indices: - x_var = self.x_vars[j] - y_var = self.y_vars[i] - ax = self.axes[i, j] - if ax is None: # i.e. we are in corner mode - continue - self._plot_bivariate(x_var, y_var, ax, func, **kws) - self._add_axis_labels() - - if "hue" in signature(func).parameters: - self.hue_names = list(self._legend_data) - - def _plot_bivariate(self, x_var, y_var, ax, func, **kwargs): - """Draw a bivariate plot on the specified axes.""" - if "hue" not in signature(func).parameters: - self._plot_bivariate_iter_hue(x_var, y_var, ax, func, **kwargs) - return - - kwargs = kwargs.copy() - if str(func.__module__).startswith("seaborn"): - kwargs["ax"] = ax - else: - plt.sca(ax) - - if x_var == y_var: - axes_vars = [x_var] - else: - axes_vars = [x_var, y_var] - - if self._hue_var is not None and self._hue_var not in axes_vars: - axes_vars.append(self._hue_var) - - data = self.data[axes_vars] - if self._dropna: - data = data.dropna() - - x = data[x_var] - y = data[y_var] - if self._hue_var is None: - hue = None - else: - hue = data.get(self._hue_var) - - if "hue" not in kwargs: - kwargs.update({ - "hue": hue, "hue_order": self._hue_order, "palette": self._orig_palette, - }) - func(x=x, y=y, **kwargs) - - self._update_legend_data(ax) - - def _plot_bivariate_iter_hue(self, x_var, y_var, ax, func, **kwargs): - """Draw a bivariate plot while iterating over hue subsets.""" - kwargs = kwargs.copy() - if str(func.__module__).startswith("seaborn"): - kwargs["ax"] = ax - else: - plt.sca(ax) - - if x_var == y_var: - axes_vars = [x_var] - else: - axes_vars = [x_var, y_var] - - hue_grouped = self.data.groupby(self.hue_vals, observed=True) - for k, label_k in enumerate(self._hue_order): - - kws = kwargs.copy() - - # Attempt to get data for this level, allowing for empty - try: - data_k = hue_grouped.get_group(label_k) - except KeyError: - data_k = pd.DataFrame(columns=axes_vars, - dtype=float) - - if self._dropna: - data_k = data_k[axes_vars].dropna() - - x = data_k[x_var] - y = data_k[y_var] - - for kw, val_list in self.hue_kws.items(): - kws[kw] = val_list[k] - kws.setdefault("color", self.palette[k]) - if self._hue_var is not None: - kws["label"] = label_k - - if str(func.__module__).startswith("seaborn"): - func(x=x, y=y, **kws) - else: - func(x, y, **kws) - - self._update_legend_data(ax) - - def _add_axis_labels(self): - """Add labels to the left and bottom Axes.""" - for ax, label in zip(self.axes[-1, :], self.x_vars): - ax.set_xlabel(label) - for ax, label in zip(self.axes[:, 0], self.y_vars): - ax.set_ylabel(label) - - def _find_numeric_cols(self, data): - """Find which variables in a DataFrame are numeric.""" - numeric_cols = [] - for col in data: - if variable_type(data[col]) == "numeric": - numeric_cols.append(col) - return numeric_cols - - -class JointGrid(_BaseGrid): - """Grid for drawing a bivariate plot with marginal univariate plots. - - Many plots can be drawn by using the figure-level interface :func:`jointplot`. - Use this class directly when you need more flexibility. 
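-
-    A minimal sketch (``df``, ``"x_col"``, and ``"y_col"`` are illustrative,
-    and ``sns`` is the imported seaborn namespace)::
-
-        g = JointGrid(data=df, x="x_col", y="y_col")
-        g.plot(sns.scatterplot, sns.histplot)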
- - """ - - def __init__( - self, data=None, *, - x=None, y=None, hue=None, - height=6, ratio=5, space=.2, - palette=None, hue_order=None, hue_norm=None, - dropna=False, xlim=None, ylim=None, marginal_ticks=False, - ): - - # Set up the subplot grid - f = plt.figure(figsize=(height, height)) - gs = plt.GridSpec(ratio + 1, ratio + 1) - - ax_joint = f.add_subplot(gs[1:, :-1]) - ax_marg_x = f.add_subplot(gs[0, :-1], sharex=ax_joint) - ax_marg_y = f.add_subplot(gs[1:, -1], sharey=ax_joint) - - self._figure = f - self.ax_joint = ax_joint - self.ax_marg_x = ax_marg_x - self.ax_marg_y = ax_marg_y - - # Turn off tick visibility for the measure axis on the marginal plots - plt.setp(ax_marg_x.get_xticklabels(), visible=False) - plt.setp(ax_marg_y.get_yticklabels(), visible=False) - plt.setp(ax_marg_x.get_xticklabels(minor=True), visible=False) - plt.setp(ax_marg_y.get_yticklabels(minor=True), visible=False) - - # Turn off the ticks on the density axis for the marginal plots - if not marginal_ticks: - plt.setp(ax_marg_x.yaxis.get_majorticklines(), visible=False) - plt.setp(ax_marg_x.yaxis.get_minorticklines(), visible=False) - plt.setp(ax_marg_y.xaxis.get_majorticklines(), visible=False) - plt.setp(ax_marg_y.xaxis.get_minorticklines(), visible=False) - plt.setp(ax_marg_x.get_yticklabels(), visible=False) - plt.setp(ax_marg_y.get_xticklabels(), visible=False) - plt.setp(ax_marg_x.get_yticklabels(minor=True), visible=False) - plt.setp(ax_marg_y.get_xticklabels(minor=True), visible=False) - ax_marg_x.yaxis.grid(False) - ax_marg_y.xaxis.grid(False) - - # Process the input variables - p = VectorPlotter(data=data, variables=dict(x=x, y=y, hue=hue)) - plot_data = p.plot_data.loc[:, p.plot_data.notna().any()] - - # Possibly drop NA - if dropna: - plot_data = plot_data.dropna() - - def get_var(var): - vector = plot_data.get(var, None) - if vector is not None: - vector = vector.rename(p.variables.get(var, None)) - return vector - - self.x = get_var("x") - self.y = get_var("y") - self.hue = get_var("hue") - - for axis in "xy": - name = p.variables.get(axis, None) - if name is not None: - getattr(ax_joint, f"set_{axis}label")(name) - - if xlim is not None: - ax_joint.set_xlim(xlim) - if ylim is not None: - ax_joint.set_ylim(ylim) - - # Store the semantic mapping parameters for axes-level functions - self._hue_params = dict(palette=palette, hue_order=hue_order, hue_norm=hue_norm) - - # Make the grid look nice - utils.despine(f) - if not marginal_ticks: - utils.despine(ax=ax_marg_x, left=True) - utils.despine(ax=ax_marg_y, bottom=True) - for axes in [ax_marg_x, ax_marg_y]: - for axis in [axes.xaxis, axes.yaxis]: - axis.label.set_visible(False) - f.tight_layout() - f.subplots_adjust(hspace=space, wspace=space) - - def _inject_kwargs(self, func, kws, params): - """Add params to kws if they are accepted by func.""" - func_params = signature(func).parameters - for key, val in params.items(): - if key in func_params: - kws.setdefault(key, val) - - def plot(self, joint_func, marginal_func, **kwargs): - """Draw the plot by passing functions for joint and marginal axes. - - This method passes the ``kwargs`` dictionary to both functions. If you - need more control, call :meth:`JointGrid.plot_joint` and - :meth:`JointGrid.plot_marginals` directly with specific parameters. - - Parameters - ---------- - joint_func, marginal_func : callables - Functions to draw the bivariate and univariate plots. See methods - referenced above for information about the required characteristics - of these functions. 
-        kwargs
-            Additional keyword arguments are passed to both functions.
-
-        Returns
-        -------
-        :class:`JointGrid` instance
-            Returns ``self`` for easy method chaining.
-
-        """
-        self.plot_marginals(marginal_func, **kwargs)
-        self.plot_joint(joint_func, **kwargs)
-        return self
-
-    def plot_joint(self, func, **kwargs):
-        """Draw a bivariate plot on the joint axes of the grid.
-
-        Parameters
-        ----------
-        func : plotting callable
-            If a seaborn function, it should accept ``x`` and ``y``. Otherwise,
-            it must accept ``x`` and ``y`` vectors of data as the first two
-            positional arguments, and it must plot on the "current" axes.
-            If ``hue`` was defined in the class constructor, the function must
-            accept ``hue`` as a parameter.
-        kwargs
-            Keyword arguments are passed to the plotting function.
-
-        Returns
-        -------
-        :class:`JointGrid` instance
-            Returns ``self`` for easy method chaining.
-
-        """
-        kwargs = kwargs.copy()
-        if str(func.__module__).startswith("seaborn"):
-            kwargs["ax"] = self.ax_joint
-        else:
-            plt.sca(self.ax_joint)
-        if self.hue is not None:
-            kwargs["hue"] = self.hue
-            self._inject_kwargs(func, kwargs, self._hue_params)
-
-        if str(func.__module__).startswith("seaborn"):
-            func(x=self.x, y=self.y, **kwargs)
-        else:
-            func(self.x, self.y, **kwargs)
-
-        return self
-
-    def plot_marginals(self, func, **kwargs):
-        """Draw univariate plots on each marginal axes.
-
-        Parameters
-        ----------
-        func : plotting callable
-            If a seaborn function, it should accept ``x`` and ``y`` and plot
-            when only one of them is defined. Otherwise, it must accept a vector
-            of data as the first positional argument and determine its orientation
-            using the ``vertical`` parameter, and it must plot on the "current" axes.
-            If ``hue`` was defined in the class constructor, it must accept ``hue``
-            as a parameter.
-        kwargs
-            Keyword arguments are passed to the plotting function.
-
-        Returns
-        -------
-        :class:`JointGrid` instance
-            Returns ``self`` for easy method chaining.
-
-        """
-        seaborn_func = (
-            str(func.__module__).startswith("seaborn")
-            # deprecated distplot has a legacy API, special case it
-            and not func.__name__ == "distplot"
-        )
-        func_params = signature(func).parameters
-        kwargs = kwargs.copy()
-        if self.hue is not None:
-            kwargs["hue"] = self.hue
-            self._inject_kwargs(func, kwargs, self._hue_params)
-
-        if "legend" in func_params:
-            kwargs.setdefault("legend", False)
-
-        if "orientation" in func_params:
-            # e.g. plt.hist
-            orient_kw_x = {"orientation": "vertical"}
-            orient_kw_y = {"orientation": "horizontal"}
-        elif "vertical" in func_params:
-            # e.g. sns.distplot (also how did this get backwards?)
-            orient_kw_x = {"vertical": False}
-            orient_kw_y = {"vertical": True}
-        else:
-            # Fall back to empty dicts so the non-seaborn branches below
-            # don't raise NameError for functions with neither parameter
-            orient_kw_x = orient_kw_y = {}
-
-        if seaborn_func:
-            func(x=self.x, ax=self.ax_marg_x, **kwargs)
-        else:
-            plt.sca(self.ax_marg_x)
-            func(self.x, **orient_kw_x, **kwargs)
-
-        if seaborn_func:
-            func(y=self.y, ax=self.ax_marg_y, **kwargs)
-        else:
-            plt.sca(self.ax_marg_y)
-            func(self.y, **orient_kw_y, **kwargs)
-
-        self.ax_marg_x.yaxis.get_label().set_visible(False)
-        self.ax_marg_y.xaxis.get_label().set_visible(False)
-
-        return self
-
-    def refline(
-        self, *, x=None, y=None, joint=True, marginal=True,
-        color='.5', linestyle='--', **line_kws
-    ):
-        """Add a reference line(s) to joint and/or marginal axes.
-
-        Parameters
-        ----------
-        x, y : numeric
-            Value(s) to draw the line(s) at.
-        joint, marginal : bools
-            Whether to add the reference line(s) to the joint/marginal axes.
-        color : :mod:`matplotlib color <matplotlib.colors>`
-            Specifies the color of the reference line(s).
-        linestyle : str
-            Specifies the style of the reference line(s).
-        line_kws : key, value mappings
-            Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.axvline`
-            when ``x`` is not None and :meth:`matplotlib.axes.Axes.axhline` when ``y``
-            is not None.
-
-        Returns
-        -------
-        :class:`JointGrid` instance
-            Returns ``self`` for easy method chaining.
-
-        """
-        line_kws['color'] = color
-        line_kws['linestyle'] = linestyle
-
-        if x is not None:
-            if joint:
-                self.ax_joint.axvline(x, **line_kws)
-            if marginal:
-                self.ax_marg_x.axvline(x, **line_kws)
-
-        if y is not None:
-            if joint:
-                self.ax_joint.axhline(y, **line_kws)
-            if marginal:
-                self.ax_marg_y.axhline(y, **line_kws)
-
-        return self
-
-    def set_axis_labels(self, xlabel="", ylabel="", **kwargs):
-        """Set axis labels on the bivariate axes.
-
-        Parameters
-        ----------
-        xlabel, ylabel : strings
-            Label names for the x and y variables.
-        kwargs : key, value mappings
-            Other keyword arguments are passed to the following functions:
-
-            - :meth:`matplotlib.axes.Axes.set_xlabel`
-            - :meth:`matplotlib.axes.Axes.set_ylabel`
-
-        Returns
-        -------
-        :class:`JointGrid` instance
-            Returns ``self`` for easy method chaining.
-
-        """
-        self.ax_joint.set_xlabel(xlabel, **kwargs)
-        self.ax_joint.set_ylabel(ylabel, **kwargs)
-        return self
-
-
-JointGrid.__init__.__doc__ = """\
-Set up the grid of subplots and store data internally for easy plotting.
-
-Parameters
-----------
-{params.core.data}
-{params.core.xy}
-height : number
-    Size of each side of the figure in inches (it will be square).
-ratio : number
-    Ratio of joint axes height to marginal axes height.
-space : number
-    Space between the joint and marginal axes.
-dropna : bool
-    If True, remove missing observations before plotting.
-{{x, y}}lim : pairs of numbers
-    Set axis limits to these values before plotting.
-marginal_ticks : bool
-    If False, suppress ticks on the count/density axis of the marginal plots.
-{params.core.hue}
-    Note: unlike in :class:`FacetGrid` or :class:`PairGrid`, the axes-level
-    functions must support ``hue`` to use it in :class:`JointGrid`.
-{params.core.palette}
-{params.core.hue_order}
-{params.core.hue_norm}
-
-See Also
---------
-{seealso.jointplot}
-{seealso.pairgrid}
-{seealso.pairplot}
-
-Examples
---------
-
-.. include:: ../docstrings/JointGrid.rst
-
-""".format(
-    params=_param_docs,
-    seealso=_core_docs["seealso"],
-)
-
-
-def pairplot(
-    data, *,
-    hue=None, hue_order=None, palette=None,
-    vars=None, x_vars=None, y_vars=None,
-    kind="scatter", diag_kind="auto", markers=None,
-    height=2.5, aspect=1, corner=False, dropna=False,
-    plot_kws=None, diag_kws=None, grid_kws=None, size=None,
-):
-    """Plot pairwise relationships in a dataset.
-
-    By default, this function will create a grid of Axes such that each numeric
-    variable in ``data`` will be shared across the y-axes of a single row and
-    the x-axes of a single column. The diagonal plots are treated
-    differently: a univariate distribution plot is drawn to show the marginal
-    distribution of the data in each column.
-
-    It is also possible to show a subset of variables or plot different
-    variables on the rows and columns.
-
-    This is a high-level interface for :class:`PairGrid` that is intended to
-    make it easy to draw a few common styles. You should use :class:`PairGrid`
-    directly if you need more flexibility.
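-
-    A minimal sketch of the common call pattern (``df`` and ``"species"``
-    are illustrative names)::
-
-        g = pairplot(df, hue="species", corner=True)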
- - Parameters - ---------- - data : `pandas.DataFrame` - Tidy (long-form) dataframe where each column is a variable and - each row is an observation. - hue : name of variable in ``data`` - Variable in ``data`` to map plot aspects to different colors. - hue_order : list of strings - Order for the levels of the hue variable in the palette - palette : dict or seaborn color palette - Set of colors for mapping the ``hue`` variable. If a dict, keys - should be values in the ``hue`` variable. - vars : list of variable names - Variables within ``data`` to use, otherwise use every column with - a numeric datatype. - {x, y}_vars : lists of variable names - Variables within ``data`` to use separately for the rows and - columns of the figure; i.e. to make a non-square plot. - kind : {'scatter', 'kde', 'hist', 'reg'} - Kind of plot to make. - diag_kind : {'auto', 'hist', 'kde', None} - Kind of plot for the diagonal subplots. If 'auto', choose based on - whether or not ``hue`` is used. - markers : single matplotlib marker code or list - Either the marker to use for all scatterplot points or a list of markers - with a length the same as the number of levels in the hue variable so that - differently colored points will also have different scatterplot - markers. - height : scalar - Height (in inches) of each facet. - aspect : scalar - Aspect * height gives the width (in inches) of each facet. - corner : bool - If True, don't add axes to the upper (off-diagonal) triangle of the - grid, making this a "corner" plot. - dropna : boolean - Drop missing values from the data before plotting. - {plot, diag, grid}_kws : dicts - Dictionaries of keyword arguments. ``plot_kws`` are passed to the - bivariate plotting function, ``diag_kws`` are passed to the univariate - plotting function, and ``grid_kws`` are passed to the :class:`PairGrid` - constructor. - - Returns - ------- - grid : :class:`PairGrid` - Returns the underlying :class:`PairGrid` instance for further tweaking. - - See Also - -------- - PairGrid : Subplot grid for more flexible plotting of pairwise relationships. - JointGrid : Grid for plotting joint and marginal distributions of two variables. - - Examples - -------- - - .. 
include:: ../docstrings/pairplot.rst - - """ - # Avoid circular import - from .distributions import histplot, kdeplot - - # Handle deprecations - if size is not None: - height = size - msg = ("The `size` parameter has been renamed to `height`; " - "please update your code.") - warnings.warn(msg, UserWarning) - - if not isinstance(data, pd.DataFrame): - raise TypeError( - f"'data' must be pandas DataFrame object, not: {type(data)}") - - plot_kws = {} if plot_kws is None else plot_kws.copy() - diag_kws = {} if diag_kws is None else diag_kws.copy() - grid_kws = {} if grid_kws is None else grid_kws.copy() - - # Resolve "auto" diag kind - if diag_kind == "auto": - if hue is None: - diag_kind = "kde" if kind == "kde" else "hist" - else: - diag_kind = "hist" if kind == "hist" else "kde" - - # Set up the PairGrid - grid_kws.setdefault("diag_sharey", diag_kind == "hist") - grid = PairGrid(data, vars=vars, x_vars=x_vars, y_vars=y_vars, hue=hue, - hue_order=hue_order, palette=palette, corner=corner, - height=height, aspect=aspect, dropna=dropna, **grid_kws) - - # Add the markers here as PairGrid has figured out how many levels of the - # hue variable are needed and we don't want to duplicate that process - if markers is not None: - if kind == "reg": - # Needed until regplot supports style - if grid.hue_names is None: - n_markers = 1 - else: - n_markers = len(grid.hue_names) - if not isinstance(markers, list): - markers = [markers] * n_markers - if len(markers) != n_markers: - raise ValueError("markers must be a singleton or a list of " - "markers for each level of the hue variable") - grid.hue_kws = {"marker": markers} - elif kind == "scatter": - if isinstance(markers, str): - plot_kws["marker"] = markers - elif hue is not None: - plot_kws["style"] = data[hue] - plot_kws["markers"] = markers - - # Draw the marginal plots on the diagonal - diag_kws = diag_kws.copy() - diag_kws.setdefault("legend", False) - if diag_kind == "hist": - grid.map_diag(histplot, **diag_kws) - elif diag_kind == "kde": - diag_kws.setdefault("fill", True) - diag_kws.setdefault("warn_singular", False) - grid.map_diag(kdeplot, **diag_kws) - - # Maybe plot on the off-diagonals - if diag_kind is not None: - plotter = grid.map_offdiag - else: - plotter = grid.map - - if kind == "scatter": - from .relational import scatterplot # Avoid circular import - plotter(scatterplot, **plot_kws) - elif kind == "reg": - from .regression import regplot # Avoid circular import - plotter(regplot, **plot_kws) - elif kind == "kde": - from .distributions import kdeplot # Avoid circular import - plot_kws.setdefault("warn_singular", False) - plotter(kdeplot, **plot_kws) - elif kind == "hist": - from .distributions import histplot # Avoid circular import - plotter(histplot, **plot_kws) - - # Add a legend - if hue is not None: - grid.add_legend() - - grid.tight_layout() - - return grid - - -def jointplot( - data=None, *, x=None, y=None, hue=None, kind="scatter", - height=6, ratio=5, space=.2, dropna=False, xlim=None, ylim=None, - color=None, palette=None, hue_order=None, hue_norm=None, marginal_ticks=False, - joint_kws=None, marginal_kws=None, - **kwargs -): - # Avoid circular imports - from .relational import scatterplot - from .regression import regplot, residplot - from .distributions import histplot, kdeplot, _freedman_diaconis_bins - - if kwargs.pop("ax", None) is not None: - msg = "Ignoring `ax`; jointplot is a figure-level function." 
-        warnings.warn(msg, UserWarning, stacklevel=2)
-
-    # Set up empty default kwarg dicts
-    joint_kws = {} if joint_kws is None else joint_kws.copy()
-    joint_kws.update(kwargs)
-    marginal_kws = {} if marginal_kws is None else marginal_kws.copy()
-
-    # Handle deprecations of distplot-specific kwargs
-    distplot_keys = [
-        "rug", "fit", "hist_kws", "norm_hist", "rug_kws",
-    ]
-    unused_keys = []
-    for key in distplot_keys:
-        if key in marginal_kws:
-            unused_keys.append(key)
-            marginal_kws.pop(key)
-    if unused_keys and kind != "kde":
-        msg = (
-            "The marginal plotting function has changed to `histplot`,"
-            " which does not accept the following argument(s): {}."
-        ).format(", ".join(unused_keys))
-        warnings.warn(msg, UserWarning)
-
-    # Validate the plot kind
-    plot_kinds = ["scatter", "hist", "hex", "kde", "reg", "resid"]
-    _check_argument("kind", plot_kinds, kind)
-
-    # Raise early if using `hue` with a kind that does not support it
-    if hue is not None and kind in ["hex", "reg", "resid"]:
-        msg = f"Use of `hue` with `kind='{kind}'` is not currently supported."
-        raise ValueError(msg)
-
-    # Make a colormap based off the plot color
-    # (Currently used only for kind="hex")
-    if color is None:
-        color = "C0"
-    color_rgb = mpl.colors.colorConverter.to_rgb(color)
-    colors = [set_hls_values(color_rgb, l=val) for val in np.linspace(1, 0, 12)]
-    cmap = blend_palette(colors, as_cmap=True)
-
-    # Matplotlib's hexbin plot is not na-robust
-    if kind == "hex":
-        dropna = True
-
-    # Initialize the JointGrid object
-    grid = JointGrid(
-        data=data, x=x, y=y, hue=hue,
-        palette=palette, hue_order=hue_order, hue_norm=hue_norm,
-        dropna=dropna, height=height, ratio=ratio, space=space,
-        xlim=xlim, ylim=ylim, marginal_ticks=marginal_ticks,
-    )
-
-    if grid.hue is not None:
-        marginal_kws.setdefault("legend", False)
-
-    # Plot the data using the grid
-    if kind.startswith("scatter"):
-
-        joint_kws.setdefault("color", color)
-        grid.plot_joint(scatterplot, **joint_kws)
-
-        if grid.hue is None:
-            marg_func = histplot
-        else:
-            marg_func = kdeplot
-            marginal_kws.setdefault("warn_singular", False)
-            marginal_kws.setdefault("fill", True)
-
-        marginal_kws.setdefault("color", color)
-        grid.plot_marginals(marg_func, **marginal_kws)
-
-    elif kind.startswith("hist"):
-
-        # TODO process pair parameters for bins, etc.
and pass - # to both joint and marginal plots - - joint_kws.setdefault("color", color) - grid.plot_joint(histplot, **joint_kws) - - marginal_kws.setdefault("kde", False) - marginal_kws.setdefault("color", color) - - marg_x_kws = marginal_kws.copy() - marg_y_kws = marginal_kws.copy() - - pair_keys = "bins", "binwidth", "binrange" - for key in pair_keys: - if isinstance(joint_kws.get(key), tuple): - x_val, y_val = joint_kws[key] - marg_x_kws.setdefault(key, x_val) - marg_y_kws.setdefault(key, y_val) - - histplot(data=data, x=x, hue=hue, **marg_x_kws, ax=grid.ax_marg_x) - histplot(data=data, y=y, hue=hue, **marg_y_kws, ax=grid.ax_marg_y) - - elif kind.startswith("kde"): - - joint_kws.setdefault("color", color) - joint_kws.setdefault("warn_singular", False) - grid.plot_joint(kdeplot, **joint_kws) - - marginal_kws.setdefault("color", color) - if "fill" in joint_kws: - marginal_kws.setdefault("fill", joint_kws["fill"]) - - grid.plot_marginals(kdeplot, **marginal_kws) - - elif kind.startswith("hex"): - - x_bins = min(_freedman_diaconis_bins(grid.x), 50) - y_bins = min(_freedman_diaconis_bins(grid.y), 50) - gridsize = int(np.mean([x_bins, y_bins])) - - joint_kws.setdefault("gridsize", gridsize) - joint_kws.setdefault("cmap", cmap) - grid.plot_joint(plt.hexbin, **joint_kws) - - marginal_kws.setdefault("kde", False) - marginal_kws.setdefault("color", color) - grid.plot_marginals(histplot, **marginal_kws) - - elif kind.startswith("reg"): - - marginal_kws.setdefault("color", color) - marginal_kws.setdefault("kde", True) - grid.plot_marginals(histplot, **marginal_kws) - - joint_kws.setdefault("color", color) - grid.plot_joint(regplot, **joint_kws) - - elif kind.startswith("resid"): - - joint_kws.setdefault("color", color) - grid.plot_joint(residplot, **joint_kws) - - x, y = grid.ax_joint.collections[0].get_offsets().T - marginal_kws.setdefault("color", color) - histplot(x=x, hue=hue, ax=grid.ax_marg_x, **marginal_kws) - histplot(y=y, hue=hue, ax=grid.ax_marg_y, **marginal_kws) - - # Make the main axes active in the matplotlib state machine - plt.sca(grid.ax_joint) - - return grid - - -jointplot.__doc__ = """\ -Draw a plot of two variables with bivariate and univariate graphs. - -This function provides a convenient interface to the :class:`JointGrid` -class, with several canned plot kinds. This is intended to be a fairly -lightweight wrapper; if you need more flexibility, you should use -:class:`JointGrid` directly. - -Parameters ----------- -{params.core.data} -{params.core.xy} -{params.core.hue} -kind : {{ "scatter" | "kde" | "hist" | "hex" | "reg" | "resid" }} - Kind of plot to draw. See the examples for references to the underlying functions. -height : numeric - Size of the figure (it will be square). -ratio : numeric - Ratio of joint axes height to marginal axes height. -space : numeric - Space between the joint and marginal axes -dropna : bool - If True, remove observations that are missing from ``x`` and ``y``. -{{x, y}}lim : pairs of numbers - Axis limits to set before plotting. -{params.core.color} -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -marginal_ticks : bool - If False, suppress ticks on the count/density axis of the marginal plots. -{{joint, marginal}}_kws : dicts - Additional keyword arguments for the plot components. -kwargs - Additional keyword arguments are passed to the function used to - draw the plot on the joint Axes, superseding items in the - ``joint_kws`` dictionary. 
- -Returns -------- -{returns.jointgrid} - -See Also --------- -{seealso.jointgrid} -{seealso.pairgrid} -{seealso.pairplot} - -Examples --------- - -.. include:: ../docstrings/jointplot.rst - -""".format( - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) diff --git a/seaborn/categorical.py b/seaborn/categorical.py deleted file mode 100644 index a43c085ba28b0eb6ce34c0161102937216a01638..0000000000000000000000000000000000000000 --- a/seaborn/categorical.py +++ /dev/null @@ -1,3456 +0,0 @@ -from collections import namedtuple -from textwrap import dedent -import warnings -from colorsys import rgb_to_hls -from functools import partial - -import numpy as np -import pandas as pd - -import matplotlib as mpl -from matplotlib.cbook import normalize_kwargs -from matplotlib.collections import PatchCollection -from matplotlib.markers import MarkerStyle -from matplotlib.patches import Rectangle -import matplotlib.pyplot as plt - -from seaborn._core.typing import default, deprecated -from seaborn._base import VectorPlotter, infer_orient, categorical_order -from seaborn._stats.density import KDE -from seaborn import utils -from seaborn.utils import ( - desaturate, - _check_argument, - _draw_figure, - _default_color, - _get_patch_legend_artist, - _get_transform_functions, - _scatter_legend_artist, - _version_predates, -) -from seaborn._compat import groupby_apply_include_groups -from seaborn._statistics import ( - EstimateAggregator, - LetterValues, - WeightedAggregator, -) -from seaborn.palettes import light_palette -from seaborn.axisgrid import FacetGrid, _facet_docs - - -__all__ = [ - "catplot", - "stripplot", "swarmplot", - "boxplot", "violinplot", "boxenplot", - "pointplot", "barplot", "countplot", -] - - -class _CategoricalPlotter(VectorPlotter): - - wide_structure = {"x": "@columns", "y": "@values", "hue": "@columns"} - flat_structure = {"y": "@values"} - - _legend_attributes = ["color"] - - def __init__( - self, - data=None, - variables={}, - order=None, - orient=None, - require_numeric=False, - color=None, - legend="auto", - ): - - super().__init__(data=data, variables=variables) - - # This method takes care of some bookkeeping that is necessary because the - # original categorical plots (prior to the 2021 refactor) had some rules that - # don't fit exactly into VectorPlotter logic. It may be wise to have a second - # round of refactoring that moves the logic deeper, but this will keep things - # relatively sensible for now. - - # For wide data, orient determines assignment to x/y differently from the - # default VectorPlotter rules. If we do decide to make orient part of the - # _base variable assignment, we'll want to figure out how to express that. - if self.input_format == "wide" and orient in ["h", "y"]: - self.plot_data = self.plot_data.rename(columns={"x": "y", "y": "x"}) - orig_variables = set(self.variables) - orig_x = self.variables.pop("x", None) - orig_y = self.variables.pop("y", None) - orig_x_type = self.var_types.pop("x", None) - orig_y_type = self.var_types.pop("y", None) - if "x" in orig_variables: - self.variables["y"] = orig_x - self.var_types["y"] = orig_x_type - if "y" in orig_variables: - self.variables["x"] = orig_y - self.var_types["x"] = orig_y_type - - # Initially there was more special code for wide-form data where plots were - # multi-colored by default and then either palette or color could be used. 
-        # We want to provide backwards compatibility for this behavior in a
-        # relatively simple way, so we delete the hue information when color
-        # is specified.
-        if (
-            self.input_format == "wide"
-            and "hue" in self.variables
-            and color is not None
-        ):
-            # drop() returns a copy rather than mutating in place, so the
-            # result must be assigned back for the column to actually go away
-            self.plot_data = self.plot_data.drop("hue", axis=1)
-            self.variables.pop("hue")
-
-        # The concept of an "orientation" is important to the original categorical
-        # plots, but there's no provision for it in VectorPlotter, so we need it here.
-        # Note that it could be useful for the other functions in at least two ways
-        # (orienting a univariate distribution plot from long-form data and selecting
-        # the aggregation axis in lineplot), so we may want to eventually refactor it.
-        self.orient = infer_orient(
-            x=self.plot_data.get("x", None),
-            y=self.plot_data.get("y", None),
-            orient=orient,
-            require_numeric=False,
-        )
-
-        self.legend = legend
-
-        # Short-circuit in the case of an empty plot
-        if not self.has_xy_data:
-            return
-
-        # Categorical plots can be "univariate" in which case they get an anonymous
-        # category label on the opposite axis. Note: this duplicates code in the core
-        # scale_categorical function. We need to do it here because of the next line.
-        if self.orient not in self.variables:
-            self.variables[self.orient] = None
-            self.var_types[self.orient] = "categorical"
-            self.plot_data[self.orient] = ""
-
-        # Categorical variables have discrete levels that we need to track
-        cat_levels = categorical_order(self.plot_data[self.orient], order)
-        self.var_levels[self.orient] = cat_levels
-
-    def _hue_backcompat(self, color, palette, hue_order, force_hue=False):
-        """Implement backwards compatibility for hue parametrization.
-
-        Note: the force_hue parameter is used so that functions can be shown to
-        pass existing tests during refactoring and then tested for new behavior.
-        It can be removed after completion of the work.
-
-        """
-        # The original categorical functions applied a palette to the categorical axis
-        # by default. We want to require an explicit hue mapping, to be more consistent
-        # with how things work elsewhere now. I don't think there's any good way to
-        # do this gently -- because it's triggered by the default value of hue=None,
-        # users would always get a warning, unless we introduce some sentinel "default"
-        # argument for this change. That's possible, but asking users to set `hue=None`
-        # on every call is annoying.
-        # We are keeping the logic for implementing the old behavior alongside the
-        # current system so that (a) we can punt on that decision and (b) we can
-        # ensure that refactored code passes old tests.
-        default_behavior = color is None or palette is not None
-        if force_hue and "hue" not in self.variables and default_behavior:
-            self._redundant_hue = True
-            self.plot_data["hue"] = self.plot_data[self.orient]
-            self.variables["hue"] = self.variables[self.orient]
-            self.var_types["hue"] = "categorical"
-            hue_order = self.var_levels[self.orient]
-
-            # Because we convert the categorical axis variable to string,
-            # we need to update a dictionary palette too
-            if isinstance(palette, dict):
-                palette = {str(k): v for k, v in palette.items()}
-
-        else:
-            if "hue" in self.variables:
-                redundant = (self.plot_data["hue"] == self.plot_data[self.orient]).all()
-            else:
-                redundant = False
-            self._redundant_hue = redundant
-
-        # Previously, categorical plots had a trick where color= could seed the palette.
-        # Because that's an explicit parameterization, we are going to give it one
-        # release cycle with a warning before removing.
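-        # For example (illustrative): color="#4c72b0" is rewritten as
-        # palette="dark:#4c72b0", i.e. a sequential palette seeded from that
-        # color, equivalent to what a user could request directly with:
-        #
-        #     sns.color_palette("dark:#4c72b0")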
- if "hue" in self.variables and palette is None and color is not None: - if not isinstance(color, str): - color = mpl.colors.to_hex(color) - palette = f"dark:{color}" - msg = ( - "\n\nSetting a gradient palette using color= is deprecated and will be " - f"removed in v0.14.0. Set `palette='{palette}'` for the same effect.\n" - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - - return palette, hue_order - - def _palette_without_hue_backcompat(self, palette, hue_order): - """Provide one cycle where palette= implies hue= when not provided""" - if "hue" not in self.variables and palette is not None: - msg = ( - "\n\nPassing `palette` without assigning `hue` is deprecated " - f"and will be removed in v0.14.0. Assign the `{self.orient}` variable " - "to `hue` and set `legend=False` for the same effect.\n" - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - - self.legend = False - self.plot_data["hue"] = self.plot_data[self.orient] - self.variables["hue"] = self.variables.get(self.orient) - self.var_types["hue"] = self.var_types.get(self.orient) - - hue_order = self.var_levels.get(self.orient) - self._var_levels.pop("hue", None) - - return hue_order - - def _point_kwargs_backcompat(self, scale, join, kwargs): - """Provide two cycles where scale= and join= work, but redirect to kwargs.""" - if scale is not deprecated: - lw = mpl.rcParams["lines.linewidth"] * 1.8 * scale - mew = lw * .75 - ms = lw * 2 - - msg = ( - "\n\n" - "The `scale` parameter is deprecated and will be removed in v0.15.0. " - "You can now control the size of each plot element using matplotlib " - "`Line2D` parameters (e.g., `linewidth`, `markersize`, etc.)." - "\n" - ) - warnings.warn(msg, stacklevel=3) - kwargs.update(linewidth=lw, markeredgewidth=mew, markersize=ms) - - if join is not deprecated: - msg = ( - "\n\n" - "The `join` parameter is deprecated and will be removed in v0.15.0." - ) - if not join: - msg += ( - " You can remove the line between points with `linestyle='none'`." - ) - kwargs.update(linestyle="") - msg += "\n" - warnings.warn(msg, stacklevel=3) - - def _err_kws_backcompat(self, err_kws, errcolor, errwidth, capsize): - """Provide two cycles where existing signature-level err_kws are handled.""" - def deprecate_err_param(name, key, val): - if val is deprecated: - return - suggest = f"err_kws={{'{key}': {val!r}}}" - msg = ( - f"\n\nThe `{name}` parameter is deprecated. And will be removed " - f"in v0.15.0. Pass `{suggest}` instead.\n" - ) - warnings.warn(msg, FutureWarning, stacklevel=4) - err_kws[key] = val - - if errcolor is not None: - deprecate_err_param("errcolor", "color", errcolor) - deprecate_err_param("errwidth", "linewidth", errwidth) - - if capsize is None: - capsize = 0 - msg = ( - "\n\nPassing `capsize=None` is deprecated and will be removed " - "in v0.15.0. Pass `capsize=0` to disable caps.\n" - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - - return err_kws, capsize - - def _violin_scale_backcompat(self, scale, scale_hue, density_norm, common_norm): - """Provide two cycles of backcompat for scale kwargs""" - if scale is not deprecated: - density_norm = scale - msg = ( - "\n\nThe `scale` parameter has been renamed and will be removed " - f"in v0.15.0. Pass `density_norm={scale!r}` for the same effect." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - - if scale_hue is not deprecated: - common_norm = scale_hue - msg = ( - "\n\nThe `scale_hue` parameter has been replaced and will be removed " - f"in v0.15.0. Pass `common_norm={not scale_hue}` for the same effect." 
-            )
-            warnings.warn(msg, FutureWarning, stacklevel=3)
-
-        return density_norm, common_norm
-
-    def _violin_bw_backcompat(self, bw, bw_method):
-        """Provide two cycles of backcompat for violin bandwidth parameterization."""
-        if bw is not deprecated:
-            bw_method = bw
-            msg = dedent(f"""\n
-                The `bw` parameter is deprecated in favor of `bw_method`/`bw_adjust`.
-                Setting `bw_method={bw!r}`, but please see docs for the new parameters
-                and update your code. This will become an error in seaborn v0.15.0.
-                """)
-            warnings.warn(msg, FutureWarning, stacklevel=3)
-        return bw_method
-
-    def _boxen_scale_backcompat(self, scale, width_method):
-        """Provide two cycles of backcompat for scale kwargs"""
-        if scale is not deprecated:
-            width_method = scale
-            msg = (
-                "\n\nThe `scale` parameter has been renamed to `width_method` and "
-                f"will be removed in v0.15.0. Pass `width_method={scale!r}`"
-            )
-            if scale == "area":
-                msg += ", but note that the result for 'area' will appear different."
-            else:
-                msg += " for the same effect."
-            warnings.warn(msg, FutureWarning, stacklevel=3)
-
-        return width_method
-
-    def _complement_color(self, color, base_color, hue_map):
-        """Allow a color to be set automatically using a basis of comparison."""
-        if color == "gray":
-            msg = (
-                'Use "auto" to set automatic grayscale colors. From v0.14.0, '
-                '"gray" will default to matplotlib\'s definition.'
-            )
-            warnings.warn(msg, FutureWarning, stacklevel=3)
-            color = "auto"
-        elif color is None or color is default:
-            color = "auto"
-
-        if color != "auto":
-            return color
-
-        if hue_map.lookup_table is None:
-            if base_color is None:
-                return None
-            basis = [mpl.colors.to_rgb(base_color)]
-        else:
-            basis = [mpl.colors.to_rgb(c) for c in hue_map.lookup_table.values()]
-        unique_colors = np.unique(basis, axis=0)
-        light_vals = [rgb_to_hls(*rgb[:3])[1] for rgb in unique_colors]
-        lum = min(light_vals) * .6
-        return (lum, lum, lum)
-
-    def _map_prop_with_hue(self, name, value, fallback, plot_kws):
-        """Support pointplot behavior of modifying the marker/linestyle with hue."""
-        if value is default:
-            value = plot_kws.pop(name, fallback)
-
-        if "hue" in self.variables:
-            levels = self._hue_map.levels
-            if isinstance(value, list):
-                mapping = {k: v for k, v in zip(levels, value)}
-            else:
-                mapping = {k: value for k in levels}
-        else:
-            mapping = {None: value}
-
-        return mapping
-
-    def _adjust_cat_axis(self, ax, axis):
-        """Set ticks and limits for a categorical variable."""
-        # Note: in theory, this could happen in _attach for all categorical axes
-        # But two reasons not to do that:
-        #   - If it happens before plotting, autoscaling messes up the plot limits
-        #   - It would change existing plots from other seaborn functions
-        if self.var_types[axis] != "categorical":
-            return
-
-        # If both x/y data are empty, the correct way to set up the plot is
-        # somewhat undefined; because we don't add null category data to the plot in
-        # this case we don't *have* a categorical axis (yet), so best to just bail.
-        if self.plot_data[axis].empty:
-            return
-
-        # We can infer the total number of categories (including those from previous
-        # plots that are not part of the plot we are currently making) from the number
-        # of ticks, which matplotlib sets up while doing unit conversion. This feels
-        # slightly risky, as if we are relying on something that may be a matplotlib
-        # implementation detail. 
But I cannot think of a better way to keep track of - # the state from previous categorical calls (see GH2516 for context) - n = len(getattr(ax, f"get_{axis}ticks")()) - - if axis == "x": - ax.xaxis.grid(False) - ax.set_xlim(-.5, n - .5, auto=None) - else: - ax.yaxis.grid(False) - # Note limits that correspond to previously-inverted y axis - ax.set_ylim(n - .5, -.5, auto=None) - - def _dodge_needed(self): - """Return True when use of `hue` would cause overlaps.""" - groupers = list({self.orient, "col", "row"} & set(self.variables)) - if "hue" in self.variables: - orient = self.plot_data[groupers].value_counts() - paired = self.plot_data[[*groupers, "hue"]].value_counts() - return orient.size != paired.size - return False - - def _dodge(self, keys, data): - """Apply a dodge transform to coordinates in place.""" - if "hue" not in self.variables: - # Short-circuit if hue variable was not assigned - # We could potentially warn when hue=None, dodge=True, user may be confused - # But I think it's fine to just treat it as a no-op. - return - hue_idx = self._hue_map.levels.index(keys["hue"]) - n = len(self._hue_map.levels) - data["width"] /= n - - full_width = data["width"] * n - offset = data["width"] * hue_idx + data["width"] / 2 - full_width / 2 - data[self.orient] += offset - - def _invert_scale(self, ax, data, vars=("x", "y")): - """Undo scaling after computation so data are plotted correctly.""" - for var in vars: - _, inv = _get_transform_functions(ax, var[0]) - if var == self.orient and "width" in data: - hw = data["width"] / 2 - data["edge"] = inv(data[var] - hw) - data["width"] = inv(data[var] + hw) - data["edge"].to_numpy() - for suf in ["", "min", "max"]: - if (col := f"{var}{suf}") in data: - data[col] = inv(data[col]) - - def _configure_legend(self, ax, func, common_kws=None, semantic_kws=None): - if self.legend == "auto": - show_legend = not self._redundant_hue and self.input_format != "wide" - else: - show_legend = bool(self.legend) - if show_legend: - self.add_legend_data(ax, func, common_kws, semantic_kws=semantic_kws) - handles, _ = ax.get_legend_handles_labels() - if handles: - ax.legend(title=self.legend_title) - - @property - def _native_width(self): - """Return unit of width separating categories on native numeric scale.""" - # Categorical data always have a unit width - if self.var_types[self.orient] == "categorical": - return 1 - - # Otherwise, define the width as the smallest space between observations - unique_values = np.unique(self.comp_data[self.orient]) - if len(unique_values) > 1: - native_width = np.nanmin(np.diff(unique_values)) - else: - native_width = 1 - return native_width - - def _nested_offsets(self, width, dodge): - """Return offsets for each hue level for dodged plots.""" - offsets = None - if "hue" in self.variables and self._hue_map.levels is not None: - n_levels = len(self._hue_map.levels) - if dodge: - each_width = width / n_levels - offsets = np.linspace(0, width - each_width, n_levels) - offsets -= offsets.mean() - else: - offsets = np.zeros(n_levels) - return offsets - - # Note that the plotting methods here aim (in most cases) to produce the - # exact same artists as the original (pre 0.12) version of the code, so - # there is some weirdness that might not otherwise be clean or make sense in - # this context, such as adding empty artists for combinations of variables - # with no observations - - def plot_strips( - self, - jitter, - dodge, - color, - plot_kws, - ): - - width = .8 * self._native_width - offsets = self._nested_offsets(width, dodge) - 
-        if jitter is True:
-            jlim = 0.1
-        else:
-            jlim = float(jitter)
-        if "hue" in self.variables and dodge and self._hue_map.levels is not None:
-            jlim /= len(self._hue_map.levels)
-        jlim *= self._native_width
-        jitterer = partial(np.random.uniform, low=-jlim, high=+jlim)
-
-        iter_vars = [self.orient]
-        if dodge:
-            iter_vars.append("hue")
-
-        ax = self.ax
-        dodge_move = jitter_move = 0
-
-        if "marker" in plot_kws and not MarkerStyle(plot_kws["marker"]).is_filled():
-            plot_kws.pop("edgecolor", None)
-
-        for sub_vars, sub_data in self.iter_data(iter_vars,
-                                                 from_comp_data=True,
-                                                 allow_empty=True):
-
-            ax = self._get_axes(sub_vars)
-
-            if offsets is not None and (offsets != 0).any():
-                dodge_move = offsets[sub_data["hue"].map(self._hue_map.levels.index)]
-
-            jitter_move = jitterer(size=len(sub_data)) if len(sub_data) > 1 else 0
-
-            adjusted_data = sub_data[self.orient] + dodge_move + jitter_move
-            sub_data[self.orient] = adjusted_data
-            self._invert_scale(ax, sub_data)
-
-            points = ax.scatter(sub_data["x"], sub_data["y"], color=color, **plot_kws)
-            if "hue" in self.variables:
-                points.set_facecolors(self._hue_map(sub_data["hue"]))
-
-        self._configure_legend(ax, _scatter_legend_artist, common_kws=plot_kws)
-
-    def plot_swarms(
-        self,
-        dodge,
-        color,
-        warn_thresh,
-        plot_kws,
-    ):
-
-        width = .8 * self._native_width
-        offsets = self._nested_offsets(width, dodge)
-
-        iter_vars = [self.orient]
-        if dodge:
-            iter_vars.append("hue")
-
-        ax = self.ax
-        point_collections = {}
-        dodge_move = 0
-
-        if "marker" in plot_kws and not MarkerStyle(plot_kws["marker"]).is_filled():
-            plot_kws.pop("edgecolor", None)
-
-        for sub_vars, sub_data in self.iter_data(iter_vars,
-                                                 from_comp_data=True,
-                                                 allow_empty=True):
-
-            ax = self._get_axes(sub_vars)
-
-            if offsets is not None:
-                dodge_move = offsets[sub_data["hue"].map(self._hue_map.levels.index)]
-
-            if not sub_data.empty:
-                sub_data[self.orient] = sub_data[self.orient] + dodge_move
-
-            self._invert_scale(ax, sub_data)
-
-            points = ax.scatter(sub_data["x"], sub_data["y"], color=color, **plot_kws)
-            if "hue" in self.variables:
-                points.set_facecolors(self._hue_map(sub_data["hue"]))
-
-            if not sub_data.empty:
-                point_collections[(ax, sub_data[self.orient].iloc[0])] = points
-
-        beeswarm = Beeswarm(width=width, orient=self.orient, warn_thresh=warn_thresh)
-        for (ax, center), points in point_collections.items():
-            if points.get_offsets().shape[0] > 1:
-
-                def draw(points, renderer, *, center=center):
-
-                    beeswarm(points, center)
-
-                    if self.orient == "y":
-                        scalex = False
-                        scaley = ax.get_autoscaley_on()
-                    else:
-                        scalex = ax.get_autoscalex_on()
-                        scaley = False
-
-                    # This prevents us from undoing the nice categorical axis limits
-                    # set in _adjust_cat_axis, because that method currently leaves
-                    # the autoscale flag in its original setting. It may be better
-                    # to disable autoscaling there to avoid needing to do this.
-                    fixed_scale = self.var_types[self.orient] == "categorical"
-                    ax.update_datalim(points.get_datalim(ax.transData))
-                    if not fixed_scale and (scalex or scaley):
-                        ax.autoscale_view(scalex=scalex, scaley=scaley)
-
-                    super(points.__class__, points).draw(renderer)
-
-                points.draw = draw.__get__(points)
-
-        _draw_figure(ax.figure)
-        self._configure_legend(ax, _scatter_legend_artist, plot_kws)
-
-    def plot_boxes(
-        self,
-        width,
-        dodge,
-        gap,
-        fill,
-        whis,
-        color,
-        linecolor,
-        linewidth,
-        fliersize,
-        plot_kws,  # TODO rename user_kws?
- ): - - iter_vars = ["hue"] - value_var = {"x": "y", "y": "x"}[self.orient] - - def get_props(element, artist=mpl.lines.Line2D): - return normalize_kwargs(plot_kws.pop(f"{element}props", {}), artist) - - if not fill and linewidth is None: - linewidth = mpl.rcParams["lines.linewidth"] - bootstrap = plot_kws.pop("bootstrap", mpl.rcParams["boxplot.bootstrap"]) - plot_kws.setdefault("shownotches", plot_kws.pop("notch", False)) - - box_artist = mpl.patches.Rectangle if fill else mpl.lines.Line2D - props = { - "box": get_props("box", box_artist), - "median": get_props("median"), - "whisker": get_props("whisker"), - "flier": get_props("flier"), - "cap": get_props("cap"), - } - - props["median"].setdefault("solid_capstyle", "butt") - props["whisker"].setdefault("solid_capstyle", "butt") - props["flier"].setdefault("markersize", fliersize) - - ax = self.ax - - for sub_vars, sub_data in self.iter_data(iter_vars, - from_comp_data=True, - allow_empty=False): - - ax = self._get_axes(sub_vars) - - grouped = sub_data.groupby(self.orient)[value_var] - positions = sorted(sub_data[self.orient].unique().astype(float)) - value_data = [x.to_numpy() for _, x in grouped] - stats = pd.DataFrame(mpl.cbook.boxplot_stats(value_data, whis=whis, - bootstrap=bootstrap)) - - orig_width = width * self._native_width - data = pd.DataFrame({self.orient: positions, "width": orig_width}) - if dodge: - self._dodge(sub_vars, data) - if gap: - data["width"] *= 1 - gap - capwidth = plot_kws.get("capwidths", 0.5 * data["width"]) - - self._invert_scale(ax, data) - _, inv = _get_transform_functions(ax, value_var) - for stat in ["mean", "med", "q1", "q3", "cilo", "cihi", "whislo", "whishi"]: - stats[stat] = inv(stats[stat]) - stats["fliers"] = stats["fliers"].map(inv) - - linear_orient_scale = getattr(ax, f"get_{self.orient}scale")() == "linear" - - maincolor = self._hue_map(sub_vars["hue"]) if "hue" in sub_vars else color - if fill: - boxprops = { - "facecolor": maincolor, "edgecolor": linecolor, **props["box"] - } - medianprops = {"color": linecolor, **props["median"]} - whiskerprops = {"color": linecolor, **props["whisker"]} - flierprops = {"markeredgecolor": linecolor, **props["flier"]} - capprops = {"color": linecolor, **props["cap"]} - else: - boxprops = {"color": maincolor, **props["box"]} - medianprops = {"color": maincolor, **props["median"]} - whiskerprops = {"color": maincolor, **props["whisker"]} - flierprops = {"markeredgecolor": maincolor, **props["flier"]} - capprops = {"color": maincolor, **props["cap"]} - - if linewidth is not None: - for prop_dict in [boxprops, medianprops, whiskerprops, capprops]: - prop_dict.setdefault("linewidth", linewidth) - - default_kws = dict( - bxpstats=stats.to_dict("records"), - positions=data[self.orient], - # Set width to 0 to avoid going out of domain - widths=data["width"] if linear_orient_scale else 0, - patch_artist=fill, - vert=self.orient == "x", - manage_ticks=False, - boxprops=boxprops, - medianprops=medianprops, - whiskerprops=whiskerprops, - flierprops=flierprops, - capprops=capprops, - # Added in matplotlib 3.6.0; see below - # capwidths=capwidth, - **( - {} if _version_predates(mpl, "3.6.0") - else {"capwidths": capwidth} - ) - ) - boxplot_kws = {**default_kws, **plot_kws} - artists = ax.bxp(**boxplot_kws) - - # Reset artist widths after adding so everything stays positive - ori_idx = ["x", "y"].index(self.orient) - - if not linear_orient_scale: - for i, box in enumerate(data.to_dict("records")): - p0 = box["edge"] - p1 = box["edge"] + box["width"] - - if artists["boxes"]: 
- box_artist = artists["boxes"][i] - if fill: - box_verts = box_artist.get_path().vertices.T - else: - box_verts = box_artist.get_data() - box_verts[ori_idx][0] = p0 - box_verts[ori_idx][3:] = p0 - box_verts[ori_idx][1:3] = p1 - if not fill: - # When fill is True, the data get changed in place - box_artist.set_data(box_verts) - ax.update_datalim( - np.transpose(box_verts), - updatex=self.orient == "x", - updatey=self.orient == "y", - ) - - if artists["medians"]: - verts = artists["medians"][i].get_xydata().T - verts[ori_idx][:] = p0, p1 - artists["medians"][i].set_data(verts) - - if artists["caps"]: - f_fwd, f_inv = _get_transform_functions(ax, self.orient) - for line in artists["caps"][2 * i:2 * i + 2]: - p0 = f_inv(f_fwd(box[self.orient]) - capwidth[i] / 2) - p1 = f_inv(f_fwd(box[self.orient]) + capwidth[i] / 2) - verts = line.get_xydata().T - verts[ori_idx][:] = p0, p1 - line.set_data(verts) - - ax.add_container(BoxPlotContainer(artists)) - - legend_artist = _get_patch_legend_artist(fill) - self._configure_legend(ax, legend_artist, boxprops) - - def plot_boxens( - self, - width, - dodge, - gap, - fill, - color, - linecolor, - linewidth, - width_method, - k_depth, - outlier_prop, - trust_alpha, - showfliers, - box_kws, - flier_kws, - line_kws, - plot_kws, - ): - - iter_vars = [self.orient, "hue"] - value_var = {"x": "y", "y": "x"}[self.orient] - - estimator = LetterValues(k_depth, outlier_prop, trust_alpha) - - width_method_options = ["exponential", "linear", "area"] - _check_argument("width_method", width_method_options, width_method) - - box_kws = plot_kws if box_kws is None else {**plot_kws, **box_kws} - flier_kws = {} if flier_kws is None else flier_kws.copy() - line_kws = {} if line_kws is None else line_kws.copy() - - if linewidth is None: - if fill: - linewidth = 0.5 * mpl.rcParams["lines.linewidth"] - else: - linewidth = mpl.rcParams["lines.linewidth"] - - ax = self.ax - - for sub_vars, sub_data in self.iter_data(iter_vars, - from_comp_data=True, - allow_empty=False): - - ax = self._get_axes(sub_vars) - _, inv_ori = _get_transform_functions(ax, self.orient) - _, inv_val = _get_transform_functions(ax, value_var) - - # Statistics - lv_data = estimator(sub_data[value_var]) - n = lv_data["k"] * 2 - 1 - vals = lv_data["values"] - - pos_data = pd.DataFrame({ - self.orient: [sub_vars[self.orient]], - "width": [width * self._native_width], - }) - if dodge: - self._dodge(sub_vars, pos_data) - if gap: - pos_data["width"] *= 1 - gap - - # Letter-value boxes - levels = lv_data["levels"] - exponent = (levels - 1 - lv_data["k"]).astype(float) - if width_method == "linear": - rel_widths = levels + 1 - elif width_method == "exponential": - rel_widths = 2 ** exponent - elif width_method == "area": - tails = levels < (lv_data["k"] - 1) - rel_widths = 2 ** (exponent - tails) / np.diff(lv_data["values"]) - - center = pos_data[self.orient].item() - widths = rel_widths / rel_widths.max() * pos_data["width"].item() - - box_vals = inv_val(vals) - box_pos = inv_ori(center - widths / 2) - box_heights = inv_val(vals[1:]) - inv_val(vals[:-1]) - box_widths = inv_ori(center + widths / 2) - inv_ori(center - widths / 2) - - maincolor = self._hue_map(sub_vars["hue"]) if "hue" in sub_vars else color - flier_colors = { - "facecolor": "none", "edgecolor": ".45" if fill else maincolor - } - if fill: - cmap = light_palette(maincolor, as_cmap=True) - boxcolors = cmap(2 ** ((exponent + 2) / 3)) - else: - boxcolors = maincolor - - boxen = [] - for i in range(n): - if self.orient == "x": - xy = (box_pos[i], box_vals[i]) 
- w, h = (box_widths[i], box_heights[i]) - else: - xy = (box_vals[i], box_pos[i]) - w, h = (box_heights[i], box_widths[i]) - boxen.append(Rectangle(xy, w, h)) - - if fill: - box_colors = {"facecolors": boxcolors, "edgecolors": linecolor} - else: - box_colors = {"facecolors": "none", "edgecolors": boxcolors} - - collection_kws = {**box_colors, "linewidth": linewidth, **box_kws} - ax.add_collection(PatchCollection(boxen, **collection_kws), autolim=False) - ax.update_datalim( - np.column_stack([box_vals, box_vals]), - updatex=self.orient == "y", - updatey=self.orient == "x", - ) - - # Median line - med = lv_data["median"] - hw = pos_data["width"].item() / 2 - if self.orient == "x": - x, y = inv_ori([center - hw, center + hw]), inv_val([med, med]) - else: - x, y = inv_val([med, med]), inv_ori([center - hw, center + hw]) - default_kws = { - "color": linecolor if fill else maincolor, - "solid_capstyle": "butt", - "linewidth": 1.25 * linewidth, - } - ax.plot(x, y, **{**default_kws, **line_kws}) - - # Outliers ("fliers") - if showfliers: - vals = inv_val(lv_data["fliers"]) - pos = np.full(len(vals), inv_ori(pos_data[self.orient].item())) - x, y = (pos, vals) if self.orient == "x" else (vals, pos) - ax.scatter(x, y, **{**flier_colors, "s": 25, **flier_kws}) - - ax.autoscale_view(scalex=self.orient == "y", scaley=self.orient == "x") - - legend_artist = _get_patch_legend_artist(fill) - common_kws = {**box_kws, "linewidth": linewidth, "edgecolor": linecolor} - self._configure_legend(ax, legend_artist, common_kws) - - def plot_violins( - self, - width, - dodge, - gap, - split, - color, - fill, - linecolor, - linewidth, - inner, - density_norm, - common_norm, - kde_kws, - inner_kws, - plot_kws, - ): - - iter_vars = [self.orient, "hue"] - value_var = {"x": "y", "y": "x"}[self.orient] - - inner_options = ["box", "quart", "stick", "point", None] - _check_argument("inner", inner_options, inner, prefix=True) - _check_argument("density_norm", ["area", "count", "width"], density_norm) - - if linewidth is None: - if fill: - linewidth = 1.25 * mpl.rcParams["patch.linewidth"] - else: - linewidth = mpl.rcParams["lines.linewidth"] - - if inner is not None and inner.startswith("box"): - box_width = inner_kws.pop("box_width", linewidth * 4.5) - whis_width = inner_kws.pop("whis_width", box_width / 3) - marker = inner_kws.pop("marker", "_" if self.orient == "x" else "|") - - kde = KDE(**kde_kws) - ax = self.ax - violin_data = [] - - # Iterate through all the data splits once to compute the KDEs - for sub_vars, sub_data in self.iter_data(iter_vars, - from_comp_data=True, - allow_empty=False): - - sub_data["weight"] = sub_data.get("weights", 1) - stat_data = kde._transform(sub_data, value_var, []) - - maincolor = self._hue_map(sub_vars["hue"]) if "hue" in sub_vars else color - if not fill: - linecolor = maincolor - maincolor = "none" - default_kws = dict( - facecolor=maincolor, - edgecolor=linecolor, - linewidth=linewidth, - ) - - violin_data.append({ - "position": sub_vars[self.orient], - "observations": sub_data[value_var], - "density": stat_data["density"], - "support": stat_data[value_var], - "kwargs": {**default_kws, **plot_kws}, - "sub_vars": sub_vars, - "ax": self._get_axes(sub_vars), - }) - - # Once we've computed all the KDEs, get statistics for normalization - def vars_to_key(sub_vars): - return tuple((k, v) for k, v in sub_vars.items() if k != self.orient) - - norm_keys = [vars_to_key(violin["sub_vars"]) for violin in violin_data] - if common_norm: - common_max_density = np.nanmax([v["density"].max() for v in 
violin_data])
-            common_max_count = np.nanmax([len(v["observations"]) for v in violin_data])
-            max_density = {key: common_max_density for key in norm_keys}
-            max_count = {key: common_max_count for key in norm_keys}
-        else:
-            with warnings.catch_warnings():
-                # Ignore warning when all violins are singular; it's not important
-                warnings.filterwarnings('ignore', "All-NaN (slice|axis) encountered")
-                max_density = {
-                    key: np.nanmax([
-                        v["density"].max() for v in violin_data
-                        if vars_to_key(v["sub_vars"]) == key
-                    ]) for key in norm_keys
-                }
-                max_count = {
-                    key: np.nanmax([
-                        len(v["observations"]) for v in violin_data
-                        if vars_to_key(v["sub_vars"]) == key
-                    ]) for key in norm_keys
-                }
-
-        real_width = width * self._native_width
-
-        # Now iterate through the violins again to apply the normalization and plot
-        for violin in violin_data:
-
-            index = pd.RangeIndex(0, max(len(violin["support"]), 1))
-            data = pd.DataFrame({
-                self.orient: violin["position"],
-                value_var: violin["support"],
-                "density": violin["density"],
-                "width": real_width,
-            }, index=index)
-
-            if dodge:
-                self._dodge(violin["sub_vars"], data)
-            if gap:
-                data["width"] *= 1 - gap
-
-            # Normalize the density across the distribution(s) and relative to the width
-            norm_key = vars_to_key(violin["sub_vars"])
-            hw = data["width"] / 2
-            peak_density = violin["density"].max()
-            if np.isnan(peak_density):
-                span = 1
-            elif density_norm == "area":
-                span = data["density"] / max_density[norm_key]
-            elif density_norm == "count":
-                count = len(violin["observations"])
-                span = data["density"] / peak_density * (count / max_count[norm_key])
-            elif density_norm == "width":
-                span = data["density"] / peak_density
-            span = span * hw * (2 if split else 1)
-
-            # Handle split violins (i.e. asymmetric spans)
-            right_side = (
-                0 if "hue" not in self.variables
-                else self._hue_map.levels.index(violin["sub_vars"]["hue"]) % 2
-            )
-            if split:
-                offsets = (hw, span - hw) if right_side else (span - hw, hw)
-            else:
-                offsets = span, span
-
-            ax = violin["ax"]
-            _, invx = _get_transform_functions(ax, "x")
-            _, invy = _get_transform_functions(ax, "y")
-            inv_pos = {"x": invx, "y": invy}[self.orient]
-            inv_val = {"x": invx, "y": invy}[value_var]
-
-            linecolor = violin["kwargs"]["edgecolor"]
-
-            # Handle singular datasets (one or more observations with no variance)
-            if np.isnan(peak_density):
-                pos = data[self.orient].iloc[0]
-                val = violin["observations"].mean()
-                if self.orient == "x":
-                    x, y = [pos - offsets[0], pos + offsets[1]], [val, val]
-                else:
-                    x, y = [val, val], [pos - offsets[0], pos + offsets[1]]
-                ax.plot(invx(x), invy(y), color=linecolor, linewidth=linewidth)
-                continue
-
-            # Plot the main violin body
-            plot_func = {"x": ax.fill_betweenx, "y": ax.fill_between}[self.orient]
-            plot_func(
-                inv_val(data[value_var]),
-                inv_pos(data[self.orient] - offsets[0]),
-                inv_pos(data[self.orient] + offsets[1]),
-                **violin["kwargs"]
-            )
-
-            # Adjust the observation data
-            obs = violin["observations"]
-            pos_dict = {self.orient: violin["position"], "width": real_width}
-            if dodge:
-                self._dodge(violin["sub_vars"], pos_dict)
-            if gap:
-                pos_dict["width"] *= (1 - gap)
-
-            # --- Plot the inner components
-            if inner is None:
-                continue
-
-            elif inner.startswith("point"):
-                pos = np.array([pos_dict[self.orient]] * len(obs))
-                if split:
-                    pos += (-1 if right_side else 1) * pos_dict["width"] / 2
-                x, y = (pos, obs) if self.orient == "x" else (obs, pos)
-                kws = {
-                    "color": linecolor,
-                    "edgecolor": linecolor,
-                    "s": (linewidth * 2) ** 2,
-                    "zorder":
violin["kwargs"].get("zorder", 2) + 1, - **inner_kws, - } - ax.scatter(invx(x), invy(y), **kws) - - elif inner.startswith("stick"): - pos0 = np.interp(obs, data[value_var], data[self.orient] - offsets[0]) - pos1 = np.interp(obs, data[value_var], data[self.orient] + offsets[1]) - pos_pts = np.stack([inv_pos(pos0), inv_pos(pos1)]) - val_pts = np.stack([inv_val(obs), inv_val(obs)]) - segments = np.stack([pos_pts, val_pts]).transpose(2, 1, 0) - if self.orient == "y": - segments = segments[:, :, ::-1] - kws = { - "color": linecolor, - "linewidth": linewidth / 2, - **inner_kws, - } - lines = mpl.collections.LineCollection(segments, **kws) - ax.add_collection(lines, autolim=False) - - elif inner.startswith("quart"): - stats = np.percentile(obs, [25, 50, 75]) - pos0 = np.interp(stats, data[value_var], data[self.orient] - offsets[0]) - pos1 = np.interp(stats, data[value_var], data[self.orient] + offsets[1]) - pos_pts = np.stack([inv_pos(pos0), inv_pos(pos1)]) - val_pts = np.stack([inv_val(stats), inv_val(stats)]) - segments = np.stack([pos_pts, val_pts]).transpose(2, 0, 1) - if self.orient == "y": - segments = segments[:, ::-1, :] - dashes = [(1.25, .75), (2.5, 1), (1.25, .75)] - for i, segment in enumerate(segments): - kws = { - "color": linecolor, - "linewidth": linewidth, - "dashes": dashes[i], - **inner_kws, - } - ax.plot(*segment, **kws) - - elif inner.startswith("box"): - stats = mpl.cbook.boxplot_stats(obs)[0] - pos = np.array(pos_dict[self.orient]) - if split: - pos += (-1 if right_side else 1) * pos_dict["width"] / 2 - pos = [pos, pos], [pos, pos], [pos] - val = ( - [stats["whislo"], stats["whishi"]], - [stats["q1"], stats["q3"]], - [stats["med"]] - ) - if self.orient == "x": - (x0, x1, x2), (y0, y1, y2) = pos, val - else: - (x0, x1, x2), (y0, y1, y2) = val, pos - - if split: - offset = (1 if right_side else -1) * box_width / 72 / 2 - dx, dy = (offset, 0) if self.orient == "x" else (0, -offset) - trans = ax.transData + mpl.transforms.ScaledTranslation( - dx, dy, ax.figure.dpi_scale_trans, - ) - else: - trans = ax.transData - line_kws = { - "color": linecolor, - "transform": trans, - **inner_kws, - "linewidth": whis_width, - } - ax.plot(invx(x0), invy(y0), **line_kws) - line_kws["linewidth"] = box_width - ax.plot(invx(x1), invy(y1), **line_kws) - dot_kws = { - "marker": marker, - "markersize": box_width / 1.2, - "markeredgewidth": box_width / 5, - "transform": trans, - **inner_kws, - "markeredgecolor": "w", - "markerfacecolor": "w", - "color": linecolor, # simplify tests - } - ax.plot(invx(x2), invy(y2), **dot_kws) - - legend_artist = _get_patch_legend_artist(fill) - common_kws = {**plot_kws, "linewidth": linewidth, "edgecolor": linecolor} - self._configure_legend(ax, legend_artist, common_kws) - - def plot_points( - self, - aggregator, - markers, - linestyles, - dodge, - color, - capsize, - err_kws, - plot_kws, - ): - - agg_var = {"x": "y", "y": "x"}[self.orient] - iter_vars = ["hue"] - - plot_kws = normalize_kwargs(plot_kws, mpl.lines.Line2D) - plot_kws.setdefault("linewidth", mpl.rcParams["lines.linewidth"] * 1.8) - plot_kws.setdefault("markeredgewidth", plot_kws["linewidth"] * 0.75) - plot_kws.setdefault("markersize", plot_kws["linewidth"] * np.sqrt(2 * np.pi)) - - markers = self._map_prop_with_hue("marker", markers, "o", plot_kws) - linestyles = self._map_prop_with_hue("linestyle", linestyles, "-", plot_kws) - - base_positions = self.var_levels[self.orient] - if self.var_types[self.orient] == "categorical": - min_cat_val = int(self.comp_data[self.orient].min()) - max_cat_val = 
int(self.comp_data[self.orient].max()) - base_positions = [i for i in range(min_cat_val, max_cat_val + 1)] - - n_hue_levels = 0 if self._hue_map.levels is None else len(self._hue_map.levels) - if dodge is True: - dodge = .025 * n_hue_levels - - ax = self.ax - - for sub_vars, sub_data in self.iter_data(iter_vars, - from_comp_data=True, - allow_empty=True): - - ax = self._get_axes(sub_vars) - - ori_axis = getattr(ax, f"{self.orient}axis") - transform, _ = _get_transform_functions(ax, self.orient) - positions = transform(ori_axis.convert_units(base_positions)) - agg_data = sub_data if sub_data.empty else ( - sub_data - .groupby(self.orient) - .apply(aggregator, agg_var, **groupby_apply_include_groups(False)) - .reindex(pd.Index(positions, name=self.orient)) - .reset_index() - ) - - if dodge: - hue_idx = self._hue_map.levels.index(sub_vars["hue"]) - step_size = dodge / (n_hue_levels - 1) - offset = -dodge / 2 + step_size * hue_idx - agg_data[self.orient] += offset * self._native_width - - self._invert_scale(ax, agg_data) - - sub_kws = plot_kws.copy() - sub_kws.update( - marker=markers[sub_vars.get("hue")], - linestyle=linestyles[sub_vars.get("hue")], - color=self._hue_map(sub_vars["hue"]) if "hue" in sub_vars else color, - ) - - line, = ax.plot(agg_data["x"], agg_data["y"], **sub_kws) - - sub_err_kws = err_kws.copy() - line_props = line.properties() - for prop in ["color", "linewidth", "alpha", "zorder"]: - sub_err_kws.setdefault(prop, line_props[prop]) - if aggregator.error_method is not None: - self.plot_errorbars(ax, agg_data, capsize, sub_err_kws) - - legend_artist = partial(mpl.lines.Line2D, [], []) - semantic_kws = {"hue": {"marker": markers, "linestyle": linestyles}} - self._configure_legend(ax, legend_artist, sub_kws, semantic_kws) - - def plot_bars( - self, - aggregator, - dodge, - gap, - width, - fill, - color, - capsize, - err_kws, - plot_kws, - ): - - agg_var = {"x": "y", "y": "x"}[self.orient] - iter_vars = ["hue"] - - ax = self.ax - - if self._hue_map.levels is None: - dodge = False - - if dodge and capsize is not None: - capsize = capsize / len(self._hue_map.levels) - - if not fill: - plot_kws.setdefault("linewidth", 1.5 * mpl.rcParams["lines.linewidth"]) - - err_kws.setdefault("linewidth", 1.5 * mpl.rcParams["lines.linewidth"]) - - for sub_vars, sub_data in self.iter_data(iter_vars, - from_comp_data=True, - allow_empty=True): - - ax = self._get_axes(sub_vars) - - agg_data = sub_data if sub_data.empty else ( - sub_data - .groupby(self.orient) - .apply(aggregator, agg_var, **groupby_apply_include_groups(False)) - .reset_index() - ) - - agg_data["width"] = width * self._native_width - if dodge: - self._dodge(sub_vars, agg_data) - if gap: - agg_data["width"] *= 1 - gap - - agg_data["edge"] = agg_data[self.orient] - agg_data["width"] / 2 - self._invert_scale(ax, agg_data) - - if self.orient == "x": - bar_func = ax.bar - kws = dict( - x=agg_data["edge"], height=agg_data["y"], width=agg_data["width"] - ) - else: - bar_func = ax.barh - kws = dict( - y=agg_data["edge"], width=agg_data["x"], height=agg_data["width"] - ) - - main_color = self._hue_map(sub_vars["hue"]) if "hue" in sub_vars else color - - # Set both color and facecolor for property cycle logic - kws["align"] = "edge" - if fill: - kws.update(color=main_color, facecolor=main_color) - else: - kws.update(color=main_color, edgecolor=main_color, facecolor="none") - - bar_func(**{**kws, **plot_kws}) - - if aggregator.error_method is not None: - self.plot_errorbars( - ax, agg_data, capsize, - {"color": ".26" if fill else 
main_color, **err_kws} - ) - - legend_artist = _get_patch_legend_artist(fill) - self._configure_legend(ax, legend_artist, plot_kws) - - def plot_errorbars(self, ax, data, capsize, err_kws): - - var = {"x": "y", "y": "x"}[self.orient] - for row in data.to_dict("records"): - - row = dict(row) - pos = np.array([row[self.orient], row[self.orient]]) - val = np.array([row[f"{var}min"], row[f"{var}max"]]) - - if capsize: - - cw = capsize * self._native_width / 2 - scl, inv = _get_transform_functions(ax, self.orient) - cap = inv(scl(pos[0]) - cw), inv(scl(pos[1]) + cw) - - pos = np.concatenate([ - [*cap, np.nan], pos, [np.nan, *cap] - ]) - val = np.concatenate([ - [val[0], val[0], np.nan], val, [np.nan, val[-1], val[-1]], - ]) - - if self.orient == "x": - args = pos, val - else: - args = val, pos - ax.plot(*args, **err_kws) - - -class _CategoricalAggPlotter(_CategoricalPlotter): - - flat_structure = {"x": "@index", "y": "@values"} - - -_categorical_docs = dict( - - # Shared narrative docs - categorical_narrative=dedent("""\ - See the :ref:`tutorial <categorical_tutorial>` for more information. - - .. note:: - By default, this function treats one of the variables as categorical - and draws data at ordinal positions (0, 1, ... n) on the relevant axis. - As of version 0.13.0, this can be disabled by setting `native_scale=True`. - """), - - # Shared function parameters - input_params=dedent("""\ - x, y, hue : names of variables in `data` or vector data - Inputs for plotting long-form data. See examples for interpretation.\ - """), - categorical_data=dedent("""\ - data : DataFrame, Series, dict, array, or list of arrays - Dataset for plotting. If `x` and `y` are absent, this is - interpreted as wide-form. Otherwise it is expected to be long-form.\ - """), - order_vars=dedent("""\ - order, hue_order : lists of strings - Order to plot the categorical levels in; otherwise the levels are - inferred from the data objects.\ - """), - stat_api_params=dedent("""\ - estimator : string or callable that maps vector -> scalar - Statistical function to estimate within each categorical bin. - errorbar : string, (string, number) tuple, callable or None - Name of errorbar method (either "ci", "pi", "se", or "sd"), or a tuple - with a method name and a level parameter, or a function that maps from a - vector to a (min, max) interval, or None to hide errorbar. See the - :doc:`errorbar tutorial </tutorial/error_bars>` for more information. - - .. versionadded:: v0.12.0 - n_boot : int - Number of bootstrap samples used to compute confidence intervals. - seed : int, `numpy.random.Generator`, or `numpy.random.RandomState` - Seed or random number generator for reproducible bootstrapping. - units : name of variable in `data` or vector data - Identifier of sampling units; used by the errorbar function to - perform a multilevel bootstrap and account for repeated measures - weights : name of variable in `data` or vector data - Data values or column used to compute weighted statistics. - Note that the use of weights may limit other statistical options. - - .. versionadded:: v0.13.1\ - """), - ci=dedent("""\ - ci : float - Level of the confidence interval to show, in [0, 100]. - - .. deprecated:: v0.12.0 - Use `errorbar=("ci", ...)`.\ - """), - orient=dedent("""\ - orient : "v" | "h" | "x" | "y" - Orientation of the plot (vertical or horizontal). This is usually - inferred based on the type of the input variables, but it can be used - to resolve ambiguity when both `x` and `y` are numeric or when - plotting wide-form data. - - .. 
versionchanged:: v0.13.0 - Added 'x'/'y' as options, equivalent to 'v'/'h'.\ - """), - color=dedent("""\ - color : matplotlib color - Single color for the elements in the plot.\ - """), - palette=dedent("""\ - palette : palette name, list, dict, or :class:`matplotlib.colors.Colormap` - Color palette that maps the hue variable. If the palette is a dictionary, - keys should be names of levels and values should be matplotlib colors. - The type/value will sometimes force a qualitative/quantitative mapping.\ - """), - hue_norm=dedent("""\ - hue_norm : tuple or :class:`matplotlib.colors.Normalize` object - Normalization in data units for colormap applied to the `hue` - variable when it is numeric. Not relevant if `hue` is categorical. - - .. versionadded:: v0.12.0\ - """), - saturation=dedent("""\ - saturation : float - Proportion of the original saturation to draw fill colors in. Large - patches often look better with desaturated colors, but set this to - `1` if you want the colors to perfectly match the input values.\ - """), - capsize=dedent("""\ - capsize : float - Width of the "caps" on error bars, relative to bar spacing.\ - """), - errcolor=dedent("""\ - errcolor : matplotlib color - Color used for the error bar lines. - - .. deprecated:: 0.13.0 - Use `err_kws={'color': ...}`.\ - """), - errwidth=dedent("""\ - errwidth : float - Thickness of error bar lines (and caps), in points. - - .. deprecated:: 0.13.0 - Use `err_kws={'linewidth': ...}`.\ - """), - fill=dedent("""\ - fill : bool - If True, use a solid patch. Otherwise, draw as line art. - - .. versionadded:: v0.13.0\ - """), - gap=dedent("""\ - gap : float - Shrink on the orient axis by this factor to add a gap between dodged elements. - - .. versionadded:: 0.13.0\ - """), - width=dedent("""\ - width : float - Width allotted to each element on the orient axis. When `native_scale=True`, - it is relative to the minimum distance between two values in the native scale.\ - """), - dodge=dedent("""\ - dodge : "auto" or bool - When hue mapping is used, whether elements should be narrowed and shifted along - the orient axis to eliminate overlap. If `"auto"`, set to `True` when the - orient variable is crossed with the categorical variable or `False` otherwise. - - .. versionchanged:: 0.13.0 - - Added `"auto"` mode as a new default.\ - """), - linewidth=dedent("""\ - linewidth : float - Width of the lines that frame the plot elements.\ - """), - linecolor=dedent("""\ - linecolor : color - Color to use for line elements, when `fill` is True. - - .. versionadded:: v0.13.0\ - """), - log_scale=dedent("""\ - log_scale : bool or number, or pair of bools or numbers - Set axis scale(s) to log. A single value sets the data axis for any numeric - axes in the plot. A pair of values sets each axis independently. - Numeric values are interpreted as the desired base (default 10). - When `None` or `False`, seaborn defers to the existing Axes scale. - - .. versionadded:: v0.13.0\ - """), - native_scale=dedent("""\ - native_scale : bool - When True, numeric or datetime values on the categorical axis will maintain - their original scaling rather than being converted to fixed indices. - - .. versionadded:: v0.13.0\ - """), - formatter=dedent("""\ - formatter : callable - Function for converting categorical data into strings. Affects both grouping - and tick labels. - - .. versionadded:: v0.13.0\ - """), - legend=dedent("""\ - legend : "auto", "brief", "full", or False - How to draw the legend. 
If "brief", numeric `hue` and `size` - variables will be represented with a sample of evenly spaced values. - If "full", every group will get an entry in the legend. If "auto", - choose between brief or full representation based on number of levels. - If `False`, no legend data is added and no legend is drawn. - - .. versionadded:: v0.13.0\ - """), - err_kws=dedent("""\ - err_kws : dict - Parameters of :class:`matplotlib.lines.Line2D`, for the error bar artists. - - .. versionadded:: v0.13.0\ - """), - ax_in=dedent("""\ - ax : matplotlib Axes - Axes object to draw the plot onto, otherwise uses the current Axes.\ - """), - ax_out=dedent("""\ - ax : matplotlib Axes - Returns the Axes object with the plot drawn onto it.\ - """), - - # Shared see also - boxplot=dedent("""\ - boxplot : A traditional box-and-whisker plot with a similar API.\ - """), - violinplot=dedent("""\ - violinplot : A combination of boxplot and kernel density estimation.\ - """), - stripplot=dedent("""\ - stripplot : A scatterplot where one variable is categorical. Can be used - in conjunction with other plots to show each observation.\ - """), - swarmplot=dedent("""\ - swarmplot : A categorical scatterplot where the points do not overlap. Can - be used with other plots to show each observation.\ - """), - barplot=dedent("""\ - barplot : Show point estimates and confidence intervals using bars.\ - """), - countplot=dedent("""\ - countplot : Show the counts of observations in each categorical bin.\ - """), - pointplot=dedent("""\ - pointplot : Show point estimates and confidence intervals using dots.\ - """), - catplot=dedent("""\ - catplot : Combine a categorical plot with a :class:`FacetGrid`.\ - """), - boxenplot=dedent("""\ - boxenplot : An enhanced boxplot for larger datasets.\ - """), - -) - -_categorical_docs.update(_facet_docs) - - -def boxplot( - data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, - orient=None, color=None, palette=None, saturation=.75, fill=True, - dodge="auto", width=.8, gap=0, whis=1.5, linecolor="auto", linewidth=None, - fliersize=None, hue_norm=None, native_scale=False, log_scale=None, formatter=None, - legend="auto", ax=None, **kwargs -): - - p = _CategoricalPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue), - order=order, - orient=orient, - color=color, - legend=legend, - ) - - if ax is None: - ax = plt.gca() - - if p.plot_data.empty: - return ax - - if dodge == "auto": - # Needs to be before scale_categorical changes the coordinate series dtype - dodge = p._dodge_needed() - - if p.var_types.get(p.orient) == "categorical" or not native_scale: - p.scale_categorical(p.orient, order=order, formatter=formatter) - - p._attach(ax, log_scale=log_scale) - - # Deprecations to remove in v0.14.0. 
-    hue_order = p._palette_without_hue_backcompat(palette, hue_order)
-    palette, hue_order = p._hue_backcompat(color, palette, hue_order)
-
-    saturation = saturation if fill else 1
-    p.map_hue(palette=palette, order=hue_order, norm=hue_norm, saturation=saturation)
-    color = _default_color(
-        ax.fill_between, hue, color,
-        {k: v for k, v in kwargs.items() if k in ["c", "color", "fc", "facecolor"]},
-        saturation=saturation,
-    )
-    linecolor = p._complement_color(linecolor, color, p._hue_map)
-
-    p.plot_boxes(
-        width=width,
-        dodge=dodge,
-        gap=gap,
-        fill=fill,
-        whis=whis,
-        color=color,
-        linecolor=linecolor,
-        linewidth=linewidth,
-        fliersize=fliersize,
-        plot_kws=kwargs,
-    )
-
-    p._add_axis_labels(ax)
-    p._adjust_cat_axis(ax, axis=p.orient)
-
-    return ax
-
-
-boxplot.__doc__ = dedent("""\
-    Draw a box plot to show distributions with respect to categories.
-
-    A box plot (or box-and-whisker plot) shows the distribution of quantitative
-    data in a way that facilitates comparisons between variables or across
-    levels of a categorical variable. The box shows the quartiles of the
-    dataset while the whiskers extend to show the rest of the distribution,
-    except for points that are determined to be "outliers" using a method
-    that is a function of the inter-quartile range.
-
-    {categorical_narrative}
-
-    Parameters
-    ----------
-    {categorical_data}
-    {input_params}
-    {order_vars}
-    {orient}
-    {color}
-    {palette}
-    {saturation}
-    {fill}
-    {dodge}
-    {width}
-    {gap}
-    whis : float or pair of floats
-        Parameter that controls whisker length. If scalar, whiskers are drawn
-        to the farthest datapoint within *whis * IQR* from the nearest hinge.
-        If a tuple, it is interpreted as percentiles that whiskers represent.
-    {linecolor}
-    {linewidth}
-    fliersize : float
-        Size of the markers used to indicate outlier observations.
-    {hue_norm}
-    {log_scale}
-    {native_scale}
-    {formatter}
-    {legend}
-    {ax_in}
-    kwargs : key, value mappings
-        Other keyword arguments are passed through to
-        :meth:`matplotlib.axes.Axes.boxplot`.
-
-    Returns
-    -------
-    {ax_out}
-
-    See Also
-    --------
-    {violinplot}
-    {stripplot}
-    {swarmplot}
-    {catplot}
-
-    Examples
-    --------
-    .. include:: ../docstrings/boxplot.rst
-
-    """).format(**_categorical_docs)
-
-
-def violinplot(
-    data=None, *, x=None, y=None, hue=None, order=None, hue_order=None,
-    orient=None, color=None, palette=None, saturation=.75, fill=True,
-    inner="box", split=False, width=.8, dodge="auto", gap=0,
-    linewidth=None, linecolor="auto", cut=2, gridsize=100,
-    bw_method="scott", bw_adjust=1, density_norm="area", common_norm=False,
-    hue_norm=None, formatter=None, log_scale=None, native_scale=False,
-    legend="auto", scale=deprecated, scale_hue=deprecated, bw=deprecated,
-    inner_kws=None, ax=None, **kwargs,
-):
-
-    p = _CategoricalPlotter(
-        data=data,
-        variables=dict(x=x, y=y, hue=hue),
-        order=order,
-        orient=orient,
-        color=color,
-        legend=legend,
-    )
-
-    if ax is None:
-        ax = plt.gca()
-
-    if p.plot_data.empty:
-        return ax
-
-    if dodge == "auto":
-        # Needs to be before scale_categorical changes the coordinate series dtype
-        dodge = p._dodge_needed()
-
-    if p.var_types.get(p.orient) == "categorical" or not native_scale:
-        p.scale_categorical(p.orient, order=order, formatter=formatter)
-
-    p._attach(ax, log_scale=log_scale)
-
-    # Deprecations to remove in v0.14.0.
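# Illustrative sketch (editor's addition, not from the original source): besides
# the hue/palette shims, the violin-specific backcompat below maps scale= to
# density_norm= and bw= to bw_method=. Assuming the bundled "tips" dataset:
#
#     import seaborn as sns
#     tips = sns.load_dataset("tips")
#     # Deprecated spellings
#     sns.violinplot(data=tips, x="day", y="total_bill", scale="count", bw=.5)
#     # Equivalent current spellings
#     sns.violinplot(data=tips, x="day", y="total_bill", density_norm="count", bw_method=.5)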
- hue_order = p._palette_without_hue_backcompat(palette, hue_order) - palette, hue_order = p._hue_backcompat(color, palette, hue_order) - - saturation = saturation if fill else 1 - p.map_hue(palette=palette, order=hue_order, norm=hue_norm, saturation=saturation) - color = _default_color( - ax.fill_between, hue, color, - {k: v for k, v in kwargs.items() if k in ["c", "color", "fc", "facecolor"]}, - saturation=saturation, - ) - linecolor = p._complement_color(linecolor, color, p._hue_map) - - density_norm, common_norm = p._violin_scale_backcompat( - scale, scale_hue, density_norm, common_norm, - ) - - bw_method = p._violin_bw_backcompat(bw, bw_method) - kde_kws = dict(cut=cut, gridsize=gridsize, bw_method=bw_method, bw_adjust=bw_adjust) - inner_kws = {} if inner_kws is None else inner_kws.copy() - - p.plot_violins( - width=width, - dodge=dodge, - gap=gap, - split=split, - color=color, - fill=fill, - linecolor=linecolor, - linewidth=linewidth, - inner=inner, - density_norm=density_norm, - common_norm=common_norm, - kde_kws=kde_kws, - inner_kws=inner_kws, - plot_kws=kwargs, - ) - - p._add_axis_labels(ax) - p._adjust_cat_axis(ax, axis=p.orient) - - return ax - - -violinplot.__doc__ = dedent("""\ - Draw a patch representing a KDE and add observations or box plot statistics. - - A violin plot plays a similar role as a box-and-whisker plot. It shows the - distribution of data points after grouping by one (or more) variables. - Unlike a box plot, each violin is drawn using a kernel density estimate - of the underlying distribution. - - {categorical_narrative} - - Parameters - ---------- - {categorical_data} - {input_params} - {order_vars} - {orient} - {color} - {palette} - {saturation} - {fill} - inner : {{"box", "quart", "point", "stick", None}} - Representation of the data in the violin interior. One of the following: - - - `"box"`: draw a miniature box-and-whisker plot - - `"quart"`: show the quartiles of the data - - `"point"` or `"stick"`: show each observation - split : bool - Show an un-mirrored distribution, alternating sides when using `hue`. - - .. versionchanged:: v0.13.0 - Previously, this option required a `hue` variable with exactly two levels. - {width} - {dodge} - {gap} - {linewidth} - {linecolor} - cut : float - Distance, in units of bandwidth, to extend the density past extreme - datapoints. Set to 0 to limit the violin within the data range. - gridsize : int - Number of points in the discrete grid used to evaluate the KDE. - bw_method : {{"scott", "silverman", float}} - Either the name of a reference rule or the scale factor to use when - computing the kernel bandwidth. The actual kernel size will be - determined by multiplying the scale factor by the standard deviation of - the data within each group. - - .. versionadded:: v0.13.0 - bw_adjust: float - Factor that scales the bandwidth to use more or less smoothing. - - .. versionadded:: v0.13.0 - density_norm : {{"area", "count", "width"}} - Method that normalizes each density to determine the violin's width. - If `area`, each violin will have the same area. If `count`, the width - will be proportional to the number of observations. If `width`, each - violin will have the same width. - - .. versionadded:: v0.13.0 - common_norm : bool - When `True`, normalize the density across all violins. - - .. versionadded:: v0.13.0 - {hue_norm} - {formatter} - {log_scale} - {native_scale} - {legend} - scale : {{"area", "count", "width"}} - .. deprecated:: v0.13.0 - See `density_norm`. - scale_hue : bool - .. 
deprecated:: v0.13.0
-            See `common_norm`.
-    bw : {{'scott', 'silverman', float}}
-        .. deprecated:: v0.13.0
-            See `bw_method` and `bw_adjust`.
-    inner_kws : dict of key, value mappings
-        Keyword arguments for the "inner" plot, passed to one of:
-
-        - :class:`matplotlib.collections.LineCollection` (with `inner="stick"`)
-        - :meth:`matplotlib.axes.Axes.scatter` (with `inner="point"`)
-        - :meth:`matplotlib.axes.Axes.plot` (with `inner="quart"` or `inner="box"`)
-
-        Additionally, with `inner="box"`, the keywords `box_width`, `whis_width`,
-        and `marker` receive special handling for the components of the "box" plot.
-
-        .. versionadded:: v0.13.0
-    {ax_in}
-    kwargs : key, value mappings
-        Keyword arguments for the violin patches, passed through to
-        :meth:`matplotlib.axes.Axes.fill_between`.
-
-    Returns
-    -------
-    {ax_out}
-
-    See Also
-    --------
-    {boxplot}
-    {stripplot}
-    {swarmplot}
-    {catplot}
-
-    Examples
-    --------
-    .. include:: ../docstrings/violinplot.rst
-
-    """).format(**_categorical_docs)
-
-
-def boxenplot(
-    data=None, *, x=None, y=None, hue=None, order=None, hue_order=None,
-    orient=None, color=None, palette=None, saturation=.75, fill=True,
-    dodge="auto", width=.8, gap=0, linewidth=None, linecolor=None,
-    width_method="exponential", k_depth="tukey", outlier_prop=0.007, trust_alpha=0.05,
-    showfliers=True, hue_norm=None, log_scale=None, native_scale=False, formatter=None,
-    legend="auto", scale=deprecated, box_kws=None, flier_kws=None, line_kws=None,
-    ax=None, **kwargs,
-):
-
-    p = _CategoricalPlotter(
-        data=data,
-        variables=dict(x=x, y=y, hue=hue),
-        order=order,
-        orient=orient,
-        color=color,
-        legend=legend,
-    )
-
-    if ax is None:
-        ax = plt.gca()
-
-    if p.plot_data.empty:
-        return ax
-
-    if dodge == "auto":
-        # Needs to be before scale_categorical changes the coordinate series dtype
-        dodge = p._dodge_needed()
-
-    if p.var_types.get(p.orient) == "categorical" or not native_scale:
-        p.scale_categorical(p.orient, order=order, formatter=formatter)
-
-    p._attach(ax, log_scale=log_scale)
-
-    # Deprecations to remove in v0.14.0.
-    hue_order = p._palette_without_hue_backcompat(palette, hue_order)
-    palette, hue_order = p._hue_backcompat(color, palette, hue_order)
-
-    # Longer-term deprecations
-    width_method = p._boxen_scale_backcompat(scale, width_method)
-
-    saturation = saturation if fill else 1
-    p.map_hue(palette=palette, order=hue_order, norm=hue_norm, saturation=saturation)
-    color = _default_color(
-        ax.fill_between, hue, color,
-        {},  # TODO how to get default color?
-        # {k: v for k, v in kwargs.items() if k in ["c", "color", "fc", "facecolor"]},
-        saturation=saturation,
-    )
-    linecolor = p._complement_color(linecolor, color, p._hue_map)
-
-    p.plot_boxens(
-        width=width,
-        dodge=dodge,
-        gap=gap,
-        fill=fill,
-        color=color,
-        linecolor=linecolor,
-        linewidth=linewidth,
-        width_method=width_method,
-        k_depth=k_depth,
-        outlier_prop=outlier_prop,
-        trust_alpha=trust_alpha,
-        showfliers=showfliers,
-        box_kws=box_kws,
-        flier_kws=flier_kws,
-        line_kws=line_kws,
-        plot_kws=kwargs,
-    )
-
-    p._add_axis_labels(ax)
-    p._adjust_cat_axis(ax, axis=p.orient)
-
-    return ax
-
-
-boxenplot.__doc__ = dedent("""\
-    Draw an enhanced box plot for larger datasets.
-
-    This style of plot was originally named a "letter value" plot because it
-    shows a large number of quantiles that are defined as "letter values". It
-    is similar to a box plot in plotting a nonparametric representation of a
-    distribution in which all features correspond to actual observations. 
By plotting more quantiles, it provides more information about the shape of
-    the distribution, particularly in the tails.
-
-    {categorical_narrative}
-
-    Parameters
-    ----------
-    {categorical_data}
-    {input_params}
-    {order_vars}
-    {orient}
-    {color}
-    {palette}
-    {saturation}
-    {fill}
-    {dodge}
-    {width}
-    {gap}
-    {linewidth}
-    {linecolor}
-    width_method : {{"exponential", "linear", "area"}}
-        Method to use for the width of the letter value boxes:
-
-        - `"exponential"`: Represent the corresponding percentile
-        - `"linear"`: Decrease by a constant amount for each box
-        - `"area"`: Represent the density of data points in that box
-    k_depth : {{"tukey", "proportion", "trustworthy", "full"}} or int
-        The number of levels to compute and draw in each tail:
-
-        - `"tukey"`: Use log2(n) - 3 levels, covering similar range as boxplot whiskers
-        - `"proportion"`: Leave approximately `outlier_prop` fliers
-        - `"trustworthy"`: Extend to level with confidence of at least `trust_alpha`
-        - `"full"`: Use log2(n) + 1 levels and extend to most extreme points
-    outlier_prop : float
-        Proportion of data expected to be outliers; used when `k_depth="proportion"`.
-    trust_alpha : float
-        Confidence threshold for most extreme level; used when `k_depth="trustworthy"`.
-    showfliers : bool
-        If False, suppress the plotting of outliers.
-    {hue_norm}
-    {log_scale}
-    {native_scale}
-    {formatter}
-    {legend}
-    box_kws : dict
-        Keyword arguments for the box artists; passed to
-        :class:`matplotlib.patches.Rectangle`.
-
-        .. versionadded:: v0.12.0
-    line_kws : dict
-        Keyword arguments for the line denoting the median; passed to
-        :meth:`matplotlib.axes.Axes.plot`.
-
-        .. versionadded:: v0.12.0
-    flier_kws : dict
-        Keyword arguments for the scatter denoting the outlier observations;
-        passed to :meth:`matplotlib.axes.Axes.scatter`.
-
-        .. versionadded:: v0.12.0
-    {ax_in}
-    kwargs : key, value mappings
-        Other keyword arguments are passed to :class:`matplotlib.patches.Rectangle`,
-        superseded by those in `box_kws`.
-
-    Returns
-    -------
-    {ax_out}
-
-    See Also
-    --------
-    {violinplot}
-    {boxplot}
-    {catplot}
-
-    Notes
-    -----
-
-    For a more extensive explanation, you can read the paper that introduced the plot:
-    https://vita.had.co.nz/papers/letter-value-plot.html
-
-    Examples
-    --------
-    .. include:: ../docstrings/boxenplot.rst
-
-    """).format(**_categorical_docs)
-
-
-def stripplot(
-    data=None, *, x=None, y=None, hue=None, order=None, hue_order=None,
-    jitter=True, dodge=False, orient=None, color=None, palette=None,
-    size=5, edgecolor=default, linewidth=0,
-    hue_norm=None, log_scale=None, native_scale=False, formatter=None, legend="auto",
-    ax=None, **kwargs
-):
-
-    p = _CategoricalPlotter(
-        data=data,
-        variables=dict(x=x, y=y, hue=hue),
-        order=order,
-        orient=orient,
-        color=color,
-        legend=legend,
-    )
-
-    if ax is None:
-        ax = plt.gca()
-
-    if p.plot_data.empty:
-        return ax
-
-    if p.var_types.get(p.orient) == "categorical" or not native_scale:
-        p.scale_categorical(p.orient, order=order, formatter=formatter)
-
-    p._attach(ax, log_scale=log_scale)
-
-    # Deprecations to remove in v0.14.0.
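# Illustrative sketch (editor's addition, not from the original source): per
# plot_strips above, jitter=True maps to a half-width of 0.1 on the categorical
# axis, while a float sets that half-width explicitly. Assuming the bundled
# "tips" dataset:
#
#     import seaborn as sns
#     tips = sns.load_dataset("tips")
#     sns.stripplot(data=tips, x="day", y="total_bill")              # default jitter
#     sns.stripplot(data=tips, x="day", y="total_bill", jitter=.25)  # wider jitter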
-
-
-def swarmplot(
-    data=None, *, x=None, y=None, hue=None, order=None, hue_order=None,
-    dodge=False, orient=None, color=None, palette=None,
-    size=5, edgecolor=None, linewidth=0, hue_norm=None, log_scale=None,
-    native_scale=False, formatter=None, legend="auto", warn_thresh=.05,
-    ax=None, **kwargs
-):
-
-    p = _CategoricalPlotter(
-        data=data,
-        variables=dict(x=x, y=y, hue=hue),
-        order=order,
-        orient=orient,
-        color=color,
-        legend=legend,
-    )
-
-    if ax is None:
-        ax = plt.gca()
-
-    if p.plot_data.empty:
-        return ax
-
-    if p.var_types.get(p.orient) == "categorical" or not native_scale:
-        p.scale_categorical(p.orient, order=order, formatter=formatter)
-
-    p._attach(ax, log_scale=log_scale)
-
-    if not p.has_xy_data:
-        return ax
-
-    # Deprecations to remove in v0.14.0.
-    hue_order = p._palette_without_hue_backcompat(palette, hue_order)
-    palette, hue_order = p._hue_backcompat(color, palette, hue_order)
-
-    p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
-    color = _default_color(ax.scatter, hue, color, kwargs)
-    edgecolor = p._complement_color(edgecolor, color, p._hue_map)
-
-    kwargs.setdefault("zorder", 3)
-    size = kwargs.get("s", size)
-
-    if linewidth is None:
-        linewidth = size / 10
-
-    kwargs.update(dict(
-        s=size ** 2,
-        edgecolor=edgecolor,
-        linewidth=linewidth,
-    ))
-
-    p.plot_swarms(
-        dodge=dodge,
-        color=color,
-        warn_thresh=warn_thresh,
-        plot_kws=kwargs,
-    )
-
-    p._add_axis_labels(ax)
-    p._adjust_cat_axis(ax, axis=p.orient)
-
-    return ax
-
-
-swarmplot.__doc__ = dedent("""\
-    Draw a categorical scatterplot with points adjusted to be non-overlapping.
-
-    This function is similar to :func:`stripplot`, but the points are adjusted
-    (only along the categorical axis) so that they don't overlap. This gives a
-    better representation of the distribution of values, but it does not scale
-    well to large numbers of observations. This style of plot is sometimes
-    called a "beeswarm".
-
-    A swarm plot can be drawn on its own, but it is also a good complement
-    to a box or violin plot in cases where you want to show all observations
-    along with some representation of the underlying distribution.
-
-    {categorical_narrative}
-
-    Parameters
-    ----------
-    {categorical_data}
-    {input_params}
-    {order_vars}
-    dodge : bool
-        When a `hue` variable is assigned, setting this to `True` will
-        separate the swarms for different hue levels along the categorical
-        axis and narrow the amount of space allotted to each strip. Otherwise,
-        the points for each level will be plotted in the same swarm.
-    {orient}
-    {color}
-    {palette}
-    size : float
-        Radius of the markers, in points.
-    edgecolor : matplotlib color, "gray" is special-cased
-        Color of the lines around each point. If you pass `"gray"`, the
-        brightness is determined by the color palette used for the body
-        of the points.
-    {linewidth}
-    {log_scale}
-    {native_scale}
-    {formatter}
-    {legend}
-    {ax_in}
-    kwargs : key, value mappings
-        Other keyword arguments are passed through to
-        :meth:`matplotlib.axes.Axes.scatter`.
-
-    Returns
-    -------
-    {ax_out}
-
-    See Also
-    --------
-    {boxplot}
-    {violinplot}
-    {stripplot}
-    {catplot}
-
-    Examples
-    --------
-    .. include:: ../docstrings/swarmplot.rst
-
-    """).format(**_categorical_docs)
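Because the beeswarm adjustment considers neighbor overlaps point by point, it
gets expensive on dense data, and oversized markers can spill past the gutters
(see the gutter warning in the Beeswarm class further below). A minimal sketch,
same assumptions as above:

    import matplotlib.pyplot as plt
    import seaborn as sns

    tips = sns.load_dataset("tips")
    # Shrink the markers so the swarm fits without triggering the gutter warning
    ax = sns.swarmplot(data=tips, x="day", y="total_bill", size=3)
    plt.show()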
-
-
-def barplot(
-    data=None, *, x=None, y=None, hue=None, order=None, hue_order=None,
-    estimator="mean", errorbar=("ci", 95), n_boot=1000, seed=None, units=None,
-    weights=None, orient=None, color=None, palette=None, saturation=.75,
-    fill=True, hue_norm=None, width=.8, dodge="auto", gap=0, log_scale=None,
-    native_scale=False, formatter=None, legend="auto", capsize=0, err_kws=None,
-    ci=deprecated, errcolor=deprecated, errwidth=deprecated, ax=None, **kwargs,
-):
-
-    errorbar = utils._deprecate_ci(errorbar, ci)
-
-    # Be backwards compatible with len passed directly, which
-    # does not work in Series.agg (maybe a pandas bug?)
- if estimator is len: - estimator = "size" - - p = _CategoricalAggPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, units=units, weight=weights), - order=order, - orient=orient, - color=color, - legend=legend, - ) - - if ax is None: - ax = plt.gca() - - if p.plot_data.empty: - return ax - - if dodge == "auto": - # Needs to be before scale_categorical changes the coordinate series dtype - dodge = p._dodge_needed() - - if p.var_types.get(p.orient) == "categorical" or not native_scale: - p.scale_categorical(p.orient, order=order, formatter=formatter) - - p._attach(ax, log_scale=log_scale) - - # Deprecations to remove in v0.14.0. - hue_order = p._palette_without_hue_backcompat(palette, hue_order) - palette, hue_order = p._hue_backcompat(color, palette, hue_order) - - saturation = saturation if fill else 1 - p.map_hue(palette=palette, order=hue_order, norm=hue_norm, saturation=saturation) - color = _default_color(ax.bar, hue, color, kwargs, saturation=saturation) - - agg_cls = WeightedAggregator if "weight" in p.plot_data else EstimateAggregator - aggregator = agg_cls(estimator, errorbar, n_boot=n_boot, seed=seed) - err_kws = {} if err_kws is None else normalize_kwargs(err_kws, mpl.lines.Line2D) - - # Deprecations to remove in v0.15.0. - err_kws, capsize = p._err_kws_backcompat(err_kws, errcolor, errwidth, capsize) - - p.plot_bars( - aggregator=aggregator, - dodge=dodge, - width=width, - gap=gap, - color=color, - fill=fill, - capsize=capsize, - err_kws=err_kws, - plot_kws=kwargs, - ) - - p._add_axis_labels(ax) - p._adjust_cat_axis(ax, axis=p.orient) - - return ax - - -barplot.__doc__ = dedent("""\ - Show point estimates and errors as rectangular bars. - - A bar plot represents an aggregate or statistical estimate for a numeric - variable with the height of each rectangle and indicates the uncertainty - around that estimate using an error bar. Bar plots include 0 in the - axis range, and they are a good choice when 0 is a meaningful value - for the variable to take. - - {categorical_narrative} - - Parameters - ---------- - {categorical_data} - {input_params} - {order_vars} - {stat_api_params} - {orient} - {color} - {palette} - {saturation} - {fill} - {hue_norm} - {width} - {dodge} - {gap} - {log_scale} - {native_scale} - {formatter} - {legend} - {capsize} - {err_kws} - {ci} - {errcolor} - {errwidth} - {ax_in} - kwargs : key, value mappings - Other parameters are passed through to :class:`matplotlib.patches.Rectangle`. - - Returns - ------- - {ax_out} - - See Also - -------- - {countplot} - {pointplot} - {catplot} - - Notes - ----- - - For datasets where 0 is not a meaningful value, a :func:`pointplot` will - allow you to focus on differences between levels of one or more categorical - variables. - - It is also important to keep in mind that a bar plot shows only the mean (or - other aggregate) value, but it is often more informative to show the - distribution of values at each level of the categorical variables. In those - cases, approaches such as a :func:`boxplot` or :func:`violinplot` may be - more appropriate. - - Examples - -------- - .. 
include:: ../docstrings/barplot.rst - - """).format(**_categorical_docs) - - -def pointplot( - data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, - estimator="mean", errorbar=("ci", 95), n_boot=1000, seed=None, units=None, - weights=None, color=None, palette=None, hue_norm=None, markers=default, - linestyles=default, dodge=False, log_scale=None, native_scale=False, - orient=None, capsize=0, formatter=None, legend="auto", err_kws=None, - ci=deprecated, errwidth=deprecated, join=deprecated, scale=deprecated, - ax=None, **kwargs, -): - - errorbar = utils._deprecate_ci(errorbar, ci) - - p = _CategoricalAggPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, units=units, weight=weights), - order=order, - orient=orient, - # Handle special backwards compatibility where pointplot originally - # did *not* default to multi-colored unless a palette was specified. - color="C0" if (color is None and palette is None) else color, - legend=legend, - ) - - if ax is None: - ax = plt.gca() - - if p.plot_data.empty: - return ax - - if p.var_types.get(p.orient) == "categorical" or not native_scale: - p.scale_categorical(p.orient, order=order, formatter=formatter) - - p._attach(ax, log_scale=log_scale) - - # Deprecations to remove in v0.14.0. - hue_order = p._palette_without_hue_backcompat(palette, hue_order) - palette, hue_order = p._hue_backcompat(color, palette, hue_order) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - color = _default_color(ax.plot, hue, color, kwargs) - - agg_cls = WeightedAggregator if "weight" in p.plot_data else EstimateAggregator - aggregator = agg_cls(estimator, errorbar, n_boot=n_boot, seed=seed) - err_kws = {} if err_kws is None else normalize_kwargs(err_kws, mpl.lines.Line2D) - - # Deprecations to remove in v0.15.0. - p._point_kwargs_backcompat(scale, join, kwargs) - err_kws, capsize = p._err_kws_backcompat(err_kws, None, errwidth, capsize) - - p.plot_points( - aggregator=aggregator, - markers=markers, - linestyles=linestyles, - dodge=dodge, - color=color, - capsize=capsize, - err_kws=err_kws, - plot_kws=kwargs, - ) - - p._add_axis_labels(ax) - p._adjust_cat_axis(ax, axis=p.orient) - - return ax - - -pointplot.__doc__ = dedent("""\ - Show point estimates and errors using lines with markers. - - A point plot represents an estimate of central tendency for a numeric - variable by the position of the dot and provides some indication of the - uncertainty around that estimate using error bars. - - Point plots can be more useful than bar plots for focusing comparisons - between different levels of one or more categorical variables. They are - particularly adept at showing interactions: how the relationship between - levels of one categorical variable changes across levels of a second - categorical variable. The lines that join each point from the same `hue` - level allow interactions to be judged by differences in slope, which is - easier for the eyes than comparing the heights of several groups of points - or bars. - - {categorical_narrative} - - Parameters - ---------- - {categorical_data} - {input_params} - {order_vars} - {stat_api_params} - {color} - {palette} - markers : string or list of strings - Markers to use for each of the `hue` levels. - linestyles : string or list of strings - Line styles to use for each of the `hue` levels. - dodge : bool or float - Amount to separate the points for each level of the `hue` variable along - the categorical axis. Setting to `True` will apply a small default. 
- {log_scale} - {native_scale} - {orient} - {capsize} - {formatter} - {legend} - {err_kws} - {ci} - {errwidth} - join : bool - If `True`, connect point estimates with a line. - - .. deprecated:: v0.13.0 - Set `linestyle="none"` to remove the lines between the points. - scale : float - Scale factor for the plot elements. - - .. deprecated:: v0.13.0 - Control element sizes with :class:`matplotlib.lines.Line2D` parameters. - {ax_in} - kwargs : key, value mappings - Other parameters are passed through to :class:`matplotlib.lines.Line2D`. - - .. versionadded:: v0.13.0 - - Returns - ------- - {ax_out} - - See Also - -------- - {barplot} - {catplot} - - Notes - ----- - It is important to keep in mind that a point plot shows only the mean (or - other estimator) value, but in many cases it may be more informative to - show the distribution of values at each level of the categorical variables. - In that case, other approaches such as a box or violin plot may be more - appropriate. - - Examples - -------- - .. include:: ../docstrings/pointplot.rst - - """).format(**_categorical_docs) - - -def countplot( - data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, - orient=None, color=None, palette=None, saturation=.75, fill=True, hue_norm=None, - stat="count", width=.8, dodge="auto", gap=0, log_scale=None, native_scale=False, - formatter=None, legend="auto", ax=None, **kwargs -): - - if x is None and y is not None: - orient = "y" - x = 1 if list(y) else None - elif x is not None and y is None: - orient = "x" - y = 1 if list(x) else None - elif x is not None and y is not None: - raise TypeError("Cannot pass values for both `x` and `y`.") - - p = _CategoricalAggPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue), - order=order, - orient=orient, - color=color, - legend=legend, - ) - - if ax is None: - ax = plt.gca() - - if p.plot_data.empty: - return ax - - if dodge == "auto": - # Needs to be before scale_categorical changes the coordinate series dtype - dodge = p._dodge_needed() - - if p.var_types.get(p.orient) == "categorical" or not native_scale: - p.scale_categorical(p.orient, order=order, formatter=formatter) - - p._attach(ax, log_scale=log_scale) - - # Deprecations to remove in v0.14.0. - hue_order = p._palette_without_hue_backcompat(palette, hue_order) - palette, hue_order = p._hue_backcompat(color, palette, hue_order) - - saturation = saturation if fill else 1 - p.map_hue(palette=palette, order=hue_order, norm=hue_norm, saturation=saturation) - color = _default_color(ax.bar, hue, color, kwargs, saturation) - - count_axis = {"x": "y", "y": "x"}[p.orient] - if p.input_format == "wide": - p.plot_data[count_axis] = 1 - - _check_argument("stat", ["count", "percent", "probability", "proportion"], stat) - p.variables[count_axis] = stat - if stat != "count": - denom = 100 if stat == "percent" else 1 - p.plot_data[count_axis] /= len(p.plot_data) / denom - - aggregator = EstimateAggregator("sum", errorbar=None) - - p.plot_bars( - aggregator=aggregator, - dodge=dodge, - width=width, - gap=gap, - color=color, - fill=fill, - capsize=0, - err_kws={}, - plot_kws=kwargs, - ) - - p._add_axis_labels(ax) - p._adjust_cat_axis(ax, axis=p.orient) - - return ax - - -countplot.__doc__ = dedent("""\ - Show the counts of observations in each categorical bin using bars. - - A count plot can be thought of as a histogram across a categorical, instead - of quantitative, variable. The basic API and options are identical to those - for :func:`barplot`, so you can compare counts across nested variables. 
- - Note that :func:`histplot` function offers similar functionality with additional - features (e.g. bar stacking), although its default behavior is somewhat different. - - {categorical_narrative} - - Parameters - ---------- - {categorical_data} - {input_params} - {order_vars} - {orient} - {color} - {palette} - {saturation} - {hue_norm} - stat : {{'count', 'percent', 'proportion', 'probability'}} - Statistic to compute; when not `'count'`, bar heights will be normalized so that - they sum to 100 (for `'percent'`) or 1 (otherwise) across the plot. - - .. versionadded:: v0.13.0 - {width} - {dodge} - {log_scale} - {native_scale} - {formatter} - {legend} - {ax_in} - kwargs : key, value mappings - Other parameters are passed through to :class:`matplotlib.patches.Rectangle`. - - Returns - ------- - {ax_out} - - See Also - -------- - histplot : Bin and count observations with additional options. - {barplot} - {catplot} - - Examples - -------- - .. include:: ../docstrings/countplot.rst - - """).format(**_categorical_docs) - - -def catplot( - data=None, *, x=None, y=None, hue=None, row=None, col=None, kind="strip", - estimator="mean", errorbar=("ci", 95), n_boot=1000, seed=None, units=None, - weights=None, order=None, hue_order=None, row_order=None, col_order=None, - col_wrap=None, height=5, aspect=1, log_scale=None, native_scale=False, - formatter=None, orient=None, color=None, palette=None, hue_norm=None, - legend="auto", legend_out=True, sharex=True, sharey=True, - margin_titles=False, facet_kws=None, ci=deprecated, **kwargs -): - - # Check for attempt to plot onto specific axes and warn - if "ax" in kwargs: - msg = ("catplot is a figure-level function and does not accept " - f"target axes. You may wish to try {kind}plot") - warnings.warn(msg, UserWarning) - kwargs.pop("ax") - - desaturated_kinds = ["bar", "count", "box", "violin", "boxen"] - undodged_kinds = ["strip", "swarm", "point"] - - if kind in ["bar", "point", "count"]: - Plotter = _CategoricalAggPlotter - else: - Plotter = _CategoricalPlotter - - if kind == "count": - if x is None and y is not None: - orient = "y" - x = 1 - elif x is not None and y is None: - orient = "x" - y = 1 - elif x is not None and y is not None: - raise ValueError("Cannot pass values for both `x` and `y`.") - - p = Plotter( - data=data, - variables=dict( - x=x, y=y, hue=hue, row=row, col=col, units=units, weight=weights - ), - order=order, - orient=orient, - # Handle special backwards compatibility where pointplot originally - # did *not* default to multi-colored unless a palette was specified. - color="C0" if kind == "point" and palette is None and color is None else color, - legend=legend, - ) - - for var in ["row", "col"]: - # Handle faceting variables that lack name information - if var in p.variables and p.variables[var] is None: - p.variables[var] = f"_{var}_" - - # Adapt the plot_data dataframe for use with FacetGrid - facet_data = p.plot_data.rename(columns=p.variables) - facet_data = facet_data.loc[:, ~facet_data.columns.duplicated()] - - col_name = p.variables.get("col", None) - row_name = p.variables.get("row", None) - - if facet_kws is None: - facet_kws = {} - - g = FacetGrid( - data=facet_data, row=row_name, col=col_name, col_wrap=col_wrap, - row_order=row_order, col_order=col_order, sharex=sharex, sharey=sharey, - legend_out=legend_out, margin_titles=margin_titles, - height=height, aspect=aspect, - **facet_kws, - ) - - # Capture this here because scale_categorical is going to insert a (null) - # x variable even if it is empty. 
It's not clear whether that needs to
-    # happen or if disabling that is the cleaner solution.
-    has_xy_data = p.has_xy_data
-
-    if not native_scale or p.var_types[p.orient] == "categorical":
-        p.scale_categorical(p.orient, order=order, formatter=formatter)
-
-    p._attach(g, log_scale=log_scale)
-
-    if not has_xy_data:
-        return g
-
-    # Deprecations to remove in v0.14.0.
-    hue_order = p._palette_without_hue_backcompat(palette, hue_order)
-    palette, hue_order = p._hue_backcompat(color, palette, hue_order)
-
-    # Other deprecations
-    errorbar = utils._deprecate_ci(errorbar, ci)
-
-    saturation = kwargs.pop(
-        "saturation",
-        0.75 if kind in desaturated_kinds and kwargs.get("fill", True) else 1
-    )
-    p.map_hue(palette=palette, order=hue_order, norm=hue_norm, saturation=saturation)
-
-    # Set a default color
-    # Otherwise each artist will be plotted separately and trip the color cycle
-    if hue is None:
-        color = "C0" if color is None else color
-        if saturation < 1:
-            color = desaturate(color, saturation)
-
-    if kind in ["strip", "swarm"]:
-        kwargs = normalize_kwargs(kwargs, mpl.collections.PathCollection)
-        kwargs["edgecolor"] = p._complement_color(
-            kwargs.pop("edgecolor", default), color, p._hue_map
-        )
-
-    width = kwargs.pop("width", 0.8)
-    dodge = kwargs.pop("dodge", False if kind in undodged_kinds else "auto")
-    if dodge == "auto":
-        dodge = p._dodge_needed()
-
-    if "weight" in p.plot_data:
-        if kind not in ["bar", "point"]:
-            msg = f"The `weights` parameter has no effect with kind={kind!r}."
-            warnings.warn(msg, stacklevel=2)
-        agg_cls = WeightedAggregator
-    else:
-        agg_cls = EstimateAggregator
-
-    if kind == "strip":
-
-        jitter = kwargs.pop("jitter", True)
-        plot_kws = kwargs.copy()
-        plot_kws.setdefault("zorder", 3)
-        plot_kws.setdefault("linewidth", 0)
-        if "s" not in plot_kws:
-            plot_kws["s"] = plot_kws.pop("size", 5) ** 2
-
-        p.plot_strips(
-            jitter=jitter,
-            dodge=dodge,
-            color=color,
-            plot_kws=plot_kws,
-        )
-
-    elif kind == "swarm":
-
-        warn_thresh = kwargs.pop("warn_thresh", .05)
-        plot_kws = kwargs.copy()
-        plot_kws.setdefault("zorder", 3)
-        if "s" not in plot_kws:
-            plot_kws["s"] = plot_kws.pop("size", 5) ** 2
-
-        if plot_kws.setdefault("linewidth", 0) is None:
-            plot_kws["linewidth"] = np.sqrt(plot_kws["s"]) / 10
-
-        p.plot_swarms(
-            dodge=dodge,
-            color=color,
-            warn_thresh=warn_thresh,
-            plot_kws=plot_kws,
-        )
-
-    elif kind == "box":
-
-        plot_kws = kwargs.copy()
-        gap = plot_kws.pop("gap", 0)
-        fill = plot_kws.pop("fill", True)
-        whis = plot_kws.pop("whis", 1.5)
-        linewidth = plot_kws.pop("linewidth", None)
-        fliersize = plot_kws.pop("fliersize", 5)
-        linecolor = p._complement_color(
-            plot_kws.pop("linecolor", "auto"), color, p._hue_map
-        )
-
-        p.plot_boxes(
-            width=width,
-            dodge=dodge,
-            gap=gap,
-            fill=fill,
-            whis=whis,
-            color=color,
-            linecolor=linecolor,
-            linewidth=linewidth,
-            fliersize=fliersize,
-            plot_kws=plot_kws,
-        )
-
-    elif kind == "violin":
-
-        plot_kws = kwargs.copy()
-        gap = plot_kws.pop("gap", 0)
-        fill = plot_kws.pop("fill", True)
-        split = plot_kws.pop("split", False)
-        inner = plot_kws.pop("inner", "box")
-        density_norm = plot_kws.pop("density_norm", "area")
-        common_norm = plot_kws.pop("common_norm", False)
-
-        scale = plot_kws.pop("scale", deprecated)
-        scale_hue = plot_kws.pop("scale_hue", deprecated)
-        density_norm, common_norm = p._violin_scale_backcompat(
-            scale, scale_hue, density_norm, common_norm,
-        )
-
-        bw_method = p._violin_bw_backcompat(
-            plot_kws.pop("bw", deprecated), plot_kws.pop("bw_method", "scott")
-        )
-        kde_kws = dict(
cut=plot_kws.pop("cut", 2), - gridsize=plot_kws.pop("gridsize", 100), - bw_adjust=plot_kws.pop("bw_adjust", 1), - bw_method=bw_method, - ) - - inner_kws = plot_kws.pop("inner_kws", {}).copy() - linewidth = plot_kws.pop("linewidth", None) - linecolor = plot_kws.pop("linecolor", "auto") - linecolor = p._complement_color(linecolor, color, p._hue_map) - - p.plot_violins( - width=width, - dodge=dodge, - gap=gap, - split=split, - color=color, - fill=fill, - linecolor=linecolor, - linewidth=linewidth, - inner=inner, - density_norm=density_norm, - common_norm=common_norm, - kde_kws=kde_kws, - inner_kws=inner_kws, - plot_kws=plot_kws, - ) - - elif kind == "boxen": - - plot_kws = kwargs.copy() - gap = plot_kws.pop("gap", 0) - fill = plot_kws.pop("fill", True) - linecolor = plot_kws.pop("linecolor", "auto") - linewidth = plot_kws.pop("linewidth", None) - k_depth = plot_kws.pop("k_depth", "tukey") - width_method = plot_kws.pop("width_method", "exponential") - outlier_prop = plot_kws.pop("outlier_prop", 0.007) - trust_alpha = plot_kws.pop("trust_alpha", 0.05) - showfliers = plot_kws.pop("showfliers", True) - box_kws = plot_kws.pop("box_kws", {}) - flier_kws = plot_kws.pop("flier_kws", {}) - line_kws = plot_kws.pop("line_kws", {}) - if "scale" in plot_kws: - width_method = p._boxen_scale_backcompat( - plot_kws["scale"], width_method - ) - linecolor = p._complement_color(linecolor, color, p._hue_map) - - p.plot_boxens( - width=width, - dodge=dodge, - gap=gap, - fill=fill, - color=color, - linecolor=linecolor, - linewidth=linewidth, - width_method=width_method, - k_depth=k_depth, - outlier_prop=outlier_prop, - trust_alpha=trust_alpha, - showfliers=showfliers, - box_kws=box_kws, - flier_kws=flier_kws, - line_kws=line_kws, - plot_kws=plot_kws, - ) - - elif kind == "point": - - aggregator = agg_cls(estimator, errorbar, n_boot=n_boot, seed=seed) - - markers = kwargs.pop("markers", default) - linestyles = kwargs.pop("linestyles", default) - - # Deprecations to remove in v0.15.0. 
- # TODO Uncomment when removing deprecation backcompat - # capsize = kwargs.pop("capsize", 0) - # err_kws = normalize_kwargs(kwargs.pop("err_kws", {}), mpl.lines.Line2D) - p._point_kwargs_backcompat( - kwargs.pop("scale", deprecated), - kwargs.pop("join", deprecated), - kwargs - ) - err_kws, capsize = p._err_kws_backcompat( - normalize_kwargs(kwargs.pop("err_kws", {}), mpl.lines.Line2D), - None, - errwidth=kwargs.pop("errwidth", deprecated), - capsize=kwargs.pop("capsize", 0), - ) - - p.plot_points( - aggregator=aggregator, - markers=markers, - linestyles=linestyles, - dodge=dodge, - color=color, - capsize=capsize, - err_kws=err_kws, - plot_kws=kwargs, - ) - - elif kind == "bar": - - aggregator = agg_cls(estimator, errorbar, n_boot=n_boot, seed=seed) - - err_kws, capsize = p._err_kws_backcompat( - normalize_kwargs(kwargs.pop("err_kws", {}), mpl.lines.Line2D), - errcolor=kwargs.pop("errcolor", deprecated), - errwidth=kwargs.pop("errwidth", deprecated), - capsize=kwargs.pop("capsize", 0), - ) - gap = kwargs.pop("gap", 0) - fill = kwargs.pop("fill", True) - - p.plot_bars( - aggregator=aggregator, - dodge=dodge, - width=width, - gap=gap, - color=color, - fill=fill, - capsize=capsize, - err_kws=err_kws, - plot_kws=kwargs, - ) - - elif kind == "count": - - aggregator = EstimateAggregator("sum", errorbar=None) - - count_axis = {"x": "y", "y": "x"}[p.orient] - p.plot_data[count_axis] = 1 - - stat_options = ["count", "percent", "probability", "proportion"] - stat = _check_argument("stat", stat_options, kwargs.pop("stat", "count")) - p.variables[count_axis] = stat - if stat != "count": - denom = 100 if stat == "percent" else 1 - p.plot_data[count_axis] /= len(p.plot_data) / denom - - gap = kwargs.pop("gap", 0) - fill = kwargs.pop("fill", True) - - p.plot_bars( - aggregator=aggregator, - dodge=dodge, - width=width, - gap=gap, - color=color, - fill=fill, - capsize=0, - err_kws={}, - plot_kws=kwargs, - ) - - else: - msg = ( - f"Invalid `kind`: {kind!r}. Options are 'strip', 'swarm', " - "'box', 'boxen', 'violin', 'bar', 'count', and 'point'." - ) - raise ValueError(msg) - - for ax in g.axes.flat: - p._adjust_cat_axis(ax, axis=p.orient) - - g.set_axis_labels(p.variables.get("x"), p.variables.get("y")) - g.set_titles() - g.tight_layout() - - for ax in g.axes.flat: - g._update_legend_data(ax) - ax.legend_ = None - - if legend == "auto": - show_legend = not p._redundant_hue and p.input_format != "wide" - else: - show_legend = bool(legend) - if show_legend: - g.add_legend(title=p.variables.get("hue"), label_order=hue_order) - - if data is not None: - # Replace the dataframe on the FacetGrid for any subsequent maps - g.data = data - - return g - - -catplot.__doc__ = dedent("""\ - Figure-level interface for drawing categorical plots onto a FacetGrid. - - This function provides access to several axes-level functions that - show the relationship between a numerical and one or more categorical - variables using one of several visual representations. The `kind` - parameter selects the underlying axes-level function to use. 
- - Categorical scatterplots: - - - :func:`stripplot` (with `kind="strip"`; the default) - - :func:`swarmplot` (with `kind="swarm"`) - - Categorical distribution plots: - - - :func:`boxplot` (with `kind="box"`) - - :func:`violinplot` (with `kind="violin"`) - - :func:`boxenplot` (with `kind="boxen"`) - - Categorical estimate plots: - - - :func:`pointplot` (with `kind="point"`) - - :func:`barplot` (with `kind="bar"`) - - :func:`countplot` (with `kind="count"`) - - Extra keyword arguments are passed to the underlying function, so you - should refer to the documentation for each to see kind-specific options. - - {categorical_narrative} - - After plotting, the :class:`FacetGrid` with the plot is returned and can - be used directly to tweak supporting plot details or add other layers. - - Parameters - ---------- - {categorical_data} - {input_params} - row, col : names of variables in `data` or vector data - Categorical variables that will determine the faceting of the grid. - kind : str - The kind of plot to draw, corresponds to the name of a categorical - axes-level plotting function. Options are: "strip", "swarm", "box", "violin", - "boxen", "point", "bar", or "count". - {stat_api_params} - {order_vars} - row_order, col_order : lists of strings - Order to organize the rows and/or columns of the grid in; otherwise the - orders are inferred from the data objects. - {col_wrap} - {height} - {aspect} - {native_scale} - {formatter} - {orient} - {color} - {palette} - {hue_norm} - {legend} - {legend_out} - {share_xy} - {margin_titles} - facet_kws : dict - Dictionary of other keyword arguments to pass to :class:`FacetGrid`. - kwargs : key, value pairings - Other keyword arguments are passed through to the underlying plotting - function. - - Returns - ------- - :class:`FacetGrid` - Returns the :class:`FacetGrid` object with the plot on it for further - tweaking. - - Examples - -------- - .. include:: ../docstrings/catplot.rst - - """).format(**_categorical_docs) - - -class Beeswarm: - """Modifies a scatterplot artist to show a beeswarm plot.""" - def __init__(self, orient="x", width=0.8, warn_thresh=.05): - - self.orient = orient - self.width = width - self.warn_thresh = warn_thresh - - def __call__(self, points, center): - """Swarm `points`, a PathCollection, around the `center` position.""" - # Convert from point size (area) to diameter - - ax = points.axes - dpi = ax.figure.dpi - - # Get the original positions of the points - orig_xy_data = points.get_offsets() - - # Reset the categorical positions to the center line - cat_idx = 1 if self.orient == "y" else 0 - orig_xy_data[:, cat_idx] = center - - # Transform the data coordinates to point coordinates. 
- # We'll figure out the swarm positions in the latter - # and then convert back to data coordinates and replot - orig_x_data, orig_y_data = orig_xy_data.T - orig_xy = ax.transData.transform(orig_xy_data) - - # Order the variables so that x is the categorical axis - if self.orient == "y": - orig_xy = orig_xy[:, [1, 0]] - - # Add a column with each point's radius - sizes = points.get_sizes() - if sizes.size == 1: - sizes = np.repeat(sizes, orig_xy.shape[0]) - edge = points.get_linewidth().item() - radii = (np.sqrt(sizes) + edge) / 2 * (dpi / 72) - orig_xy = np.c_[orig_xy, radii] - - # Sort along the value axis to facilitate the beeswarm - sorter = np.argsort(orig_xy[:, 1]) - orig_xyr = orig_xy[sorter] - - # Adjust points along the categorical axis to prevent overlaps - new_xyr = np.empty_like(orig_xyr) - new_xyr[sorter] = self.beeswarm(orig_xyr) - - # Transform the point coordinates back to data coordinates - if self.orient == "y": - new_xy = new_xyr[:, [1, 0]] - else: - new_xy = new_xyr[:, :2] - new_x_data, new_y_data = ax.transData.inverted().transform(new_xy).T - - # Add gutters - t_fwd, t_inv = _get_transform_functions(ax, self.orient) - if self.orient == "y": - self.add_gutters(new_y_data, center, t_fwd, t_inv) - else: - self.add_gutters(new_x_data, center, t_fwd, t_inv) - - # Reposition the points so they do not overlap - if self.orient == "y": - points.set_offsets(np.c_[orig_x_data, new_y_data]) - else: - points.set_offsets(np.c_[new_x_data, orig_y_data]) - - def beeswarm(self, orig_xyr): - """Adjust x position of points to avoid overlaps.""" - # In this method, `x` is always the categorical axis - # Center of the swarm, in point coordinates - midline = orig_xyr[0, 0] - - # Start the swarm with the first point - swarm = np.atleast_2d(orig_xyr[0]) - - # Loop over the remaining points - for xyr_i in orig_xyr[1:]: - - # Find the points in the swarm that could possibly - # overlap with the point we are currently placing - neighbors = self.could_overlap(xyr_i, swarm) - - # Find positions that would be valid individually - # with respect to each of the swarm neighbors - candidates = self.position_candidates(xyr_i, neighbors) - - # Sort candidates by their centrality - offsets = np.abs(candidates[:, 0] - midline) - candidates = candidates[np.argsort(offsets)] - - # Find the first candidate that does not overlap any neighbors - new_xyr_i = self.first_non_overlapping_candidate(candidates, neighbors) - - # Place it into the swarm - swarm = np.vstack([swarm, new_xyr_i]) - - return swarm - - def could_overlap(self, xyr_i, swarm): - """Return a list of all swarm points that could overlap with target.""" - # Because we work backwards through the swarm and can short-circuit, - # the for-loop is faster than vectorization - _, y_i, r_i = xyr_i - neighbors = [] - for xyr_j in reversed(swarm): - _, y_j, r_j = xyr_j - if (y_i - y_j) < (r_i + r_j): - neighbors.append(xyr_j) - else: - break - return np.array(neighbors)[::-1] - - def position_candidates(self, xyr_i, neighbors): - """Return a list of coordinates that might be valid by adjusting x.""" - candidates = [xyr_i] - x_i, y_i, r_i = xyr_i - left_first = True - for x_j, y_j, r_j in neighbors: - dy = y_i - y_j - dx = np.sqrt(max((r_i + r_j) ** 2 - dy ** 2, 0)) * 1.05 - cl, cr = (x_j - dx, y_i, r_i), (x_j + dx, y_i, r_i) - if left_first: - new_candidates = [cl, cr] - else: - new_candidates = [cr, cl] - candidates.extend(new_candidates) - left_first = not left_first - return np.array(candidates) - - def first_non_overlapping_candidate(self, candidates, 
neighbors):
-        """Find the first candidate that does not overlap with the swarm."""
-
-        # If we have no neighbors, all candidates are good.
-        if len(neighbors) == 0:
-            return candidates[0]
-
-        neighbors_x = neighbors[:, 0]
-        neighbors_y = neighbors[:, 1]
-        neighbors_r = neighbors[:, 2]
-
-        for xyr_i in candidates:
-
-            x_i, y_i, r_i = xyr_i
-
-            dx = neighbors_x - x_i
-            dy = neighbors_y - y_i
-            sq_distances = np.square(dx) + np.square(dy)
-
-            sep_needed = np.square(neighbors_r + r_i)
-
-            # A good candidate does not overlap any of the neighbors, which means
-            # that the squared distance between the candidate and each neighbor
-            # must be at least the square of the summed radii
-            good_candidate = np.all(sq_distances >= sep_needed)
-
-            if good_candidate:
-                return xyr_i
-
-        raise RuntimeError(
-            "No non-overlapping candidates found. This should not happen."
-        )
-
-    def add_gutters(self, points, center, trans_fwd, trans_inv):
-        """Stop points from extending beyond their territory."""
-        half_width = self.width / 2
-        low_gutter = trans_inv(trans_fwd(center) - half_width)
-        off_low = points < low_gutter
-        if off_low.any():
-            points[off_low] = low_gutter
-        high_gutter = trans_inv(trans_fwd(center) + half_width)
-        off_high = points > high_gutter
-        if off_high.any():
-            points[off_high] = high_gutter
-
-        gutter_prop = (off_high + off_low).sum() / len(points)
-        if gutter_prop > self.warn_thresh:
-            msg = (
-                "{:.1%} of the points cannot be placed; you may want "
-                "to decrease the size of the markers or use stripplot."
-            ).format(gutter_prop)
-            warnings.warn(msg, UserWarning)
-
-        return points
-
-
-BoxPlotArtists = namedtuple("BoxPlotArtists", "box median whiskers caps fliers mean")
-
-
-class BoxPlotContainer:
-
-    def __init__(self, artist_dict):
-
-        self.boxes = artist_dict["boxes"]
-        self.medians = artist_dict["medians"]
-        self.whiskers = artist_dict["whiskers"]
-        self.caps = artist_dict["caps"]
-        self.fliers = artist_dict["fliers"]
-        self.means = artist_dict["means"]
-
-        self._label = None
-        self._children = [
-            *self.boxes,
-            *self.medians,
-            *self.whiskers,
-            *self.caps,
-            *self.fliers,
-            *self.means,
-        ]
-
-    def __repr__(self):
-        return f"<BoxPlotContainer object with {len(self.boxes)} boxes>"
-
-    def __getitem__(self, idx):
-        pair_slice = slice(2 * idx, 2 * idx + 2)
-        return BoxPlotArtists(
-            self.boxes[idx] if self.boxes else [],
-            self.medians[idx] if self.medians else [],
-            self.whiskers[pair_slice] if self.whiskers else [],
-            self.caps[pair_slice] if self.caps else [],
-            self.fliers[idx] if self.fliers else [],
-            self.means[idx] if self.means else [],
-        )
-
-    def __iter__(self):
-        yield from (self[i] for i in range(len(self.boxes)))
-
-    def get_label(self):
-        return self._label
-
-    def set_label(self, value):
-        self._label = value
-
-    def get_children(self):
-        return self._children
-
-    def remove(self):
-        for child in self._children:
-            child.remove()
diff --git a/seaborn/cm.py b/seaborn/cm.py
deleted file mode 100644
index df7ce61997882d7d7f734052292438e4234a5cc7..0000000000000000000000000000000000000000
--- a/seaborn/cm.py
+++ /dev/null
@@ -1,1586 +0,0 @@
-from matplotlib import colors
-from seaborn._compat import register_colormap
-
-
-_rocket_lut = [
-    [ 0.01060815, 0.01808215, 0.10018654],
-    [ 0.01428972, 0.02048237, 0.10374486],
-    [ 0.01831941, 0.0229766 , 0.10738511],
-    [ 0.02275049, 0.02554464, 0.11108639],
-    [ 0.02759119, 0.02818316, 0.11483751],
-    [ 0.03285175, 0.03088792, 0.11863035],
-    [ 0.03853466, 0.03365771, 0.12245873],
-    [ 0.04447016, 0.03648425, 0.12631831],
-    [ 0.05032105, 0.03936808,
0.13020508], - [ 0.05611171, 0.04224835, 0.13411624], - [ 0.0618531 , 0.04504866, 0.13804929], - [ 0.06755457, 0.04778179, 0.14200206], - [ 0.0732236 , 0.05045047, 0.14597263], - [ 0.0788708 , 0.05305461, 0.14995981], - [ 0.08450105, 0.05559631, 0.15396203], - [ 0.09011319, 0.05808059, 0.15797687], - [ 0.09572396, 0.06050127, 0.16200507], - [ 0.10132312, 0.06286782, 0.16604287], - [ 0.10692823, 0.06517224, 0.17009175], - [ 0.1125315 , 0.06742194, 0.17414848], - [ 0.11813947, 0.06961499, 0.17821272], - [ 0.12375803, 0.07174938, 0.18228425], - [ 0.12938228, 0.07383015, 0.18636053], - [ 0.13501631, 0.07585609, 0.19044109], - [ 0.14066867, 0.0778224 , 0.19452676], - [ 0.14633406, 0.07973393, 0.1986151 ], - [ 0.15201338, 0.08159108, 0.20270523], - [ 0.15770877, 0.08339312, 0.20679668], - [ 0.16342174, 0.0851396 , 0.21088893], - [ 0.16915387, 0.08682996, 0.21498104], - [ 0.17489524, 0.08848235, 0.2190294 ], - [ 0.18065495, 0.09009031, 0.22303512], - [ 0.18643324, 0.09165431, 0.22699705], - [ 0.19223028, 0.09317479, 0.23091409], - [ 0.19804623, 0.09465217, 0.23478512], - [ 0.20388117, 0.09608689, 0.23860907], - [ 0.20973515, 0.09747934, 0.24238489], - [ 0.21560818, 0.09882993, 0.24611154], - [ 0.22150014, 0.10013944, 0.2497868 ], - [ 0.22741085, 0.10140876, 0.25340813], - [ 0.23334047, 0.10263737, 0.25697736], - [ 0.23928891, 0.10382562, 0.2604936 ], - [ 0.24525608, 0.10497384, 0.26395596], - [ 0.25124182, 0.10608236, 0.26736359], - [ 0.25724602, 0.10715148, 0.27071569], - [ 0.26326851, 0.1081815 , 0.27401148], - [ 0.26930915, 0.1091727 , 0.2772502 ], - [ 0.27536766, 0.11012568, 0.28043021], - [ 0.28144375, 0.11104133, 0.2835489 ], - [ 0.2875374 , 0.11191896, 0.28660853], - [ 0.29364846, 0.11275876, 0.2896085 ], - [ 0.29977678, 0.11356089, 0.29254823], - [ 0.30592213, 0.11432553, 0.29542718], - [ 0.31208435, 0.11505284, 0.29824485], - [ 0.31826327, 0.1157429 , 0.30100076], - [ 0.32445869, 0.11639585, 0.30369448], - [ 0.33067031, 0.11701189, 0.30632563], - [ 0.33689808, 0.11759095, 0.3088938 ], - [ 0.34314168, 0.11813362, 0.31139721], - [ 0.34940101, 0.11863987, 0.3138355 ], - [ 0.355676 , 0.11910909, 0.31620996], - [ 0.36196644, 0.1195413 , 0.31852037], - [ 0.36827206, 0.11993653, 0.32076656], - [ 0.37459292, 0.12029443, 0.32294825], - [ 0.38092887, 0.12061482, 0.32506528], - [ 0.38727975, 0.12089756, 0.3271175 ], - [ 0.39364518, 0.12114272, 0.32910494], - [ 0.40002537, 0.12134964, 0.33102734], - [ 0.40642019, 0.12151801, 0.33288464], - [ 0.41282936, 0.12164769, 0.33467689], - [ 0.41925278, 0.12173833, 0.33640407], - [ 0.42569057, 0.12178916, 0.33806605], - [ 0.43214263, 0.12179973, 0.33966284], - [ 0.43860848, 0.12177004, 0.34119475], - [ 0.44508855, 0.12169883, 0.34266151], - [ 0.45158266, 0.12158557, 0.34406324], - [ 0.45809049, 0.12142996, 0.34540024], - [ 0.46461238, 0.12123063, 0.34667231], - [ 0.47114798, 0.12098721, 0.34787978], - [ 0.47769736, 0.12069864, 0.34902273], - [ 0.48426077, 0.12036349, 0.35010104], - [ 0.49083761, 0.11998161, 0.35111537], - [ 0.49742847, 0.11955087, 0.35206533], - [ 0.50403286, 0.11907081, 0.35295152], - [ 0.51065109, 0.11853959, 0.35377385], - [ 0.51728314, 0.1179558 , 0.35453252], - [ 0.52392883, 0.11731817, 0.35522789], - [ 0.53058853, 0.11662445, 0.35585982], - [ 0.53726173, 0.11587369, 0.35642903], - [ 0.54394898, 0.11506307, 0.35693521], - [ 0.5506426 , 0.11420757, 0.35737863], - [ 0.55734473, 0.11330456, 0.35775059], - [ 0.56405586, 0.11235265, 0.35804813], - [ 0.57077365, 0.11135597, 0.35827146], - [ 0.5774991 , 0.11031233, 0.35841679], - [ 0.58422945, 
0.10922707, 0.35848469], - [ 0.59096382, 0.10810205, 0.35847347], - [ 0.59770215, 0.10693774, 0.35838029], - [ 0.60444226, 0.10573912, 0.35820487], - [ 0.61118304, 0.10450943, 0.35794557], - [ 0.61792306, 0.10325288, 0.35760108], - [ 0.62466162, 0.10197244, 0.35716891], - [ 0.63139686, 0.10067417, 0.35664819], - [ 0.63812122, 0.09938212, 0.35603757], - [ 0.64483795, 0.0980891 , 0.35533555], - [ 0.65154562, 0.09680192, 0.35454107], - [ 0.65824241, 0.09552918, 0.3536529 ], - [ 0.66492652, 0.09428017, 0.3526697 ], - [ 0.67159578, 0.09306598, 0.35159077], - [ 0.67824099, 0.09192342, 0.3504148 ], - [ 0.684863 , 0.09085633, 0.34914061], - [ 0.69146268, 0.0898675 , 0.34776864], - [ 0.69803757, 0.08897226, 0.3462986 ], - [ 0.70457834, 0.0882129 , 0.34473046], - [ 0.71108138, 0.08761223, 0.3430635 ], - [ 0.7175507 , 0.08716212, 0.34129974], - [ 0.72398193, 0.08688725, 0.33943958], - [ 0.73035829, 0.0868623 , 0.33748452], - [ 0.73669146, 0.08704683, 0.33543669], - [ 0.74297501, 0.08747196, 0.33329799], - [ 0.74919318, 0.08820542, 0.33107204], - [ 0.75535825, 0.08919792, 0.32876184], - [ 0.76145589, 0.09050716, 0.32637117], - [ 0.76748424, 0.09213602, 0.32390525], - [ 0.77344838, 0.09405684, 0.32136808], - [ 0.77932641, 0.09634794, 0.31876642], - [ 0.78513609, 0.09892473, 0.31610488], - [ 0.79085854, 0.10184672, 0.313391 ], - [ 0.7965014 , 0.10506637, 0.31063031], - [ 0.80205987, 0.10858333, 0.30783 ], - [ 0.80752799, 0.11239964, 0.30499738], - [ 0.81291606, 0.11645784, 0.30213802], - [ 0.81820481, 0.12080606, 0.29926105], - [ 0.82341472, 0.12535343, 0.2963705 ], - [ 0.82852822, 0.13014118, 0.29347474], - [ 0.83355779, 0.13511035, 0.29057852], - [ 0.83850183, 0.14025098, 0.2876878 ], - [ 0.84335441, 0.14556683, 0.28480819], - [ 0.84813096, 0.15099892, 0.281943 ], - [ 0.85281737, 0.15657772, 0.27909826], - [ 0.85742602, 0.1622583 , 0.27627462], - [ 0.86196552, 0.16801239, 0.27346473], - [ 0.86641628, 0.17387796, 0.27070818], - [ 0.87079129, 0.17982114, 0.26797378], - [ 0.87507281, 0.18587368, 0.26529697], - [ 0.87925878, 0.19203259, 0.26268136], - [ 0.8833417 , 0.19830556, 0.26014181], - [ 0.88731387, 0.20469941, 0.25769539], - [ 0.89116859, 0.21121788, 0.2553592 ], - [ 0.89490337, 0.21785614, 0.25314362], - [ 0.8985026 , 0.22463251, 0.25108745], - [ 0.90197527, 0.23152063, 0.24918223], - [ 0.90530097, 0.23854541, 0.24748098], - [ 0.90848638, 0.24568473, 0.24598324], - [ 0.911533 , 0.25292623, 0.24470258], - [ 0.9144225 , 0.26028902, 0.24369359], - [ 0.91717106, 0.26773821, 0.24294137], - [ 0.91978131, 0.27526191, 0.24245973], - [ 0.92223947, 0.28287251, 0.24229568], - [ 0.92456587, 0.29053388, 0.24242622], - [ 0.92676657, 0.29823282, 0.24285536], - [ 0.92882964, 0.30598085, 0.24362274], - [ 0.93078135, 0.31373977, 0.24468803], - [ 0.93262051, 0.3215093 , 0.24606461], - [ 0.93435067, 0.32928362, 0.24775328], - [ 0.93599076, 0.33703942, 0.24972157], - [ 0.93752831, 0.34479177, 0.25199928], - [ 0.93899289, 0.35250734, 0.25452808], - [ 0.94036561, 0.36020899, 0.25734661], - [ 0.94167588, 0.36786594, 0.2603949 ], - [ 0.94291042, 0.37549479, 0.26369821], - [ 0.94408513, 0.3830811 , 0.26722004], - [ 0.94520419, 0.39062329, 0.27094924], - [ 0.94625977, 0.39813168, 0.27489742], - [ 0.94727016, 0.4055909 , 0.27902322], - [ 0.94823505, 0.41300424, 0.28332283], - [ 0.94914549, 0.42038251, 0.28780969], - [ 0.95001704, 0.42771398, 0.29244728], - [ 0.95085121, 0.43500005, 0.29722817], - [ 0.95165009, 0.44224144, 0.30214494], - [ 0.9524044 , 0.44944853, 0.3072105 ], - [ 0.95312556, 0.45661389, 0.31239776], - [ 
0.95381595, 0.46373781, 0.31769923], - [ 0.95447591, 0.47082238, 0.32310953], - [ 0.95510255, 0.47787236, 0.32862553], - [ 0.95569679, 0.48489115, 0.33421404], - [ 0.95626788, 0.49187351, 0.33985601], - [ 0.95681685, 0.49882008, 0.34555431], - [ 0.9573439 , 0.50573243, 0.35130912], - [ 0.95784842, 0.51261283, 0.35711942], - [ 0.95833051, 0.51946267, 0.36298589], - [ 0.95879054, 0.52628305, 0.36890904], - [ 0.95922872, 0.53307513, 0.3748895 ], - [ 0.95964538, 0.53983991, 0.38092784], - [ 0.96004345, 0.54657593, 0.3870292 ], - [ 0.96042097, 0.55328624, 0.39319057], - [ 0.96077819, 0.55997184, 0.39941173], - [ 0.9611152 , 0.5666337 , 0.40569343], - [ 0.96143273, 0.57327231, 0.41203603], - [ 0.96173392, 0.57988594, 0.41844491], - [ 0.96201757, 0.58647675, 0.42491751], - [ 0.96228344, 0.59304598, 0.43145271], - [ 0.96253168, 0.5995944 , 0.43805131], - [ 0.96276513, 0.60612062, 0.44471698], - [ 0.96298491, 0.6126247 , 0.45145074], - [ 0.96318967, 0.61910879, 0.45824902], - [ 0.96337949, 0.6255736 , 0.46511271], - [ 0.96355923, 0.63201624, 0.47204746], - [ 0.96372785, 0.63843852, 0.47905028], - [ 0.96388426, 0.64484214, 0.4861196 ], - [ 0.96403203, 0.65122535, 0.4932578 ], - [ 0.96417332, 0.65758729, 0.50046894], - [ 0.9643063 , 0.66393045, 0.5077467 ], - [ 0.96443322, 0.67025402, 0.51509334], - [ 0.96455845, 0.67655564, 0.52251447], - [ 0.96467922, 0.68283846, 0.53000231], - [ 0.96479861, 0.68910113, 0.53756026], - [ 0.96492035, 0.69534192, 0.5451917 ], - [ 0.96504223, 0.7015636 , 0.5528892 ], - [ 0.96516917, 0.70776351, 0.5606593 ], - [ 0.96530224, 0.71394212, 0.56849894], - [ 0.96544032, 0.72010124, 0.57640375], - [ 0.96559206, 0.72623592, 0.58438387], - [ 0.96575293, 0.73235058, 0.59242739], - [ 0.96592829, 0.73844258, 0.60053991], - [ 0.96612013, 0.74451182, 0.60871954], - [ 0.96632832, 0.75055966, 0.61696136], - [ 0.96656022, 0.75658231, 0.62527295], - [ 0.96681185, 0.76258381, 0.63364277], - [ 0.96709183, 0.76855969, 0.64207921], - [ 0.96739773, 0.77451297, 0.65057302], - [ 0.96773482, 0.78044149, 0.65912731], - [ 0.96810471, 0.78634563, 0.66773889], - [ 0.96850919, 0.79222565, 0.6764046 ], - [ 0.96893132, 0.79809112, 0.68512266], - [ 0.96935926, 0.80395415, 0.69383201], - [ 0.9698028 , 0.80981139, 0.70252255], - [ 0.97025511, 0.81566605, 0.71120296], - [ 0.97071849, 0.82151775, 0.71987163], - [ 0.97120159, 0.82736371, 0.72851999], - [ 0.97169389, 0.83320847, 0.73716071], - [ 0.97220061, 0.83905052, 0.74578903], - [ 0.97272597, 0.84488881, 0.75440141], - [ 0.97327085, 0.85072354, 0.76299805], - [ 0.97383206, 0.85655639, 0.77158353], - [ 0.97441222, 0.86238689, 0.78015619], - [ 0.97501782, 0.86821321, 0.78871034], - [ 0.97564391, 0.87403763, 0.79725261], - [ 0.97628674, 0.87986189, 0.8057883 ], - [ 0.97696114, 0.88568129, 0.81430324], - [ 0.97765722, 0.89149971, 0.82280948], - [ 0.97837585, 0.89731727, 0.83130786], - [ 0.97912374, 0.90313207, 0.83979337], - [ 0.979891 , 0.90894778, 0.84827858], - [ 0.98067764, 0.91476465, 0.85676611], - [ 0.98137749, 0.92061729, 0.86536915] -] - - -_mako_lut = [ - [ 0.04503935, 0.01482344, 0.02092227], - [ 0.04933018, 0.01709292, 0.02535719], - [ 0.05356262, 0.01950702, 0.03018802], - [ 0.05774337, 0.02205989, 0.03545515], - [ 0.06188095, 0.02474764, 0.04115287], - [ 0.06598247, 0.0275665 , 0.04691409], - [ 0.07005374, 0.03051278, 0.05264306], - [ 0.07409947, 0.03358324, 0.05834631], - [ 0.07812339, 0.03677446, 0.06403249], - [ 0.08212852, 0.0400833 , 0.06970862], - [ 0.08611731, 0.04339148, 0.07538208], - [ 0.09009161, 0.04664706, 0.08105568], - [ 
0.09405308, 0.04985685, 0.08673591], - [ 0.09800301, 0.05302279, 0.09242646], - [ 0.10194255, 0.05614641, 0.09813162], - [ 0.10587261, 0.05922941, 0.103854 ], - [ 0.1097942 , 0.06227277, 0.10959847], - [ 0.11370826, 0.06527747, 0.11536893], - [ 0.11761516, 0.06824548, 0.12116393], - [ 0.12151575, 0.07117741, 0.12698763], - [ 0.12541095, 0.07407363, 0.1328442 ], - [ 0.12930083, 0.07693611, 0.13873064], - [ 0.13317849, 0.07976988, 0.14465095], - [ 0.13701138, 0.08259683, 0.15060265], - [ 0.14079223, 0.08542126, 0.15659379], - [ 0.14452486, 0.08824175, 0.16262484], - [ 0.14820351, 0.09106304, 0.16869476], - [ 0.15183185, 0.09388372, 0.17480366], - [ 0.15540398, 0.09670855, 0.18094993], - [ 0.15892417, 0.09953561, 0.18713384], - [ 0.16238588, 0.10236998, 0.19335329], - [ 0.16579435, 0.10520905, 0.19960847], - [ 0.16914226, 0.10805832, 0.20589698], - [ 0.17243586, 0.11091443, 0.21221911], - [ 0.17566717, 0.11378321, 0.21857219], - [ 0.17884322, 0.11666074, 0.2249565 ], - [ 0.18195582, 0.11955283, 0.23136943], - [ 0.18501213, 0.12245547, 0.23781116], - [ 0.18800459, 0.12537395, 0.24427914], - [ 0.19093944, 0.1283047 , 0.25077369], - [ 0.19381092, 0.13125179, 0.25729255], - [ 0.19662307, 0.13421303, 0.26383543], - [ 0.19937337, 0.13719028, 0.27040111], - [ 0.20206187, 0.14018372, 0.27698891], - [ 0.20469116, 0.14319196, 0.28359861], - [ 0.20725547, 0.14621882, 0.29022775], - [ 0.20976258, 0.14925954, 0.29687795], - [ 0.21220409, 0.15231929, 0.30354703], - [ 0.21458611, 0.15539445, 0.31023563], - [ 0.21690827, 0.15848519, 0.31694355], - [ 0.21916481, 0.16159489, 0.32366939], - [ 0.2213631 , 0.16471913, 0.33041431], - [ 0.22349947, 0.1678599 , 0.33717781], - [ 0.2255714 , 0.1710185 , 0.34395925], - [ 0.22758415, 0.17419169, 0.35075983], - [ 0.22953569, 0.17738041, 0.35757941], - [ 0.23142077, 0.18058733, 0.3644173 ], - [ 0.2332454 , 0.18380872, 0.37127514], - [ 0.2350092 , 0.18704459, 0.3781528 ], - [ 0.23670785, 0.190297 , 0.38504973], - [ 0.23834119, 0.19356547, 0.39196711], - [ 0.23991189, 0.19684817, 0.39890581], - [ 0.24141903, 0.20014508, 0.4058667 ], - [ 0.24286214, 0.20345642, 0.4128484 ], - [ 0.24423453, 0.20678459, 0.41985299], - [ 0.24554109, 0.21012669, 0.42688124], - [ 0.2467815 , 0.21348266, 0.43393244], - [ 0.24795393, 0.21685249, 0.4410088 ], - [ 0.24905614, 0.22023618, 0.448113 ], - [ 0.25007383, 0.22365053, 0.45519562], - [ 0.25098926, 0.22710664, 0.46223892], - [ 0.25179696, 0.23060342, 0.46925447], - [ 0.25249346, 0.23414353, 0.47623196], - [ 0.25307401, 0.23772973, 0.48316271], - [ 0.25353152, 0.24136961, 0.49001976], - [ 0.25386167, 0.24506548, 0.49679407], - [ 0.25406082, 0.2488164 , 0.50348932], - [ 0.25412435, 0.25262843, 0.51007843], - [ 0.25404842, 0.25650743, 0.51653282], - [ 0.25383134, 0.26044852, 0.52286845], - [ 0.2534705 , 0.26446165, 0.52903422], - [ 0.25296722, 0.2685428 , 0.53503572], - [ 0.2523226 , 0.27269346, 0.54085315], - [ 0.25153974, 0.27691629, 0.54645752], - [ 0.25062402, 0.28120467, 0.55185939], - [ 0.24958205, 0.28556371, 0.55701246], - [ 0.24842386, 0.28998148, 0.56194601], - [ 0.24715928, 0.29446327, 0.56660884], - [ 0.24580099, 0.29899398, 0.57104399], - [ 0.24436202, 0.30357852, 0.57519929], - [ 0.24285591, 0.30819938, 0.57913247], - [ 0.24129828, 0.31286235, 0.58278615], - [ 0.23970131, 0.3175495 , 0.5862272 ], - [ 0.23807973, 0.32226344, 0.58941872], - [ 0.23644557, 0.32699241, 0.59240198], - [ 0.2348113 , 0.33173196, 0.59518282], - [ 0.23318874, 0.33648036, 0.59775543], - [ 0.2315855 , 0.34122763, 0.60016456], - [ 0.23001121, 0.34597357, 
0.60240251], - [ 0.2284748 , 0.35071512, 0.6044784 ], - [ 0.22698081, 0.35544612, 0.60642528], - [ 0.22553305, 0.36016515, 0.60825252], - [ 0.22413977, 0.36487341, 0.60994938], - [ 0.22280246, 0.36956728, 0.61154118], - [ 0.22152555, 0.37424409, 0.61304472], - [ 0.22030752, 0.37890437, 0.61446646], - [ 0.2191538 , 0.38354668, 0.61581561], - [ 0.21806257, 0.38817169, 0.61709794], - [ 0.21703799, 0.39277882, 0.61831922], - [ 0.21607792, 0.39736958, 0.61948028], - [ 0.21518463, 0.40194196, 0.62059763], - [ 0.21435467, 0.40649717, 0.62167507], - [ 0.21358663, 0.41103579, 0.62271724], - [ 0.21288172, 0.41555771, 0.62373011], - [ 0.21223835, 0.42006355, 0.62471794], - [ 0.21165312, 0.42455441, 0.62568371], - [ 0.21112526, 0.42903064, 0.6266318 ], - [ 0.21065161, 0.43349321, 0.62756504], - [ 0.21023306, 0.43794288, 0.62848279], - [ 0.20985996, 0.44238227, 0.62938329], - [ 0.20951045, 0.44680966, 0.63030696], - [ 0.20916709, 0.45122981, 0.63124483], - [ 0.20882976, 0.45564335, 0.63219599], - [ 0.20849798, 0.46005094, 0.63315928], - [ 0.20817199, 0.46445309, 0.63413391], - [ 0.20785149, 0.46885041, 0.63511876], - [ 0.20753716, 0.47324327, 0.63611321], - [ 0.20722876, 0.47763224, 0.63711608], - [ 0.20692679, 0.48201774, 0.63812656], - [ 0.20663156, 0.48640018, 0.63914367], - [ 0.20634336, 0.49078002, 0.64016638], - [ 0.20606303, 0.49515755, 0.6411939 ], - [ 0.20578999, 0.49953341, 0.64222457], - [ 0.20552612, 0.50390766, 0.64325811], - [ 0.20527189, 0.50828072, 0.64429331], - [ 0.20502868, 0.51265277, 0.64532947], - [ 0.20479718, 0.51702417, 0.64636539], - [ 0.20457804, 0.52139527, 0.64739979], - [ 0.20437304, 0.52576622, 0.64843198], - [ 0.20418396, 0.53013715, 0.64946117], - [ 0.20401238, 0.53450825, 0.65048638], - [ 0.20385896, 0.53887991, 0.65150606], - [ 0.20372653, 0.54325208, 0.65251978], - [ 0.20361709, 0.5476249 , 0.6535266 ], - [ 0.20353258, 0.55199854, 0.65452542], - [ 0.20347472, 0.55637318, 0.655515 ], - [ 0.20344718, 0.56074869, 0.65649508], - [ 0.20345161, 0.56512531, 0.65746419], - [ 0.20349089, 0.56950304, 0.65842151], - [ 0.20356842, 0.57388184, 0.65936642], - [ 0.20368663, 0.57826181, 0.66029768], - [ 0.20384884, 0.58264293, 0.6612145 ], - [ 0.20405904, 0.58702506, 0.66211645], - [ 0.20431921, 0.59140842, 0.66300179], - [ 0.20463464, 0.59579264, 0.66387079], - [ 0.20500731, 0.60017798, 0.66472159], - [ 0.20544449, 0.60456387, 0.66555409], - [ 0.20596097, 0.60894927, 0.66636568], - [ 0.20654832, 0.61333521, 0.66715744], - [ 0.20721003, 0.61772167, 0.66792838], - [ 0.20795035, 0.62210845, 0.66867802], - [ 0.20877302, 0.62649546, 0.66940555], - [ 0.20968223, 0.63088252, 0.6701105 ], - [ 0.21068163, 0.63526951, 0.67079211], - [ 0.21177544, 0.63965621, 0.67145005], - [ 0.21298582, 0.64404072, 0.67208182], - [ 0.21430361, 0.64842404, 0.67268861], - [ 0.21572716, 0.65280655, 0.67326978], - [ 0.21726052, 0.65718791, 0.6738255 ], - [ 0.21890636, 0.66156803, 0.67435491], - [ 0.220668 , 0.66594665, 0.67485792], - [ 0.22255447, 0.67032297, 0.67533374], - [ 0.22458372, 0.67469531, 0.67578061], - [ 0.22673713, 0.67906542, 0.67620044], - [ 0.22901625, 0.6834332 , 0.67659251], - [ 0.23142316, 0.68779836, 0.67695703], - [ 0.23395924, 0.69216072, 0.67729378], - [ 0.23663857, 0.69651881, 0.67760151], - [ 0.23946645, 0.70087194, 0.67788018], - [ 0.24242624, 0.70522162, 0.67813088], - [ 0.24549008, 0.70957083, 0.67835215], - [ 0.24863372, 0.71392166, 0.67854868], - [ 0.25187832, 0.71827158, 0.67872193], - [ 0.25524083, 0.72261873, 0.67887024], - [ 0.25870947, 0.72696469, 0.67898912], - [ 0.26229238, 
0.73130855, 0.67907645], - [ 0.26604085, 0.73564353, 0.67914062], - [ 0.26993099, 0.73997282, 0.67917264], - [ 0.27397488, 0.74429484, 0.67917096], - [ 0.27822463, 0.74860229, 0.67914468], - [ 0.28264201, 0.75290034, 0.67907959], - [ 0.2873016 , 0.75717817, 0.67899164], - [ 0.29215894, 0.76144162, 0.67886578], - [ 0.29729823, 0.76567816, 0.67871894], - [ 0.30268199, 0.76989232, 0.67853896], - [ 0.30835665, 0.77407636, 0.67833512], - [ 0.31435139, 0.77822478, 0.67811118], - [ 0.3206671 , 0.78233575, 0.67786729], - [ 0.32733158, 0.78640315, 0.67761027], - [ 0.33437168, 0.79042043, 0.67734882], - [ 0.34182112, 0.79437948, 0.67709394], - [ 0.34968889, 0.79827511, 0.67685638], - [ 0.35799244, 0.80210037, 0.67664969], - [ 0.36675371, 0.80584651, 0.67649539], - [ 0.3759816 , 0.80950627, 0.67641393], - [ 0.38566792, 0.81307432, 0.67642947], - [ 0.39579804, 0.81654592, 0.67656899], - [ 0.40634556, 0.81991799, 0.67686215], - [ 0.41730243, 0.82318339, 0.67735255], - [ 0.4285828 , 0.82635051, 0.6780564 ], - [ 0.44012728, 0.82942353, 0.67900049], - [ 0.45189421, 0.83240398, 0.68021733], - [ 0.46378379, 0.83530763, 0.6817062 ], - [ 0.47573199, 0.83814472, 0.68347352], - [ 0.48769865, 0.84092197, 0.68552698], - [ 0.49962354, 0.84365379, 0.68783929], - [ 0.5114027 , 0.8463718 , 0.69029789], - [ 0.52301693, 0.84908401, 0.69288545], - [ 0.53447549, 0.85179048, 0.69561066], - [ 0.54578602, 0.8544913 , 0.69848331], - [ 0.55695565, 0.85718723, 0.70150427], - [ 0.56798832, 0.85987893, 0.70468261], - [ 0.57888639, 0.86256715, 0.70802931], - [ 0.5896541 , 0.8652532 , 0.71154204], - [ 0.60028928, 0.86793835, 0.71523675], - [ 0.61079441, 0.87062438, 0.71910895], - [ 0.62116633, 0.87331311, 0.72317003], - [ 0.63140509, 0.87600675, 0.72741689], - [ 0.64150735, 0.87870746, 0.73185717], - [ 0.65147219, 0.8814179 , 0.73648495], - [ 0.66129632, 0.8841403 , 0.74130658], - [ 0.67097934, 0.88687758, 0.74631123], - [ 0.68051833, 0.88963189, 0.75150483], - [ 0.68991419, 0.89240612, 0.75687187], - [ 0.69916533, 0.89520211, 0.76241714], - [ 0.70827373, 0.89802257, 0.76812286], - [ 0.71723995, 0.90086891, 0.77399039], - [ 0.72606665, 0.90374337, 0.7800041 ], - [ 0.73475675, 0.90664718, 0.78615802], - [ 0.74331358, 0.90958151, 0.79244474], - [ 0.75174143, 0.91254787, 0.79884925], - [ 0.76004473, 0.91554656, 0.80536823], - [ 0.76827704, 0.91856549, 0.81196513], - [ 0.77647029, 0.921603 , 0.81855729], - [ 0.78462009, 0.92466151, 0.82514119], - [ 0.79273542, 0.92773848, 0.83172131], - [ 0.8008109 , 0.93083672, 0.83829355], - [ 0.80885107, 0.93395528, 0.84485982], - [ 0.81685878, 0.9370938 , 0.85142101], - [ 0.82483206, 0.94025378, 0.8579751 ], - [ 0.83277661, 0.94343371, 0.86452477], - [ 0.84069127, 0.94663473, 0.87106853], - [ 0.84857662, 0.9498573 , 0.8776059 ], - [ 0.8564431 , 0.95309792, 0.88414253], - [ 0.86429066, 0.95635719, 0.89067759], - [ 0.87218969, 0.95960708, 0.89725384] -] - - -_vlag_lut = [ - [ 0.13850039, 0.41331206, 0.74052025], - [ 0.15077609, 0.41762684, 0.73970427], - [ 0.16235219, 0.4219191 , 0.7389667 ], - [ 0.1733322 , 0.42619024, 0.73832537], - [ 0.18382538, 0.43044226, 0.73776764], - [ 0.19394034, 0.4346772 , 0.73725867], - [ 0.20367115, 0.43889576, 0.73685314], - [ 0.21313625, 0.44310003, 0.73648045], - [ 0.22231173, 0.44729079, 0.73619681], - [ 0.23125148, 0.45146945, 0.73597803], - [ 0.23998101, 0.45563715, 0.7358223 ], - [ 0.24853358, 0.45979489, 0.73571524], - [ 0.25691416, 0.4639437 , 0.73566943], - [ 0.26513894, 0.46808455, 0.73568319], - [ 0.27322194, 0.47221835, 0.73575497], - [ 0.28117543, 
0.47634598, 0.73588332], - [ 0.28901021, 0.48046826, 0.73606686], - [ 0.2967358 , 0.48458597, 0.73630433], - [ 0.30436071, 0.48869986, 0.73659451], - [ 0.3118955 , 0.49281055, 0.73693255], - [ 0.31935389, 0.49691847, 0.73730851], - [ 0.32672701, 0.5010247 , 0.73774013], - [ 0.33402607, 0.50512971, 0.73821941], - [ 0.34125337, 0.50923419, 0.73874905], - [ 0.34840921, 0.51333892, 0.73933402], - [ 0.35551826, 0.51744353, 0.73994642], - [ 0.3625676 , 0.52154929, 0.74060763], - [ 0.36956356, 0.52565656, 0.74131327], - [ 0.37649902, 0.52976642, 0.74207698], - [ 0.38340273, 0.53387791, 0.74286286], - [ 0.39025859, 0.53799253, 0.7436962 ], - [ 0.39706821, 0.54211081, 0.744578 ], - [ 0.40384046, 0.54623277, 0.74549872], - [ 0.41058241, 0.55035849, 0.74645094], - [ 0.41728385, 0.55448919, 0.74745174], - [ 0.42395178, 0.55862494, 0.74849357], - [ 0.4305964 , 0.56276546, 0.74956387], - [ 0.4372044 , 0.56691228, 0.75068412], - [ 0.4437909 , 0.57106468, 0.75183427], - [ 0.45035117, 0.5752235 , 0.75302312], - [ 0.45687824, 0.57938983, 0.75426297], - [ 0.46339713, 0.58356191, 0.75551816], - [ 0.46988778, 0.58774195, 0.75682037], - [ 0.47635605, 0.59192986, 0.75816245], - [ 0.48281101, 0.5961252 , 0.75953212], - [ 0.4892374 , 0.60032986, 0.76095418], - [ 0.49566225, 0.60454154, 0.76238852], - [ 0.50206137, 0.60876307, 0.76387371], - [ 0.50845128, 0.61299312, 0.76538551], - [ 0.5148258 , 0.61723272, 0.76693475], - [ 0.52118385, 0.62148236, 0.76852436], - [ 0.52753571, 0.62574126, 0.77013939], - [ 0.53386831, 0.63001125, 0.77180152], - [ 0.54020159, 0.63429038, 0.7734803 ], - [ 0.54651272, 0.63858165, 0.77521306], - [ 0.55282975, 0.64288207, 0.77695608], - [ 0.55912585, 0.64719519, 0.77875327], - [ 0.56542599, 0.65151828, 0.78056551], - [ 0.57170924, 0.65585426, 0.78242747], - [ 0.57799572, 0.6602009 , 0.78430751], - [ 0.58426817, 0.66456073, 0.78623458], - [ 0.590544 , 0.66893178, 0.78818117], - [ 0.59680758, 0.67331643, 0.79017369], - [ 0.60307553, 0.67771273, 0.79218572], - [ 0.60934065, 0.68212194, 0.79422987], - [ 0.61559495, 0.68654548, 0.7963202 ], - [ 0.62185554, 0.69098125, 0.79842918], - [ 0.62810662, 0.69543176, 0.80058381], - [ 0.63436425, 0.69989499, 0.80275812], - [ 0.64061445, 0.70437326, 0.80497621], - [ 0.6468706 , 0.70886488, 0.80721641], - [ 0.65312213, 0.7133717 , 0.80949719], - [ 0.65937818, 0.71789261, 0.81180392], - [ 0.66563334, 0.72242871, 0.81414642], - [ 0.67189155, 0.72697967, 0.81651872], - [ 0.67815314, 0.73154569, 0.81892097], - [ 0.68441395, 0.73612771, 0.82136094], - [ 0.69068321, 0.74072452, 0.82382353], - [ 0.69694776, 0.7453385 , 0.82633199], - [ 0.70322431, 0.74996721, 0.8288583 ], - [ 0.70949595, 0.75461368, 0.83143221], - [ 0.7157774 , 0.75927574, 0.83402904], - [ 0.72206299, 0.76395461, 0.83665922], - [ 0.72835227, 0.76865061, 0.8393242 ], - [ 0.73465238, 0.7733628 , 0.84201224], - [ 0.74094862, 0.77809393, 0.84474951], - [ 0.74725683, 0.78284158, 0.84750915], - [ 0.75357103, 0.78760701, 0.85030217], - [ 0.75988961, 0.79239077, 0.85313207], - [ 0.76621987, 0.79719185, 0.85598668], - [ 0.77255045, 0.8020125 , 0.85888658], - [ 0.77889241, 0.80685102, 0.86181298], - [ 0.78524572, 0.81170768, 0.86476656], - [ 0.79159841, 0.81658489, 0.86776906], - [ 0.79796459, 0.82148036, 0.8707962 ], - [ 0.80434168, 0.82639479, 0.87385315], - [ 0.8107221 , 0.83132983, 0.87695392], - [ 0.81711301, 0.8362844 , 0.88008641], - [ 0.82351479, 0.84125863, 0.88325045], - [ 0.82992772, 0.84625263, 0.88644594], - [ 0.83634359, 0.85126806, 0.8896878 ], - [ 0.84277295, 0.85630293, 0.89295721], - [ 
0.84921192, 0.86135782, 0.89626076], - [ 0.85566206, 0.866432 , 0.89959467], - [ 0.86211514, 0.87152627, 0.90297183], - [ 0.86857483, 0.87663856, 0.90638248], - [ 0.87504231, 0.88176648, 0.90981938], - [ 0.88151194, 0.88690782, 0.91328493], - [ 0.88797938, 0.89205857, 0.91677544], - [ 0.89443865, 0.89721298, 0.9202854 ], - [ 0.90088204, 0.90236294, 0.92380601], - [ 0.90729768, 0.90749778, 0.92732797], - [ 0.91367037, 0.91260329, 0.93083814], - [ 0.91998105, 0.91766106, 0.93431861], - [ 0.92620596, 0.92264789, 0.93774647], - [ 0.93231683, 0.9275351 , 0.94109192], - [ 0.93827772, 0.9322888 , 0.94432312], - [ 0.94404755, 0.93686925, 0.94740137], - [ 0.94958284, 0.94123072, 0.95027696], - [ 0.95482682, 0.9453245 , 0.95291103], - [ 0.9597248 , 0.94909728, 0.95525103], - [ 0.96422552, 0.95249273, 0.95723271], - [ 0.96826161, 0.95545812, 0.95882188], - [ 0.97178458, 0.95793984, 0.95995705], - [ 0.97474105, 0.95989142, 0.96059997], - [ 0.97708604, 0.96127366, 0.96071853], - [ 0.97877855, 0.96205832, 0.96030095], - [ 0.97978484, 0.96222949, 0.95935496], - [ 0.9805997 , 0.96155216, 0.95813083], - [ 0.98152619, 0.95993719, 0.95639322], - [ 0.9819726 , 0.95766608, 0.95399269], - [ 0.98191855, 0.9547873 , 0.95098107], - [ 0.98138514, 0.95134771, 0.94740644], - [ 0.98040845, 0.94739906, 0.94332125], - [ 0.97902107, 0.94300131, 0.93878672], - [ 0.97729348, 0.93820409, 0.93385135], - [ 0.9752533 , 0.933073 , 0.92858252], - [ 0.97297834, 0.92765261, 0.92302309], - [ 0.97049104, 0.92200317, 0.91723505], - [ 0.96784372, 0.91616744, 0.91126063], - [ 0.96507281, 0.91018664, 0.90514124], - [ 0.96222034, 0.90409203, 0.89890756], - [ 0.9593079 , 0.89791478, 0.89259122], - [ 0.95635626, 0.89167908, 0.88621654], - [ 0.95338303, 0.88540373, 0.87980238], - [ 0.95040174, 0.87910333, 0.87336339], - [ 0.94742246, 0.87278899, 0.86691076], - [ 0.94445249, 0.86646893, 0.86045277], - [ 0.94150476, 0.86014606, 0.85399191], - [ 0.93857394, 0.85382798, 0.84753642], - [ 0.93566206, 0.84751766, 0.84108935], - [ 0.93277194, 0.8412164 , 0.83465197], - [ 0.92990106, 0.83492672, 0.82822708], - [ 0.92704736, 0.82865028, 0.82181656], - [ 0.92422703, 0.82238092, 0.81541333], - [ 0.92142581, 0.81612448, 0.80902415], - [ 0.91864501, 0.80988032, 0.80264838], - [ 0.91587578, 0.80365187, 0.79629001], - [ 0.9131367 , 0.79743115, 0.78994 ], - [ 0.91041602, 0.79122265, 0.78360361], - [ 0.90771071, 0.78502727, 0.77728196], - [ 0.90501581, 0.77884674, 0.7709771 ], - [ 0.90235365, 0.77267117, 0.76467793], - [ 0.8997019 , 0.76650962, 0.75839484], - [ 0.89705346, 0.76036481, 0.752131 ], - [ 0.89444021, 0.75422253, 0.74587047], - [ 0.89183355, 0.74809474, 0.73962689], - [ 0.88923216, 0.74198168, 0.73340061], - [ 0.88665892, 0.73587283, 0.72717995], - [ 0.88408839, 0.72977904, 0.72097718], - [ 0.88153537, 0.72369332, 0.71478461], - [ 0.87899389, 0.7176179 , 0.70860487], - [ 0.87645157, 0.71155805, 0.7024439 ], - [ 0.8739399 , 0.70549893, 0.6962854 ], - [ 0.87142626, 0.6994551 , 0.69014561], - [ 0.8689268 , 0.69341868, 0.68401597], - [ 0.86643562, 0.687392 , 0.67789917], - [ 0.86394434, 0.68137863, 0.67179927], - [ 0.86147586, 0.67536728, 0.665704 ], - [ 0.85899928, 0.66937226, 0.6596292 ], - [ 0.85654668, 0.66337773, 0.6535577 ], - [ 0.85408818, 0.65739772, 0.64750494], - [ 0.85164413, 0.65142189, 0.64145983], - [ 0.84920091, 0.6454565 , 0.63542932], - [ 0.84676427, 0.63949827, 0.62941 ], - [ 0.84433231, 0.63354773, 0.62340261], - [ 0.84190106, 0.62760645, 0.61740899], - [ 0.83947935, 0.62166951, 0.61142404], - [ 0.8370538 , 0.61574332, 0.60545478], 
- [ 0.83463975, 0.60981951, 0.59949247], - [ 0.83221877, 0.60390724, 0.593547 ], - [ 0.82980985, 0.59799607, 0.58760751], - [ 0.82740268, 0.59209095, 0.58167944], - [ 0.82498638, 0.5861973 , 0.57576866], - [ 0.82258181, 0.5803034 , 0.56986307], - [ 0.82016611, 0.57442123, 0.56397539], - [ 0.81776305, 0.56853725, 0.55809173], - [ 0.81534551, 0.56266602, 0.55222741], - [ 0.81294293, 0.55679056, 0.5463651 ], - [ 0.81052113, 0.55092973, 0.54052443], - [ 0.80811509, 0.54506305, 0.53468464], - [ 0.80568952, 0.53921036, 0.52886622], - [ 0.80327506, 0.53335335, 0.52305077], - [ 0.80084727, 0.52750583, 0.51725256], - [ 0.79842217, 0.5216578 , 0.51146173], - [ 0.79599382, 0.51581223, 0.50568155], - [ 0.79355781, 0.50997127, 0.49991444], - [ 0.79112596, 0.50412707, 0.49415289], - [ 0.78867442, 0.49829386, 0.48841129], - [ 0.7862306 , 0.49245398, 0.48267247], - [ 0.7837687 , 0.48662309, 0.47695216], - [ 0.78130809, 0.4807883 , 0.47123805], - [ 0.77884467, 0.47495151, 0.46553236], - [ 0.77636283, 0.46912235, 0.45984473], - [ 0.77388383, 0.46328617, 0.45416141], - [ 0.77138912, 0.45745466, 0.44849398], - [ 0.76888874, 0.45162042, 0.44283573], - [ 0.76638802, 0.44577901, 0.43718292], - [ 0.76386116, 0.43994762, 0.43155211], - [ 0.76133542, 0.43410655, 0.42592523], - [ 0.75880631, 0.42825801, 0.42030488], - [ 0.75624913, 0.42241905, 0.41470727], - [ 0.7536919 , 0.41656866, 0.40911347], - [ 0.75112748, 0.41071104, 0.40352792], - [ 0.74854331, 0.40485474, 0.3979589 ], - [ 0.74594723, 0.39899309, 0.39240088], - [ 0.74334332, 0.39312199, 0.38685075], - [ 0.74073277, 0.38723941, 0.3813074 ], - [ 0.73809409, 0.38136133, 0.37578553], - [ 0.73544692, 0.37547129, 0.37027123], - [ 0.73278943, 0.36956954, 0.36476549], - [ 0.73011829, 0.36365761, 0.35927038], - [ 0.72743485, 0.35773314, 0.35378465], - [ 0.72472722, 0.35180504, 0.34831662], - [ 0.72200473, 0.34586421, 0.34285937], - [ 0.71927052, 0.33990649, 0.33741033], - [ 0.71652049, 0.33393396, 0.33197219], - [ 0.71375362, 0.32794602, 0.32654545], - [ 0.71096951, 0.32194148, 0.32113016], - [ 0.70816772, 0.31591904, 0.31572637], - [ 0.70534784, 0.30987734, 0.31033414], - [ 0.70250944, 0.30381489, 0.30495353], - [ 0.69965211, 0.2977301 , 0.2995846 ], - [ 0.6967754 , 0.29162126, 0.29422741], - [ 0.69388446, 0.28548074, 0.28887769], - [ 0.69097561, 0.2793096 , 0.28353795], - [ 0.68803513, 0.27311993, 0.27821876], - [ 0.6850794 , 0.26689144, 0.27290694], - [ 0.682108 , 0.26062114, 0.26760246], - [ 0.67911013, 0.2543177 , 0.26231367], - [ 0.67609393, 0.24796818, 0.25703372], - [ 0.67305921, 0.24156846, 0.25176238], - [ 0.67000176, 0.23511902, 0.24650278], - [ 0.66693423, 0.22859879, 0.24124404], - [ 0.6638441 , 0.22201742, 0.2359961 ], - [ 0.66080672, 0.21526712, 0.23069468] -] - - -_icefire_lut = [ - [ 0.73936227, 0.90443867, 0.85757238], - [ 0.72888063, 0.89639109, 0.85488394], - [ 0.71834255, 0.88842162, 0.8521605 ], - [ 0.70773866, 0.88052939, 0.849422 ], - [ 0.69706215, 0.87271313, 0.84668315], - [ 0.68629021, 0.86497329, 0.84398721], - [ 0.67543654, 0.85730617, 0.84130969], - [ 0.66448539, 0.84971123, 0.83868005], - [ 0.65342679, 0.84218728, 0.83611512], - [ 0.64231804, 0.83471867, 0.83358584], - [ 0.63117745, 0.827294 , 0.83113431], - [ 0.62000484, 0.81991069, 0.82876741], - [ 0.60879435, 0.81256797, 0.82648905], - [ 0.59754118, 0.80526458, 0.82430414], - [ 0.58624247, 0.79799884, 0.82221573], - [ 0.57489525, 0.7907688 , 0.82022901], - [ 0.56349779, 0.78357215, 0.81834861], - [ 0.55204294, 0.77640827, 0.81657563], - [ 0.54052516, 0.76927562, 0.81491462], - [ 
0.52894085, 0.76217215, 0.81336913], - [ 0.51728854, 0.75509528, 0.81194156], - [ 0.50555676, 0.74804469, 0.81063503], - [ 0.49373871, 0.7410187 , 0.80945242], - [ 0.48183174, 0.73401449, 0.80839675], - [ 0.46982587, 0.72703075, 0.80747097], - [ 0.45770893, 0.72006648, 0.80667756], - [ 0.44547249, 0.71311941, 0.80601991], - [ 0.43318643, 0.70617126, 0.80549278], - [ 0.42110294, 0.69916972, 0.80506683], - [ 0.40925101, 0.69211059, 0.80473246], - [ 0.3976693 , 0.68498786, 0.80448272], - [ 0.38632002, 0.67781125, 0.80431024], - [ 0.37523981, 0.67057537, 0.80420832], - [ 0.36442578, 0.66328229, 0.80417474], - [ 0.35385939, 0.65593699, 0.80420591], - [ 0.34358916, 0.64853177, 0.8043 ], - [ 0.33355526, 0.64107876, 0.80445484], - [ 0.32383062, 0.63356578, 0.80467091], - [ 0.31434372, 0.62600624, 0.8049475 ], - [ 0.30516161, 0.618389 , 0.80528692], - [ 0.29623491, 0.61072284, 0.80569021], - [ 0.28759072, 0.60300319, 0.80616055], - [ 0.27923924, 0.59522877, 0.80669803], - [ 0.27114651, 0.5874047 , 0.80730545], - [ 0.26337153, 0.57952055, 0.80799113], - [ 0.25588696, 0.57157984, 0.80875922], - [ 0.248686 , 0.56358255, 0.80961366], - [ 0.24180668, 0.55552289, 0.81055123], - [ 0.23526251, 0.54739477, 0.8115939 ], - [ 0.22921445, 0.53918506, 0.81267292], - [ 0.22397687, 0.53086094, 0.8137141 ], - [ 0.21977058, 0.52241482, 0.81457651], - [ 0.21658989, 0.51384321, 0.81528511], - [ 0.21452772, 0.50514155, 0.81577278], - [ 0.21372783, 0.49630865, 0.81589566], - [ 0.21409503, 0.48734861, 0.81566163], - [ 0.2157176 , 0.47827123, 0.81487615], - [ 0.21842857, 0.46909168, 0.81351614], - [ 0.22211705, 0.45983212, 0.81146983], - [ 0.22665681, 0.45052233, 0.80860217], - [ 0.23176013, 0.44119137, 0.80494325], - [ 0.23727775, 0.43187704, 0.80038017], - [ 0.24298285, 0.42261123, 0.79493267], - [ 0.24865068, 0.41341842, 0.78869164], - [ 0.25423116, 0.40433127, 0.78155831], - [ 0.25950239, 0.39535521, 0.77376848], - [ 0.2644736 , 0.38651212, 0.76524809], - [ 0.26901584, 0.37779582, 0.75621942], - [ 0.27318141, 0.36922056, 0.746605 ], - [ 0.27690355, 0.3607736 , 0.73659374], - [ 0.28023585, 0.35244234, 0.72622103], - [ 0.28306009, 0.34438449, 0.71500731], - [ 0.28535896, 0.33660243, 0.70303975], - [ 0.28708711, 0.32912157, 0.69034504], - [ 0.28816354, 0.32200604, 0.67684067], - [ 0.28862749, 0.31519824, 0.66278813], - [ 0.28847904, 0.30869064, 0.6482815 ], - [ 0.28770912, 0.30250126, 0.63331265], - [ 0.28640325, 0.29655509, 0.61811374], - [ 0.28458943, 0.29082155, 0.60280913], - [ 0.28233561, 0.28527482, 0.58742866], - [ 0.27967038, 0.2798938 , 0.57204225], - [ 0.27665361, 0.27465357, 0.55667809], - [ 0.27332564, 0.2695165 , 0.54145387], - [ 0.26973851, 0.26447054, 0.52634916], - [ 0.2659204 , 0.25949691, 0.511417 ], - [ 0.26190145, 0.25458123, 0.49668768], - [ 0.2577151 , 0.24971691, 0.48214874], - [ 0.25337618, 0.24490494, 0.46778758], - [ 0.24890842, 0.24013332, 0.45363816], - [ 0.24433654, 0.23539226, 0.4397245 ], - [ 0.23967922, 0.23067729, 0.4260591 ], - [ 0.23495608, 0.22598894, 0.41262952], - [ 0.23018113, 0.22132414, 0.39945577], - [ 0.22534609, 0.21670847, 0.38645794], - [ 0.22048761, 0.21211723, 0.37372555], - [ 0.2156198 , 0.20755389, 0.36125301], - [ 0.21074637, 0.20302717, 0.34903192], - [ 0.20586893, 0.19855368, 0.33701661], - [ 0.20101757, 0.19411573, 0.32529173], - [ 0.19619947, 0.18972425, 0.31383846], - [ 0.19140726, 0.18540157, 0.30260777], - [ 0.1866769 , 0.1811332 , 0.29166583], - [ 0.18201285, 0.17694992, 0.28088776], - [ 0.17745228, 0.17282141, 0.27044211], - [ 0.17300684, 0.16876921, 
0.26024893], - [ 0.16868273, 0.16479861, 0.25034479], - [ 0.16448691, 0.16091728, 0.24075373], - [ 0.16043195, 0.15714351, 0.23141745], - [ 0.15652427, 0.15348248, 0.22238175], - [ 0.15277065, 0.14994111, 0.21368395], - [ 0.14918274, 0.14653431, 0.20529486], - [ 0.14577095, 0.14327403, 0.19720829], - [ 0.14254381, 0.14016944, 0.18944326], - [ 0.13951035, 0.13723063, 0.18201072], - [ 0.13667798, 0.13446606, 0.17493774], - [ 0.13405762, 0.13188822, 0.16820842], - [ 0.13165767, 0.12950667, 0.16183275], - [ 0.12948748, 0.12733187, 0.15580631], - [ 0.12755435, 0.1253723 , 0.15014098], - [ 0.12586516, 0.12363617, 0.1448459 ], - [ 0.12442647, 0.12213143, 0.13992571], - [ 0.12324241, 0.12086419, 0.13539995], - [ 0.12232067, 0.11984278, 0.13124644], - [ 0.12166209, 0.11907077, 0.12749671], - [ 0.12126982, 0.11855309, 0.12415079], - [ 0.12114244, 0.11829179, 0.1212385 ], - [ 0.12127766, 0.11828837, 0.11878534], - [ 0.12284806, 0.1179729 , 0.11772022], - [ 0.12619498, 0.11721796, 0.11770203], - [ 0.129968 , 0.11663788, 0.11792377], - [ 0.13410011, 0.11625146, 0.11839138], - [ 0.13855459, 0.11606618, 0.11910584], - [ 0.14333775, 0.11607038, 0.1200606 ], - [ 0.148417 , 0.11626929, 0.12125453], - [ 0.15377389, 0.11666192, 0.12268364], - [ 0.15941427, 0.11723486, 0.12433911], - [ 0.16533376, 0.11797856, 0.12621303], - [ 0.17152547, 0.11888403, 0.12829735], - [ 0.17797765, 0.11994436, 0.13058435], - [ 0.18468769, 0.12114722, 0.13306426], - [ 0.19165663, 0.12247737, 0.13572616], - [ 0.19884415, 0.12394381, 0.1385669 ], - [ 0.20627181, 0.12551883, 0.14157124], - [ 0.21394877, 0.12718055, 0.14472604], - [ 0.22184572, 0.12893119, 0.14802579], - [ 0.22994394, 0.13076731, 0.15146314], - [ 0.23823937, 0.13267611, 0.15502793], - [ 0.24676041, 0.13462172, 0.15870321], - [ 0.25546457, 0.13661751, 0.16248722], - [ 0.26433628, 0.13865956, 0.16637301], - [ 0.27341345, 0.14070412, 0.17034221], - [ 0.28264773, 0.14277192, 0.1743957 ], - [ 0.29202272, 0.14486161, 0.17852793], - [ 0.30159648, 0.14691224, 0.1827169 ], - [ 0.31129002, 0.14897583, 0.18695213], - [ 0.32111555, 0.15103351, 0.19119629], - [ 0.33107961, 0.1530674 , 0.19543758], - [ 0.34119892, 0.15504762, 0.1996803 ], - [ 0.35142388, 0.15701131, 0.20389086], - [ 0.36178937, 0.1589124 , 0.20807639], - [ 0.37229381, 0.16073993, 0.21223189], - [ 0.38288348, 0.16254006, 0.2163249 ], - [ 0.39359592, 0.16426336, 0.22036577], - [ 0.40444332, 0.16588767, 0.22434027], - [ 0.41537995, 0.16745325, 0.2282297 ], - [ 0.42640867, 0.16894939, 0.23202755], - [ 0.43754706, 0.17034847, 0.23572899], - [ 0.44878564, 0.1716535 , 0.23932344], - [ 0.4601126 , 0.17287365, 0.24278607], - [ 0.47151732, 0.17401641, 0.24610337], - [ 0.48300689, 0.17506676, 0.2492737 ], - [ 0.49458302, 0.17601892, 0.25227688], - [ 0.50623876, 0.17687777, 0.255096 ], - [ 0.5179623 , 0.17765528, 0.2577162 ], - [ 0.52975234, 0.17835232, 0.2601134 ], - [ 0.54159776, 0.17898292, 0.26226847], - [ 0.55348804, 0.17956232, 0.26416003], - [ 0.56541729, 0.18010175, 0.26575971], - [ 0.57736669, 0.180631 , 0.26704888], - [ 0.58932081, 0.18117827, 0.26800409], - [ 0.60127582, 0.18175888, 0.26858488], - [ 0.61319563, 0.1824336 , 0.2687872 ], - [ 0.62506376, 0.18324015, 0.26858301], - [ 0.63681202, 0.18430173, 0.26795276], - [ 0.64842603, 0.18565472, 0.26689463], - [ 0.65988195, 0.18734638, 0.26543435], - [ 0.67111966, 0.18948885, 0.26357955], - [ 0.68209194, 0.19216636, 0.26137175], - [ 0.69281185, 0.19535326, 0.25887063], - [ 0.70335022, 0.19891271, 0.25617971], - [ 0.71375229, 0.20276438, 0.25331365], - [ 0.72401436, 
0.20691287, 0.25027366], - [ 0.73407638, 0.21145051, 0.24710661], - [ 0.74396983, 0.21631913, 0.24380715], - [ 0.75361506, 0.22163653, 0.24043996], - [ 0.7630579 , 0.22731637, 0.23700095], - [ 0.77222228, 0.23346231, 0.23356628], - [ 0.78115441, 0.23998404, 0.23013825], - [ 0.78979746, 0.24694858, 0.22678822], - [ 0.79819286, 0.25427223, 0.22352658], - [ 0.80630444, 0.26198807, 0.22040877], - [ 0.81417437, 0.27001406, 0.21744645], - [ 0.82177364, 0.27837336, 0.21468316], - [ 0.82915955, 0.28696963, 0.21210766], - [ 0.83628628, 0.2958499 , 0.20977813], - [ 0.84322168, 0.30491136, 0.20766435], - [ 0.84995458, 0.31415945, 0.2057863 ], - [ 0.85648867, 0.32358058, 0.20415327], - [ 0.86286243, 0.33312058, 0.20274969], - [ 0.86908321, 0.34276705, 0.20157271], - [ 0.87512876, 0.3525416 , 0.20064949], - [ 0.88100349, 0.36243385, 0.19999078], - [ 0.8866469 , 0.37249496, 0.1997976 ], - [ 0.89203964, 0.38273475, 0.20013431], - [ 0.89713496, 0.39318156, 0.20121514], - [ 0.90195099, 0.40380687, 0.20301555], - [ 0.90648379, 0.41460191, 0.20558847], - [ 0.9106967 , 0.42557857, 0.20918529], - [ 0.91463791, 0.43668557, 0.21367954], - [ 0.91830723, 0.44790913, 0.21916352], - [ 0.92171507, 0.45922856, 0.22568002], - [ 0.92491786, 0.4705936 , 0.23308207], - [ 0.92790792, 0.48200153, 0.24145932], - [ 0.93073701, 0.49341219, 0.25065486], - [ 0.93343918, 0.5048017 , 0.26056148], - [ 0.93602064, 0.51616486, 0.27118485], - [ 0.93850535, 0.52748892, 0.28242464], - [ 0.94092933, 0.53875462, 0.29416042], - [ 0.94330011, 0.5499628 , 0.30634189], - [ 0.94563159, 0.56110987, 0.31891624], - [ 0.94792955, 0.57219822, 0.33184256], - [ 0.95020929, 0.5832232 , 0.34508419], - [ 0.95247324, 0.59419035, 0.35859866], - [ 0.95471709, 0.60510869, 0.37236035], - [ 0.95698411, 0.61595766, 0.38629631], - [ 0.95923863, 0.62676473, 0.40043317], - [ 0.9615041 , 0.6375203 , 0.41474106], - [ 0.96371553, 0.64826619, 0.42928335], - [ 0.96591497, 0.65899621, 0.44380444], - [ 0.96809871, 0.66971662, 0.45830232], - [ 0.9702495 , 0.6804394 , 0.47280492], - [ 0.9723881 , 0.69115622, 0.48729272], - [ 0.97450723, 0.70187358, 0.50178034], - [ 0.9766108 , 0.712592 , 0.51626837], - [ 0.97871716, 0.72330511, 0.53074053], - [ 0.98082222, 0.73401769, 0.54520694], - [ 0.9829001 , 0.74474445, 0.5597019 ], - [ 0.98497466, 0.75547635, 0.57420239], - [ 0.98705581, 0.76621129, 0.58870185], - [ 0.98913325, 0.77695637, 0.60321626], - [ 0.99119918, 0.78771716, 0.61775821], - [ 0.9932672 , 0.79848979, 0.63231691], - [ 0.99535958, 0.80926704, 0.64687278], - [ 0.99740544, 0.82008078, 0.66150571], - [ 0.9992197 , 0.83100723, 0.6764127 ] -] - - -_flare_lut = [ - [0.92907237, 0.68878959, 0.50411509], - [0.92891402, 0.68494686, 0.50173994], - [0.92864754, 0.68116207, 0.4993754], - [0.92836112, 0.67738527, 0.49701572], - [0.9280599, 0.67361354, 0.49466044], - [0.92775569, 0.66983999, 0.49230866], - [0.9274375, 0.66607098, 0.48996097], - [0.927111, 0.66230315, 0.48761688], - [0.92677996, 0.6585342, 0.485276], - [0.92644317, 0.65476476, 0.48293832], - [0.92609759, 0.65099658, 0.48060392], - [0.925747, 0.64722729, 0.47827244], - [0.92539502, 0.64345456, 0.47594352], - [0.92503106, 0.6396848, 0.47361782], - [0.92466877, 0.6359095, 0.47129427], - [0.92429828, 0.63213463, 0.46897349], - [0.92392172, 0.62835879, 0.46665526], - [0.92354597, 0.62457749, 0.46433898], - [0.9231622, 0.6207962, 0.46202524], - [0.92277222, 0.61701365, 0.45971384], - [0.92237978, 0.61322733, 0.45740444], - [0.92198615, 0.60943622, 0.45509686], - [0.92158735, 0.60564276, 0.45279137], - [0.92118373, 
0.60184659, 0.45048789], - [0.92077582, 0.59804722, 0.44818634], - [0.92036413, 0.59424414, 0.44588663], - [0.91994924, 0.5904368, 0.44358868], - [0.91952943, 0.58662619, 0.4412926], - [0.91910675, 0.58281075, 0.43899817], - [0.91868096, 0.57899046, 0.4367054], - [0.91825103, 0.57516584, 0.43441436], - [0.91781857, 0.57133556, 0.43212486], - [0.9173814, 0.56750099, 0.4298371], - [0.91694139, 0.56366058, 0.42755089], - [0.91649756, 0.55981483, 0.42526631], - [0.91604942, 0.55596387, 0.42298339], - [0.9155979, 0.55210684, 0.42070204], - [0.9151409, 0.54824485, 0.4184247], - [0.91466138, 0.54438817, 0.41617858], - [0.91416896, 0.54052962, 0.41396347], - [0.91366559, 0.53666778, 0.41177769], - [0.91315173, 0.53280208, 0.40962196], - [0.91262605, 0.52893336, 0.40749715], - [0.91208866, 0.52506133, 0.40540404], - [0.91153952, 0.52118582, 0.40334346], - [0.91097732, 0.51730767, 0.4013163], - [0.910403, 0.51342591, 0.39932342], - [0.90981494, 0.50954168, 0.39736571], - [0.90921368, 0.5056543, 0.39544411], - [0.90859797, 0.50176463, 0.39355952], - [0.90796841, 0.49787195, 0.39171297], - [0.90732341, 0.4939774, 0.38990532], - [0.90666382, 0.49008006, 0.38813773], - [0.90598815, 0.486181, 0.38641107], - [0.90529624, 0.48228017, 0.38472641], - [0.90458808, 0.47837738, 0.38308489], - [0.90386248, 0.47447348, 0.38148746], - [0.90311921, 0.4705685, 0.37993524], - [0.90235809, 0.46666239, 0.37842943], - [0.90157824, 0.46275577, 0.37697105], - [0.90077904, 0.45884905, 0.37556121], - [0.89995995, 0.45494253, 0.37420106], - [0.89912041, 0.4510366, 0.37289175], - [0.8982602, 0.44713126, 0.37163458], - [0.89737819, 0.44322747, 0.37043052], - [0.89647387, 0.43932557, 0.36928078], - [0.89554477, 0.43542759, 0.36818855], - [0.89458871, 0.4315354, 0.36715654], - [0.89360794, 0.42764714, 0.36618273], - [0.89260152, 0.42376366, 0.36526813], - [0.8915687, 0.41988565, 0.36441384], - [0.89050882, 0.41601371, 0.36362102], - [0.8894159, 0.41215334, 0.36289639], - [0.888292, 0.40830288, 0.36223756], - [0.88713784, 0.40446193, 0.36164328], - [0.88595253, 0.40063149, 0.36111438], - [0.88473115, 0.39681635, 0.3606566], - [0.88347246, 0.39301805, 0.36027074], - [0.88217931, 0.38923439, 0.35995244], - [0.880851, 0.38546632, 0.35970244], - [0.87947728, 0.38172422, 0.35953127], - [0.87806542, 0.37800172, 0.35942941], - [0.87661509, 0.37429964, 0.35939659], - [0.87511668, 0.37062819, 0.35944178], - [0.87357554, 0.36698279, 0.35955811], - [0.87199254, 0.3633634, 0.35974223], - [0.87035691, 0.35978174, 0.36000516], - [0.86867647, 0.35623087, 0.36033559], - [0.86694949, 0.35271349, 0.36073358], - [0.86516775, 0.34923921, 0.36120624], - [0.86333996, 0.34580008, 0.36174113], - [0.86145909, 0.3424046, 0.36234402], - [0.85952586, 0.33905327, 0.36301129], - [0.85754536, 0.33574168, 0.36373567], - [0.855514, 0.33247568, 0.36451271], - [0.85344392, 0.32924217, 0.36533344], - [0.8513284, 0.32604977, 0.36620106], - [0.84916723, 0.32289973, 0.36711424], - [0.84696243, 0.31979068, 0.36806976], - [0.84470627, 0.31673295, 0.36907066], - [0.84240761, 0.31371695, 0.37010969], - [0.84005337, 0.31075974, 0.37119284], - [0.83765537, 0.30784814, 0.3723105], - [0.83520234, 0.30499724, 0.37346726], - [0.83270291, 0.30219766, 0.37465552], - [0.83014895, 0.29946081, 0.37587769], - [0.82754694, 0.29677989, 0.37712733], - [0.82489111, 0.29416352, 0.37840532], - [0.82218644, 0.29160665, 0.37970606], - [0.81942908, 0.28911553, 0.38102921], - [0.81662276, 0.28668665, 0.38236999], - [0.81376555, 0.28432371, 0.383727], - [0.81085964, 0.28202508, 0.38509649], - 
[0.8079055, 0.27979128, 0.38647583], - [0.80490309, 0.27762348, 0.3878626], - [0.80185613, 0.2755178, 0.38925253], - [0.79876118, 0.27347974, 0.39064559], - [0.79562644, 0.27149928, 0.39203532], - [0.79244362, 0.2695883, 0.39342447], - [0.78922456, 0.26773176, 0.3948046], - [0.78596161, 0.26594053, 0.39617873], - [0.7826624, 0.26420493, 0.39754146], - [0.77932717, 0.26252522, 0.39889102], - [0.77595363, 0.2609049, 0.4002279], - [0.77254999, 0.25933319, 0.40154704], - [0.76911107, 0.25781758, 0.40284959], - [0.76564158, 0.25635173, 0.40413341], - [0.76214598, 0.25492998, 0.40539471], - [0.75861834, 0.25356035, 0.40663694], - [0.75506533, 0.25223402, 0.40785559], - [0.75148963, 0.2509473, 0.40904966], - [0.74788835, 0.24970413, 0.41022028], - [0.74426345, 0.24850191, 0.41136599], - [0.74061927, 0.24733457, 0.41248516], - [0.73695678, 0.24620072, 0.41357737], - [0.73327278, 0.24510469, 0.41464364], - [0.72957096, 0.24404127, 0.4156828], - [0.72585394, 0.24300672, 0.41669383], - [0.7221226, 0.24199971, 0.41767651], - [0.71837612, 0.24102046, 0.41863486], - [0.71463236, 0.24004289, 0.41956983], - [0.7108932, 0.23906316, 0.42048681], - [0.70715842, 0.23808142, 0.42138647], - [0.70342811, 0.2370976, 0.42226844], - [0.69970218, 0.23611179, 0.42313282], - [0.69598055, 0.2351247, 0.42397678], - [0.69226314, 0.23413578, 0.42480327], - [0.68854988, 0.23314511, 0.42561234], - [0.68484064, 0.23215279, 0.42640419], - [0.68113541, 0.23115942, 0.42717615], - [0.67743412, 0.23016472, 0.42792989], - [0.67373662, 0.22916861, 0.42866642], - [0.67004287, 0.22817117, 0.42938576], - [0.66635279, 0.22717328, 0.43008427], - [0.66266621, 0.22617435, 0.43076552], - [0.65898313, 0.22517434, 0.43142956], - [0.65530349, 0.22417381, 0.43207427], - [0.65162696, 0.22317307, 0.4327001], - [0.64795375, 0.22217149, 0.43330852], - [0.64428351, 0.22116972, 0.43389854], - [0.64061624, 0.22016818, 0.43446845], - [0.63695183, 0.21916625, 0.43502123], - [0.63329016, 0.21816454, 0.43555493], - [0.62963102, 0.2171635, 0.43606881], - [0.62597451, 0.21616235, 0.43656529], - [0.62232019, 0.21516239, 0.43704153], - [0.61866821, 0.21416307, 0.43749868], - [0.61501835, 0.21316435, 0.43793808], - [0.61137029, 0.21216761, 0.4383556], - [0.60772426, 0.2111715, 0.43875552], - [0.60407977, 0.21017746, 0.43913439], - [0.60043678, 0.20918503, 0.43949412], - [0.59679524, 0.20819447, 0.43983393], - [0.59315487, 0.20720639, 0.44015254], - [0.58951566, 0.20622027, 0.44045213], - [0.58587715, 0.20523751, 0.44072926], - [0.5822395, 0.20425693, 0.44098758], - [0.57860222, 0.20328034, 0.44122241], - [0.57496549, 0.20230637, 0.44143805], - [0.57132875, 0.20133689, 0.4416298], - [0.56769215, 0.20037071, 0.44180142], - [0.5640552, 0.19940936, 0.44194923], - [0.56041794, 0.19845221, 0.44207535], - [0.55678004, 0.1975, 0.44217824], - [0.55314129, 0.19655316, 0.44225723], - [0.54950166, 0.19561118, 0.44231412], - [0.54585987, 0.19467771, 0.44234111], - [0.54221157, 0.19375869, 0.44233698], - [0.5385549, 0.19285696, 0.44229959], - [0.5348913, 0.19197036, 0.44222958], - [0.53122177, 0.1910974, 0.44212735], - [0.52754464, 0.19024042, 0.44199159], - [0.52386353, 0.18939409, 0.44182449], - [0.52017476, 0.18856368, 0.44162345], - [0.51648277, 0.18774266, 0.44139128], - [0.51278481, 0.18693492, 0.44112605], - [0.50908361, 0.18613639, 0.4408295], - [0.50537784, 0.18534893, 0.44050064], - [0.50166912, 0.18457008, 0.44014054], - [0.49795686, 0.18380056, 0.43974881], - [0.49424218, 0.18303865, 0.43932623], - [0.49052472, 0.18228477, 0.43887255], - [0.48680565, 0.1815371, 
0.43838867], - [0.48308419, 0.18079663, 0.43787408], - [0.47936222, 0.18006056, 0.43733022], - [0.47563799, 0.17933127, 0.43675585], - [0.47191466, 0.17860416, 0.43615337], - [0.46818879, 0.17788392, 0.43552047], - [0.46446454, 0.17716458, 0.43486036], - [0.46073893, 0.17645017, 0.43417097], - [0.45701462, 0.17573691, 0.43345429], - [0.45329097, 0.17502549, 0.43271025], - [0.44956744, 0.17431649, 0.4319386], - [0.44584668, 0.17360625, 0.43114133], - [0.44212538, 0.17289906, 0.43031642], - [0.43840678, 0.17219041, 0.42946642], - [0.43469046, 0.17148074, 0.42859124], - [0.4309749, 0.17077192, 0.42769008], - [0.42726297, 0.17006003, 0.42676519], - [0.42355299, 0.16934709, 0.42581586], - [0.41984535, 0.16863258, 0.42484219], - [0.41614149, 0.16791429, 0.42384614], - [0.41244029, 0.16719372, 0.42282661], - [0.40874177, 0.16647061, 0.42178429], - [0.40504765, 0.16574261, 0.42072062], - [0.401357, 0.16501079, 0.41963528], - [0.397669, 0.16427607, 0.418528], - [0.39398585, 0.16353554, 0.41740053], - [0.39030735, 0.16278924, 0.41625344], - [0.3866314, 0.16203977, 0.41508517], - [0.38295904, 0.16128519, 0.41389849], - [0.37928736, 0.16052483, 0.41270599], - [0.37562649, 0.15974704, 0.41151182], - [0.37197803, 0.15895049, 0.41031532], - [0.36833779, 0.15813871, 0.40911916], - [0.36470944, 0.15730861, 0.40792149], - [0.36109117, 0.15646169, 0.40672362], - [0.35748213, 0.15559861, 0.40552633], - [0.353885, 0.15471714, 0.40432831], - [0.35029682, 0.15381967, 0.4031316], - [0.34671861, 0.1529053, 0.40193587], - [0.34315191, 0.15197275, 0.40074049], - [0.33959331, 0.15102466, 0.3995478], - [0.33604378, 0.15006017, 0.39835754], - [0.33250529, 0.14907766, 0.39716879], - [0.32897621, 0.14807831, 0.39598285], - [0.3254559, 0.14706248, 0.39480044], - [0.32194567, 0.14602909, 0.39362106], - [0.31844477, 0.14497857, 0.39244549], - [0.31494974, 0.14391333, 0.39127626], - [0.31146605, 0.14282918, 0.39011024], - [0.30798857, 0.1417297, 0.38895105], - [0.30451661, 0.14061515, 0.38779953], - [0.30105136, 0.13948445, 0.38665531], - [0.2975886, 0.1383403, 0.38552159], - [0.29408557, 0.13721193, 0.38442775] -] - - -_crest_lut = [ - [0.6468274, 0.80289262, 0.56592265], - [0.64233318, 0.80081141, 0.56639461], - [0.63791969, 0.7987162, 0.56674976], - [0.6335316, 0.79661833, 0.56706128], - [0.62915226, 0.7945212, 0.56735066], - [0.62477862, 0.79242543, 0.56762143], - [0.62042003, 0.79032918, 0.56786129], - [0.61606327, 0.78823508, 0.56808666], - [0.61171322, 0.78614216, 0.56829092], - [0.60736933, 0.78405055, 0.56847436], - [0.60302658, 0.78196121, 0.56864272], - [0.59868708, 0.77987374, 0.56879289], - [0.59435366, 0.77778758, 0.56892099], - [0.59001953, 0.77570403, 0.56903477], - [0.58568753, 0.77362254, 0.56913028], - [0.58135593, 0.77154342, 0.56920908], - [0.57702623, 0.76946638, 0.56926895], - [0.57269165, 0.76739266, 0.5693172], - [0.56835934, 0.76532092, 0.56934507], - [0.56402533, 0.76325185, 0.56935664], - [0.55968429, 0.76118643, 0.56935732], - [0.55534159, 0.75912361, 0.56934052], - [0.55099572, 0.75706366, 0.56930743], - [0.54664626, 0.75500662, 0.56925799], - [0.54228969, 0.75295306, 0.56919546], - [0.53792417, 0.75090328, 0.56912118], - [0.53355172, 0.74885687, 0.5690324], - [0.52917169, 0.74681387, 0.56892926], - [0.52478243, 0.74477453, 0.56881287], - [0.52038338, 0.74273888, 0.56868323], - [0.5159739, 0.74070697, 0.56854039], - [0.51155269, 0.73867895, 0.56838507], - [0.50711872, 0.73665492, 0.56821764], - [0.50267118, 0.73463494, 0.56803826], - [0.49822926, 0.73261388, 0.56785146], - [0.49381422, 
0.73058524, 0.56767484], - [0.48942421, 0.72854938, 0.56751036], - [0.48505993, 0.72650623, 0.56735752], - [0.48072207, 0.72445575, 0.56721583], - [0.4764113, 0.72239788, 0.56708475], - [0.47212827, 0.72033258, 0.56696376], - [0.46787361, 0.71825983, 0.56685231], - [0.46364792, 0.71617961, 0.56674986], - [0.45945271, 0.71409167, 0.56665625], - [0.45528878, 0.71199595, 0.56657103], - [0.45115557, 0.70989276, 0.5664931], - [0.44705356, 0.70778212, 0.56642189], - [0.44298321, 0.70566406, 0.56635683], - [0.43894492, 0.70353863, 0.56629734], - [0.43493911, 0.70140588, 0.56624286], - [0.43096612, 0.69926587, 0.5661928], - [0.42702625, 0.69711868, 0.56614659], - [0.42311977, 0.69496438, 0.56610368], - [0.41924689, 0.69280308, 0.56606355], - [0.41540778, 0.69063486, 0.56602564], - [0.41160259, 0.68845984, 0.56598944], - [0.40783143, 0.68627814, 0.56595436], - [0.40409434, 0.68408988, 0.56591994], - [0.40039134, 0.68189518, 0.56588564], - [0.39672238, 0.6796942, 0.56585103], - [0.39308781, 0.67748696, 0.56581581], - [0.38949137, 0.67527276, 0.56578084], - [0.38592889, 0.67305266, 0.56574422], - [0.38240013, 0.67082685, 0.56570561], - [0.37890483, 0.66859548, 0.56566462], - [0.37544276, 0.66635871, 0.56562081], - [0.37201365, 0.66411673, 0.56557372], - [0.36861709, 0.6618697, 0.5655231], - [0.36525264, 0.65961782, 0.56546873], - [0.36191986, 0.65736125, 0.56541032], - [0.35861935, 0.65509998, 0.56534768], - [0.35535621, 0.65283302, 0.56528211], - [0.35212361, 0.65056188, 0.56521171], - [0.34892097, 0.64828676, 0.56513633], - [0.34574785, 0.64600783, 0.56505539], - [0.34260357, 0.64372528, 0.5649689], - [0.33948744, 0.64143931, 0.56487679], - [0.33639887, 0.6391501, 0.56477869], - [0.33334501, 0.63685626, 0.56467661], - [0.33031952, 0.63455911, 0.564569], - [0.3273199, 0.63225924, 0.56445488], - [0.32434526, 0.62995682, 0.56433457], - [0.32139487, 0.62765201, 0.56420795], - [0.31846807, 0.62534504, 0.56407446], - [0.3155731, 0.62303426, 0.56393695], - [0.31270304, 0.62072111, 0.56379321], - [0.30985436, 0.61840624, 0.56364307], - [0.30702635, 0.61608984, 0.56348606], - [0.30421803, 0.61377205, 0.56332267], - [0.30143611, 0.61145167, 0.56315419], - [0.29867863, 0.60912907, 0.56298054], - [0.29593872, 0.60680554, 0.56280022], - [0.29321538, 0.60448121, 0.56261376], - [0.2905079, 0.60215628, 0.56242036], - [0.28782827, 0.5998285, 0.56222366], - [0.28516521, 0.59749996, 0.56202093], - [0.28251558, 0.59517119, 0.56181204], - [0.27987847, 0.59284232, 0.56159709], - [0.27726216, 0.59051189, 0.56137785], - [0.27466434, 0.58818027, 0.56115433], - [0.2720767, 0.58584893, 0.56092486], - [0.26949829, 0.58351797, 0.56068983], - [0.26693801, 0.58118582, 0.56045121], - [0.26439366, 0.57885288, 0.56020858], - [0.26185616, 0.57652063, 0.55996077], - [0.25932459, 0.57418919, 0.55970795], - [0.25681303, 0.57185614, 0.55945297], - [0.25431024, 0.56952337, 0.55919385], - [0.25180492, 0.56719255, 0.5589305], - [0.24929311, 0.56486397, 0.5586654], - [0.24678356, 0.56253666, 0.55839491], - [0.24426587, 0.56021153, 0.55812473], - [0.24174022, 0.55788852, 0.55785448], - [0.23921167, 0.55556705, 0.55758211], - [0.23668315, 0.55324675, 0.55730676], - [0.23414742, 0.55092825, 0.55703167], - [0.23160473, 0.54861143, 0.5567573], - [0.22905996, 0.54629572, 0.55648168], - [0.22651648, 0.54398082, 0.5562029], - [0.22396709, 0.54166721, 0.55592542], - [0.22141221, 0.53935481, 0.55564885], - [0.21885269, 0.53704347, 0.55537294], - [0.21629986, 0.53473208, 0.55509319], - [0.21374297, 0.53242154, 0.5548144], - [0.21118255, 0.53011166, 
0.55453708], - [0.2086192, 0.52780237, 0.55426067], - [0.20605624, 0.52549322, 0.55398479], - [0.20350004, 0.5231837, 0.55370601], - [0.20094292, 0.52087429, 0.55342884], - [0.19838567, 0.51856489, 0.55315283], - [0.19582911, 0.51625531, 0.55287818], - [0.19327413, 0.51394542, 0.55260469], - [0.19072933, 0.51163448, 0.5523289], - [0.18819045, 0.50932268, 0.55205372], - [0.18565609, 0.50701014, 0.55177937], - [0.18312739, 0.50469666, 0.55150597], - [0.18060561, 0.50238204, 0.55123374], - [0.178092, 0.50006616, 0.55096224], - [0.17558808, 0.49774882, 0.55069118], - [0.17310341, 0.49542924, 0.5504176], - [0.17063111, 0.49310789, 0.55014445], - [0.1681728, 0.49078458, 0.54987159], - [0.1657302, 0.48845913, 0.54959882], - [0.16330517, 0.48613135, 0.54932605], - [0.16089963, 0.48380104, 0.54905306], - [0.15851561, 0.48146803, 0.54877953], - [0.15615526, 0.47913212, 0.54850526], - [0.15382083, 0.47679313, 0.54822991], - [0.15151471, 0.47445087, 0.54795318], - [0.14924112, 0.47210502, 0.54767411], - [0.1470032, 0.46975537, 0.54739226], - [0.14480101, 0.46740187, 0.54710832], - [0.14263736, 0.46504434, 0.54682188], - [0.14051521, 0.46268258, 0.54653253], - [0.13843761, 0.46031639, 0.54623985], - [0.13640774, 0.45794558, 0.5459434], - [0.13442887, 0.45556994, 0.54564272], - [0.1325044, 0.45318928, 0.54533736], - [0.13063777, 0.4508034, 0.54502674], - [0.12883252, 0.44841211, 0.5447104], - [0.12709242, 0.44601517, 0.54438795], - [0.1254209, 0.44361244, 0.54405855], - [0.12382162, 0.44120373, 0.54372156], - [0.12229818, 0.43878887, 0.54337634], - [0.12085453, 0.4363676, 0.54302253], - [0.11949938, 0.43393955, 0.54265715], - [0.11823166, 0.43150478, 0.54228104], - [0.11705496, 0.42906306, 0.54189388], - [0.115972, 0.42661431, 0.54149449], - [0.11498598, 0.42415835, 0.54108222], - [0.11409965, 0.42169502, 0.54065622], - [0.11331533, 0.41922424, 0.5402155], - [0.11263542, 0.41674582, 0.53975931], - [0.1120615, 0.4142597, 0.53928656], - [0.11159738, 0.41176567, 0.53879549], - [0.11125248, 0.40926325, 0.53828203], - [0.11101698, 0.40675289, 0.53774864], - [0.11089152, 0.40423445, 0.53719455], - [0.11085121, 0.4017095, 0.53662425], - [0.11087217, 0.39917938, 0.53604354], - [0.11095515, 0.39664394, 0.53545166], - [0.11110676, 0.39410282, 0.53484509], - [0.11131735, 0.39155635, 0.53422678], - [0.11158595, 0.38900446, 0.53359634], - [0.11191139, 0.38644711, 0.5329534], - [0.11229224, 0.38388426, 0.53229748], - [0.11273683, 0.38131546, 0.53162393], - [0.11323438, 0.37874109, 0.53093619], - [0.11378271, 0.37616112, 0.53023413], - [0.11437992, 0.37357557, 0.52951727], - [0.11502681, 0.37098429, 0.52878396], - [0.11572661, 0.36838709, 0.52803124], - [0.11646936, 0.36578429, 0.52726234], - [0.11725299, 0.3631759, 0.52647685], - [0.1180755, 0.36056193, 0.52567436], - [0.1189438, 0.35794203, 0.5248497], - [0.11984752, 0.35531657, 0.52400649], - [0.1207833, 0.35268564, 0.52314492], - [0.12174895, 0.35004927, 0.52226461], - [0.12274959, 0.34740723, 0.52136104], - [0.12377809, 0.34475975, 0.52043639], - [0.12482961, 0.34210702, 0.51949179], - [0.125902, 0.33944908, 0.51852688], - [0.12699998, 0.33678574, 0.51753708], - [0.12811691, 0.33411727, 0.51652464], - [0.12924811, 0.33144384, 0.51549084], - [0.13039157, 0.32876552, 0.51443538], - [0.13155228, 0.32608217, 0.51335321], - [0.13272282, 0.32339407, 0.51224759], - [0.13389954, 0.32070138, 0.51111946], - [0.13508064, 0.31800419, 0.50996862], - [0.13627149, 0.31530238, 0.50878942], - [0.13746376, 0.31259627, 0.50758645], - [0.13865499, 0.30988598, 0.50636017], - 
[0.13984364, 0.30717161, 0.50511042], - [0.14103515, 0.30445309, 0.50383119], - [0.14222093, 0.30173071, 0.50252813], - [0.14339946, 0.2990046, 0.50120127], - [0.14456941, 0.29627483, 0.49985054], - [0.14573579, 0.29354139, 0.49847009], - [0.14689091, 0.29080452, 0.49706566], - [0.1480336, 0.28806432, 0.49563732], - [0.1491628, 0.28532086, 0.49418508], - [0.15028228, 0.28257418, 0.49270402], - [0.15138673, 0.27982444, 0.49119848], - [0.15247457, 0.27707172, 0.48966925], - [0.15354487, 0.2743161, 0.48811641], - [0.15459955, 0.27155765, 0.4865371], - [0.15563716, 0.26879642, 0.4849321], - [0.1566572, 0.26603191, 0.48330429], - [0.15765823, 0.26326032, 0.48167456], - [0.15862147, 0.26048295, 0.48005785], - [0.15954301, 0.25770084, 0.47845341], - [0.16043267, 0.25491144, 0.4768626], - [0.16129262, 0.25211406, 0.4752857], - [0.1621119, 0.24931169, 0.47372076], - [0.16290577, 0.24649998, 0.47217025], - [0.16366819, 0.24368054, 0.47063302], - [0.1644021, 0.24085237, 0.46910949], - [0.16510882, 0.2380149, 0.46759982], - [0.16579015, 0.23516739, 0.46610429], - [0.1664433, 0.2323105, 0.46462219], - [0.16707586, 0.22944155, 0.46315508], - [0.16768475, 0.22656122, 0.46170223], - [0.16826815, 0.22366984, 0.46026308], - [0.16883174, 0.22076514, 0.45883891], - [0.16937589, 0.21784655, 0.45742976], - [0.16990129, 0.21491339, 0.45603578], - [0.1704074, 0.21196535, 0.45465677], - [0.17089473, 0.20900176, 0.4532928], - [0.17136819, 0.20602012, 0.45194524], - [0.17182683, 0.20302012, 0.45061386], - [0.17227059, 0.20000106, 0.44929865], - [0.17270583, 0.19695949, 0.44800165], - [0.17313804, 0.19389201, 0.44672488], - [0.17363177, 0.19076859, 0.44549087]
-]
-
-
-_lut_dict = dict(
-    rocket=_rocket_lut,
-    mako=_mako_lut,
-    icefire=_icefire_lut,
-    vlag=_vlag_lut,
-    flare=_flare_lut,
-    crest=_crest_lut,
-
-)
-
-for _name, _lut in _lut_dict.items():
-
-    _cmap = colors.ListedColormap(_lut, _name)
-    locals()[_name] = _cmap
-
-    _cmap_r = colors.ListedColormap(_lut[::-1], _name + "_r")
-    locals()[_name + "_r"] = _cmap_r
-
-    register_colormap(_name, _cmap)
-    register_colormap(_name + "_r", _cmap_r)
-
-del colors, register_colormap
diff --git a/seaborn/colors/__init__.py b/seaborn/colors/__init__.py
deleted file mode 100644
index 3d0bf1d56bdc5c0e724c8eeb95200297884337cc..0000000000000000000000000000000000000000
--- a/seaborn/colors/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .xkcd_rgb import xkcd_rgb  # noqa: F401
-from .crayons import crayons  # noqa: F401
diff --git a/seaborn/colors/crayons.py b/seaborn/colors/crayons.py
deleted file mode 100644
index 548af1f199355e00e2b1956aa992a48ed61d090a..0000000000000000000000000000000000000000
--- a/seaborn/colors/crayons.py
+++ /dev/null
@@ -1,120 +0,0 @@
-crayons = {'Almond': '#EFDECD', - 'Antique Brass': '#CD9575', - 'Apricot': '#FDD9B5', - 'Aquamarine': '#78DBE2', - 'Asparagus': '#87A96B', - 'Atomic Tangerine': '#FFA474', - 'Banana Mania': '#FAE7B5', - 'Beaver': '#9F8170', - 'Bittersweet': '#FD7C6E', - 'Black': '#000000', - 'Blue': '#1F75FE', - 'Blue Bell': '#A2A2D0', - 'Blue Green': '#0D98BA', - 'Blue Violet': '#7366BD', - 'Blush': '#DE5D83', - 'Brick Red': '#CB4154', - 'Brown': '#B4674D', - 'Burnt Orange': '#FF7F49', - 'Burnt Sienna': '#EA7E5D', - 'Cadet Blue': '#B0B7C6', - 'Canary': '#FFFF99', - 'Caribbean Green': '#00CC99', - 'Carnation Pink': '#FFAACC', - 'Cerise': '#DD4492', - 'Cerulean': '#1DACD6', - 'Chestnut': '#BC5D58', - 'Copper': '#DD9475', - 'Cornflower': '#9ACEEB', - 'Cotton Candy': '#FFBCD9', - 'Dandelion': '#FDDB6D', - 'Denim': '#2B6CC4', - 'Desert Sand': 
'#EFCDB8', - 'Eggplant': '#6E5160', - 'Electric Lime': '#CEFF1D', - 'Fern': '#71BC78', - 'Forest Green': '#6DAE81', - 'Fuchsia': '#C364C5', - 'Fuzzy Wuzzy': '#CC6666', - 'Gold': '#E7C697', - 'Goldenrod': '#FCD975', - 'Granny Smith Apple': '#A8E4A0', - 'Gray': '#95918C', - 'Green': '#1CAC78', - 'Green Yellow': '#F0E891', - 'Hot Magenta': '#FF1DCE', - 'Inchworm': '#B2EC5D', - 'Indigo': '#5D76CB', - 'Jazzberry Jam': '#CA3767', - 'Jungle Green': '#3BB08F', - 'Laser Lemon': '#FEFE22', - 'Lavender': '#FCB4D5', - 'Macaroni and Cheese': '#FFBD88', - 'Magenta': '#F664AF', - 'Mahogany': '#CD4A4C', - 'Manatee': '#979AAA', - 'Mango Tango': '#FF8243', - 'Maroon': '#C8385A', - 'Mauvelous': '#EF98AA', - 'Melon': '#FDBCB4', - 'Midnight Blue': '#1A4876', - 'Mountain Meadow': '#30BA8F', - 'Navy Blue': '#1974D2', - 'Neon Carrot': '#FFA343', - 'Olive Green': '#BAB86C', - 'Orange': '#FF7538', - 'Orchid': '#E6A8D7', - 'Outer Space': '#414A4C', - 'Outrageous Orange': '#FF6E4A', - 'Pacific Blue': '#1CA9C9', - 'Peach': '#FFCFAB', - 'Periwinkle': '#C5D0E6', - 'Piggy Pink': '#FDDDE6', - 'Pine Green': '#158078', - 'Pink Flamingo': '#FC74FD', - 'Pink Sherbert': '#F78FA7', - 'Plum': '#8E4585', - 'Purple Heart': '#7442C8', - "Purple Mountains' Majesty": '#9D81BA', - 'Purple Pizzazz': '#FE4EDA', - 'Radical Red': '#FF496C', - 'Raw Sienna': '#D68A59', - 'Razzle Dazzle Rose': '#FF48D0', - 'Razzmatazz': '#E3256B', - 'Red': '#EE204D', - 'Red Orange': '#FF5349', - 'Red Violet': '#C0448F', - "Robin's Egg Blue": '#1FCECB', - 'Royal Purple': '#7851A9', - 'Salmon': '#FF9BAA', - 'Scarlet': '#FC2847', - "Screamin' Green": '#76FF7A', - 'Sea Green': '#93DFB8', - 'Sepia': '#A5694F', - 'Shadow': '#8A795D', - 'Shamrock': '#45CEA2', - 'Shocking Pink': '#FB7EFD', - 'Silver': '#CDC5C2', - 'Sky Blue': '#80DAEB', - 'Spring Green': '#ECEABE', - 'Sunglow': '#FFCF48', - 'Sunset Orange': '#FD5E53', - 'Tan': '#FAA76C', - 'Tickle Me Pink': '#FC89AC', - 'Timberwolf': '#DBD7D2', - 'Tropical Rain Forest': '#17806D', - 'Tumbleweed': '#DEAA88', - 'Turquoise Blue': '#77DDE7', - 'Unmellow Yellow': '#FFFF66', - 'Violet (Purple)': '#926EAE', - 'Violet Red': '#F75394', - 'Vivid Tangerine': '#FFA089', - 'Vivid Violet': '#8F509D', - 'White': '#FFFFFF', - 'Wild Blue Yonder': '#A2ADD0', - 'Wild Strawberry': '#FF43A4', - 'Wild Watermelon': '#FC6C85', - 'Wisteria': '#CDA4DE', - 'Yellow': '#FCE883', - 'Yellow Green': '#C5E384', - 'Yellow Orange': '#FFAE42'}
diff --git a/seaborn/colors/xkcd_rgb.py b/seaborn/colors/xkcd_rgb.py
deleted file mode 100644
index 0f775cf6512c789ee4201cc41ed5c5fcc389a500..0000000000000000000000000000000000000000
--- a/seaborn/colors/xkcd_rgb.py
+++ /dev/null
@@ -1,949 +0,0 @@
-xkcd_rgb = {'acid green': '#8ffe09', - 'adobe': '#bd6c48', - 'algae': '#54ac68', - 'algae green': '#21c36f', - 'almost black': '#070d0d', - 'amber': '#feb308', - 'amethyst': '#9b5fc0', - 'apple': '#6ecb3c', - 'apple green': '#76cd26', - 'apricot': '#ffb16d', - 'aqua': '#13eac9', - 'aqua blue': '#02d8e9', - 'aqua green': '#12e193', - 'aqua marine': '#2ee8bb', - 'aquamarine': '#04d8b2', - 'army green': '#4b5d16', - 'asparagus': '#77ab56', - 'aubergine': '#3d0734', - 'auburn': '#9a3001', - 'avocado': '#90b134', - 'avocado green': '#87a922', - 'azul': '#1d5dec', - 'azure': '#069af3', - 'baby blue': '#a2cffe', - 'baby green': '#8cff9e', - 'baby pink': '#ffb7ce', - 'baby poo': '#ab9004', - 'baby poop': '#937c00', - 'baby poop green': '#8f9805', - 'baby puke green': '#b6c406', - 'baby purple': '#ca9bf7', - 'baby shit brown': '#ad900d', - 'baby shit green': '#889717', - 
'banana': '#ffff7e', - 'banana yellow': '#fafe4b', - 'barbie pink': '#fe46a5', - 'barf green': '#94ac02', - 'barney': '#ac1db8', - 'barney purple': '#a00498', - 'battleship grey': '#6b7c85', - 'beige': '#e6daa6', - 'berry': '#990f4b', - 'bile': '#b5c306', - 'black': '#000000', - 'bland': '#afa88b', - 'blood': '#770001', - 'blood orange': '#fe4b03', - 'blood red': '#980002', - 'blue': '#0343df', - 'blue blue': '#2242c7', - 'blue green': '#137e6d', - 'blue grey': '#607c8e', - 'blue purple': '#5729ce', - 'blue violet': '#5d06e9', - 'blue with a hint of purple': '#533cc6', - 'blue/green': '#0f9b8e', - 'blue/grey': '#758da3', - 'blue/purple': '#5a06ef', - 'blueberry': '#464196', - 'bluegreen': '#017a79', - 'bluegrey': '#85a3b2', - 'bluey green': '#2bb179', - 'bluey grey': '#89a0b0', - 'bluey purple': '#6241c7', - 'bluish': '#2976bb', - 'bluish green': '#10a674', - 'bluish grey': '#748b97', - 'bluish purple': '#703be7', - 'blurple': '#5539cc', - 'blush': '#f29e8e', - 'blush pink': '#fe828c', - 'booger': '#9bb53c', - 'booger green': '#96b403', - 'bordeaux': '#7b002c', - 'boring green': '#63b365', - 'bottle green': '#044a05', - 'brick': '#a03623', - 'brick orange': '#c14a09', - 'brick red': '#8f1402', - 'bright aqua': '#0bf9ea', - 'bright blue': '#0165fc', - 'bright cyan': '#41fdfe', - 'bright green': '#01ff07', - 'bright lavender': '#c760ff', - 'bright light blue': '#26f7fd', - 'bright light green': '#2dfe54', - 'bright lilac': '#c95efb', - 'bright lime': '#87fd05', - 'bright lime green': '#65fe08', - 'bright magenta': '#ff08e8', - 'bright olive': '#9cbb04', - 'bright orange': '#ff5b00', - 'bright pink': '#fe01b1', - 'bright purple': '#be03fd', - 'bright red': '#ff000d', - 'bright sea green': '#05ffa6', - 'bright sky blue': '#02ccfe', - 'bright teal': '#01f9c6', - 'bright turquoise': '#0ffef9', - 'bright violet': '#ad0afd', - 'bright yellow': '#fffd01', - 'bright yellow green': '#9dff00', - 'british racing green': '#05480d', - 'bronze': '#a87900', - 'brown': '#653700', - 'brown green': '#706c11', - 'brown grey': '#8d8468', - 'brown orange': '#b96902', - 'brown red': '#922b05', - 'brown yellow': '#b29705', - 'brownish': '#9c6d57', - 'brownish green': '#6a6e09', - 'brownish grey': '#86775f', - 'brownish orange': '#cb7723', - 'brownish pink': '#c27e79', - 'brownish purple': '#76424e', - 'brownish red': '#9e3623', - 'brownish yellow': '#c9b003', - 'browny green': '#6f6c0a', - 'browny orange': '#ca6b02', - 'bruise': '#7e4071', - 'bubble gum pink': '#ff69af', - 'bubblegum': '#ff6cb5', - 'bubblegum pink': '#fe83cc', - 'buff': '#fef69e', - 'burgundy': '#610023', - 'burnt orange': '#c04e01', - 'burnt red': '#9f2305', - 'burnt siena': '#b75203', - 'burnt sienna': '#b04e0f', - 'burnt umber': '#a0450e', - 'burnt yellow': '#d5ab09', - 'burple': '#6832e3', - 'butter': '#ffff81', - 'butter yellow': '#fffd74', - 'butterscotch': '#fdb147', - 'cadet blue': '#4e7496', - 'camel': '#c69f59', - 'camo': '#7f8f4e', - 'camo green': '#526525', - 'camouflage green': '#4b6113', - 'canary': '#fdff63', - 'canary yellow': '#fffe40', - 'candy pink': '#ff63e9', - 'caramel': '#af6f09', - 'carmine': '#9d0216', - 'carnation': '#fd798f', - 'carnation pink': '#ff7fa7', - 'carolina blue': '#8ab8fe', - 'celadon': '#befdb7', - 'celery': '#c1fd95', - 'cement': '#a5a391', - 'cerise': '#de0c62', - 'cerulean': '#0485d1', - 'cerulean blue': '#056eee', - 'charcoal': '#343837', - 'charcoal grey': '#3c4142', - 'chartreuse': '#c1f80a', - 'cherry': '#cf0234', - 'cherry red': '#f7022a', - 'chestnut': '#742802', - 'chocolate': '#3d1c02', - 
'chocolate brown': '#411900', - 'cinnamon': '#ac4f06', - 'claret': '#680018', - 'clay': '#b66a50', - 'clay brown': '#b2713d', - 'clear blue': '#247afd', - 'cloudy blue': '#acc2d9', - 'cobalt': '#1e488f', - 'cobalt blue': '#030aa7', - 'cocoa': '#875f42', - 'coffee': '#a6814c', - 'cool blue': '#4984b8', - 'cool green': '#33b864', - 'cool grey': '#95a3a6', - 'copper': '#b66325', - 'coral': '#fc5a50', - 'coral pink': '#ff6163', - 'cornflower': '#6a79f7', - 'cornflower blue': '#5170d7', - 'cranberry': '#9e003a', - 'cream': '#ffffc2', - 'creme': '#ffffb6', - 'crimson': '#8c000f', - 'custard': '#fffd78', - 'cyan': '#00ffff', - 'dandelion': '#fedf08', - 'dark': '#1b2431', - 'dark aqua': '#05696b', - 'dark aquamarine': '#017371', - 'dark beige': '#ac9362', - 'dark blue': '#00035b', - 'dark blue green': '#005249', - 'dark blue grey': '#1f3b4d', - 'dark brown': '#341c02', - 'dark coral': '#cf524e', - 'dark cream': '#fff39a', - 'dark cyan': '#0a888a', - 'dark forest green': '#002d04', - 'dark fuchsia': '#9d0759', - 'dark gold': '#b59410', - 'dark grass green': '#388004', - 'dark green': '#033500', - 'dark green blue': '#1f6357', - 'dark grey': '#363737', - 'dark grey blue': '#29465b', - 'dark hot pink': '#d90166', - 'dark indigo': '#1f0954', - 'dark khaki': '#9b8f55', - 'dark lavender': '#856798', - 'dark lilac': '#9c6da5', - 'dark lime': '#84b701', - 'dark lime green': '#7ebd01', - 'dark magenta': '#960056', - 'dark maroon': '#3c0008', - 'dark mauve': '#874c62', - 'dark mint': '#48c072', - 'dark mint green': '#20c073', - 'dark mustard': '#a88905', - 'dark navy': '#000435', - 'dark navy blue': '#00022e', - 'dark olive': '#373e02', - 'dark olive green': '#3c4d03', - 'dark orange': '#c65102', - 'dark pastel green': '#56ae57', - 'dark peach': '#de7e5d', - 'dark periwinkle': '#665fd1', - 'dark pink': '#cb416b', - 'dark plum': '#3f012c', - 'dark purple': '#35063e', - 'dark red': '#840000', - 'dark rose': '#b5485d', - 'dark royal blue': '#02066f', - 'dark sage': '#598556', - 'dark salmon': '#c85a53', - 'dark sand': '#a88f59', - 'dark sea green': '#11875d', - 'dark seafoam': '#1fb57a', - 'dark seafoam green': '#3eaf76', - 'dark sky blue': '#448ee4', - 'dark slate blue': '#214761', - 'dark tan': '#af884a', - 'dark taupe': '#7f684e', - 'dark teal': '#014d4e', - 'dark turquoise': '#045c5a', - 'dark violet': '#34013f', - 'dark yellow': '#d5b60a', - 'dark yellow green': '#728f02', - 'darkblue': '#030764', - 'darkgreen': '#054907', - 'darkish blue': '#014182', - 'darkish green': '#287c37', - 'darkish pink': '#da467d', - 'darkish purple': '#751973', - 'darkish red': '#a90308', - 'deep aqua': '#08787f', - 'deep blue': '#040273', - 'deep brown': '#410200', - 'deep green': '#02590f', - 'deep lavender': '#8d5eb7', - 'deep lilac': '#966ebd', - 'deep magenta': '#a0025c', - 'deep orange': '#dc4d01', - 'deep pink': '#cb0162', - 'deep purple': '#36013f', - 'deep red': '#9a0200', - 'deep rose': '#c74767', - 'deep sea blue': '#015482', - 'deep sky blue': '#0d75f8', - 'deep teal': '#00555a', - 'deep turquoise': '#017374', - 'deep violet': '#490648', - 'denim': '#3b638c', - 'denim blue': '#3b5b92', - 'desert': '#ccad60', - 'diarrhea': '#9f8303', - 'dirt': '#8a6e45', - 'dirt brown': '#836539', - 'dirty blue': '#3f829d', - 'dirty green': '#667e2c', - 'dirty orange': '#c87606', - 'dirty pink': '#ca7b80', - 'dirty purple': '#734a65', - 'dirty yellow': '#cdc50a', - 'dodger blue': '#3e82fc', - 'drab': '#828344', - 'drab green': '#749551', - 'dried blood': '#4b0101', - 'duck egg blue': '#c3fbf4', - 'dull blue': '#49759c', - 'dull 
brown': '#876e4b', - 'dull green': '#74a662', - 'dull orange': '#d8863b', - 'dull pink': '#d5869d', - 'dull purple': '#84597e', - 'dull red': '#bb3f3f', - 'dull teal': '#5f9e8f', - 'dull yellow': '#eedc5b', - 'dusk': '#4e5481', - 'dusk blue': '#26538d', - 'dusky blue': '#475f94', - 'dusky pink': '#cc7a8b', - 'dusky purple': '#895b7b', - 'dusky rose': '#ba6873', - 'dust': '#b2996e', - 'dusty blue': '#5a86ad', - 'dusty green': '#76a973', - 'dusty lavender': '#ac86a8', - 'dusty orange': '#f0833a', - 'dusty pink': '#d58a94', - 'dusty purple': '#825f87', - 'dusty red': '#b9484e', - 'dusty rose': '#c0737a', - 'dusty teal': '#4c9085', - 'earth': '#a2653e', - 'easter green': '#8cfd7e', - 'easter purple': '#c071fe', - 'ecru': '#feffca', - 'egg shell': '#fffcc4', - 'eggplant': '#380835', - 'eggplant purple': '#430541', - 'eggshell': '#ffffd4', - 'eggshell blue': '#c4fff7', - 'electric blue': '#0652ff', - 'electric green': '#21fc0d', - 'electric lime': '#a8ff04', - 'electric pink': '#ff0490', - 'electric purple': '#aa23ff', - 'emerald': '#01a049', - 'emerald green': '#028f1e', - 'evergreen': '#05472a', - 'faded blue': '#658cbb', - 'faded green': '#7bb274', - 'faded orange': '#f0944d', - 'faded pink': '#de9dac', - 'faded purple': '#916e99', - 'faded red': '#d3494e', - 'faded yellow': '#feff7f', - 'fawn': '#cfaf7b', - 'fern': '#63a950', - 'fern green': '#548d44', - 'fire engine red': '#fe0002', - 'flat blue': '#3c73a8', - 'flat green': '#699d4c', - 'fluorescent green': '#08ff08', - 'fluro green': '#0aff02', - 'foam green': '#90fda9', - 'forest': '#0b5509', - 'forest green': '#06470c', - 'forrest green': '#154406', - 'french blue': '#436bad', - 'fresh green': '#69d84f', - 'frog green': '#58bc08', - 'fuchsia': '#ed0dd9', - 'gold': '#dbb40c', - 'golden': '#f5bf03', - 'golden brown': '#b27a01', - 'golden rod': '#f9bc08', - 'golden yellow': '#fec615', - 'goldenrod': '#fac205', - 'grape': '#6c3461', - 'grape purple': '#5d1451', - 'grapefruit': '#fd5956', - 'grass': '#5cac2d', - 'grass green': '#3f9b0b', - 'grassy green': '#419c03', - 'green': '#15b01a', - 'green apple': '#5edc1f', - 'green blue': '#06b48b', - 'green brown': '#544e03', - 'green grey': '#77926f', - 'green teal': '#0cb577', - 'green yellow': '#c9ff27', - 'green/blue': '#01c08d', - 'green/yellow': '#b5ce08', - 'greenblue': '#23c48b', - 'greenish': '#40a368', - 'greenish beige': '#c9d179', - 'greenish blue': '#0b8b87', - 'greenish brown': '#696112', - 'greenish cyan': '#2afeb7', - 'greenish grey': '#96ae8d', - 'greenish tan': '#bccb7a', - 'greenish teal': '#32bf84', - 'greenish turquoise': '#00fbb0', - 'greenish yellow': '#cdfd02', - 'greeny blue': '#42b395', - 'greeny brown': '#696006', - 'greeny grey': '#7ea07a', - 'greeny yellow': '#c6f808', - 'grey': '#929591', - 'grey blue': '#6b8ba4', - 'grey brown': '#7f7053', - 'grey green': '#789b73', - 'grey pink': '#c3909b', - 'grey purple': '#826d8c', - 'grey teal': '#5e9b8a', - 'grey/blue': '#647d8e', - 'grey/green': '#86a17d', - 'greyblue': '#77a1b5', - 'greyish': '#a8a495', - 'greyish blue': '#5e819d', - 'greyish brown': '#7a6a4f', - 'greyish green': '#82a67d', - 'greyish pink': '#c88d94', - 'greyish purple': '#887191', - 'greyish teal': '#719f91', - 'gross green': '#a0bf16', - 'gunmetal': '#536267', - 'hazel': '#8e7618', - 'heather': '#a484ac', - 'heliotrope': '#d94ff5', - 'highlighter green': '#1bfc06', - 'hospital green': '#9be5aa', - 'hot green': '#25ff29', - 'hot magenta': '#f504c9', - 'hot pink': '#ff028d', - 'hot purple': '#cb00f5', - 'hunter green': '#0b4008', - 'ice': '#d6fffa', - 'ice 
blue': '#d7fffe', - 'icky green': '#8fae22', - 'indian red': '#850e04', - 'indigo': '#380282', - 'indigo blue': '#3a18b1', - 'iris': '#6258c4', - 'irish green': '#019529', - 'ivory': '#ffffcb', - 'jade': '#1fa774', - 'jade green': '#2baf6a', - 'jungle green': '#048243', - 'kelley green': '#009337', - 'kelly green': '#02ab2e', - 'kermit green': '#5cb200', - 'key lime': '#aeff6e', - 'khaki': '#aaa662', - 'khaki green': '#728639', - 'kiwi': '#9cef43', - 'kiwi green': '#8ee53f', - 'lavender': '#c79fef', - 'lavender blue': '#8b88f8', - 'lavender pink': '#dd85d7', - 'lawn green': '#4da409', - 'leaf': '#71aa34', - 'leaf green': '#5ca904', - 'leafy green': '#51b73b', - 'leather': '#ac7434', - 'lemon': '#fdff52', - 'lemon green': '#adf802', - 'lemon lime': '#bffe28', - 'lemon yellow': '#fdff38', - 'lichen': '#8fb67b', - 'light aqua': '#8cffdb', - 'light aquamarine': '#7bfdc7', - 'light beige': '#fffeb6', - 'light blue': '#95d0fc', - 'light blue green': '#7efbb3', - 'light blue grey': '#b7c9e2', - 'light bluish green': '#76fda8', - 'light bright green': '#53fe5c', - 'light brown': '#ad8150', - 'light burgundy': '#a8415b', - 'light cyan': '#acfffc', - 'light eggplant': '#894585', - 'light forest green': '#4f9153', - 'light gold': '#fddc5c', - 'light grass green': '#9af764', - 'light green': '#96f97b', - 'light green blue': '#56fca2', - 'light greenish blue': '#63f7b4', - 'light grey': '#d8dcd6', - 'light grey blue': '#9dbcd4', - 'light grey green': '#b7e1a1', - 'light indigo': '#6d5acf', - 'light khaki': '#e6f2a2', - 'light lavendar': '#efc0fe', - 'light lavender': '#dfc5fe', - 'light light blue': '#cafffb', - 'light light green': '#c8ffb0', - 'light lilac': '#edc8ff', - 'light lime': '#aefd6c', - 'light lime green': '#b9ff66', - 'light magenta': '#fa5ff7', - 'light maroon': '#a24857', - 'light mauve': '#c292a1', - 'light mint': '#b6ffbb', - 'light mint green': '#a6fbb2', - 'light moss green': '#a6c875', - 'light mustard': '#f7d560', - 'light navy': '#155084', - 'light navy blue': '#2e5a88', - 'light neon green': '#4efd54', - 'light olive': '#acbf69', - 'light olive green': '#a4be5c', - 'light orange': '#fdaa48', - 'light pastel green': '#b2fba5', - 'light pea green': '#c4fe82', - 'light peach': '#ffd8b1', - 'light periwinkle': '#c1c6fc', - 'light pink': '#ffd1df', - 'light plum': '#9d5783', - 'light purple': '#bf77f6', - 'light red': '#ff474c', - 'light rose': '#ffc5cb', - 'light royal blue': '#3a2efe', - 'light sage': '#bcecac', - 'light salmon': '#fea993', - 'light sea green': '#98f6b0', - 'light seafoam': '#a0febf', - 'light seafoam green': '#a7ffb5', - 'light sky blue': '#c6fcff', - 'light tan': '#fbeeac', - 'light teal': '#90e4c1', - 'light turquoise': '#7ef4cc', - 'light urple': '#b36ff6', - 'light violet': '#d6b4fc', - 'light yellow': '#fffe7a', - 'light yellow green': '#ccfd7f', - 'light yellowish green': '#c2ff89', - 'lightblue': '#7bc8f6', - 'lighter green': '#75fd63', - 'lighter purple': '#a55af4', - 'lightgreen': '#76ff7b', - 'lightish blue': '#3d7afd', - 'lightish green': '#61e160', - 'lightish purple': '#a552e6', - 'lightish red': '#fe2f4a', - 'lilac': '#cea2fd', - 'liliac': '#c48efd', - 'lime': '#aaff32', - 'lime green': '#89fe05', - 'lime yellow': '#d0fe1d', - 'lipstick': '#d5174e', - 'lipstick red': '#c0022f', - 'macaroni and cheese': '#efb435', - 'magenta': '#c20078', - 'mahogany': '#4a0100', - 'maize': '#f4d054', - 'mango': '#ffa62b', - 'manilla': '#fffa86', - 'marigold': '#fcc006', - 'marine': '#042e60', - 'marine blue': '#01386a', - 'maroon': '#650021', - 'mauve': '#ae7181', - 
'medium blue': '#2c6fbb', - 'medium brown': '#7f5112', - 'medium green': '#39ad48', - 'medium grey': '#7d7f7c', - 'medium pink': '#f36196', - 'medium purple': '#9e43a2', - 'melon': '#ff7855', - 'merlot': '#730039', - 'metallic blue': '#4f738e', - 'mid blue': '#276ab3', - 'mid green': '#50a747', - 'midnight': '#03012d', - 'midnight blue': '#020035', - 'midnight purple': '#280137', - 'military green': '#667c3e', - 'milk chocolate': '#7f4e1e', - 'mint': '#9ffeb0', - 'mint green': '#8fff9f', - 'minty green': '#0bf77d', - 'mocha': '#9d7651', - 'moss': '#769958', - 'moss green': '#658b38', - 'mossy green': '#638b27', - 'mud': '#735c12', - 'mud brown': '#60460f', - 'mud green': '#606602', - 'muddy brown': '#886806', - 'muddy green': '#657432', - 'muddy yellow': '#bfac05', - 'mulberry': '#920a4e', - 'murky green': '#6c7a0e', - 'mushroom': '#ba9e88', - 'mustard': '#ceb301', - 'mustard brown': '#ac7e04', - 'mustard green': '#a8b504', - 'mustard yellow': '#d2bd0a', - 'muted blue': '#3b719f', - 'muted green': '#5fa052', - 'muted pink': '#d1768f', - 'muted purple': '#805b87', - 'nasty green': '#70b23f', - 'navy': '#01153e', - 'navy blue': '#001146', - 'navy green': '#35530a', - 'neon blue': '#04d9ff', - 'neon green': '#0cff0c', - 'neon pink': '#fe019a', - 'neon purple': '#bc13fe', - 'neon red': '#ff073a', - 'neon yellow': '#cfff04', - 'nice blue': '#107ab0', - 'night blue': '#040348', - 'ocean': '#017b92', - 'ocean blue': '#03719c', - 'ocean green': '#3d9973', - 'ocher': '#bf9b0c', - 'ochre': '#bf9005', - 'ocre': '#c69c04', - 'off blue': '#5684ae', - 'off green': '#6ba353', - 'off white': '#ffffe4', - 'off yellow': '#f1f33f', - 'old pink': '#c77986', - 'old rose': '#c87f89', - 'olive': '#6e750e', - 'olive brown': '#645403', - 'olive drab': '#6f7632', - 'olive green': '#677a04', - 'olive yellow': '#c2b709', - 'orange': '#f97306', - 'orange brown': '#be6400', - 'orange pink': '#ff6f52', - 'orange red': '#fd411e', - 'orange yellow': '#ffad01', - 'orangeish': '#fd8d49', - 'orangered': '#fe420f', - 'orangey brown': '#b16002', - 'orangey red': '#fa4224', - 'orangey yellow': '#fdb915', - 'orangish': '#fc824a', - 'orangish brown': '#b25f03', - 'orangish red': '#f43605', - 'orchid': '#c875c4', - 'pale': '#fff9d0', - 'pale aqua': '#b8ffeb', - 'pale blue': '#d0fefe', - 'pale brown': '#b1916e', - 'pale cyan': '#b7fffa', - 'pale gold': '#fdde6c', - 'pale green': '#c7fdb5', - 'pale grey': '#fdfdfe', - 'pale lavender': '#eecffe', - 'pale light green': '#b1fc99', - 'pale lilac': '#e4cbff', - 'pale lime': '#befd73', - 'pale lime green': '#b1ff65', - 'pale magenta': '#d767ad', - 'pale mauve': '#fed0fc', - 'pale olive': '#b9cc81', - 'pale olive green': '#b1d27b', - 'pale orange': '#ffa756', - 'pale peach': '#ffe5ad', - 'pale pink': '#ffcfdc', - 'pale purple': '#b790d4', - 'pale red': '#d9544d', - 'pale rose': '#fdc1c5', - 'pale salmon': '#ffb19a', - 'pale sky blue': '#bdf6fe', - 'pale teal': '#82cbb2', - 'pale turquoise': '#a5fbd5', - 'pale violet': '#ceaefa', - 'pale yellow': '#ffff84', - 'parchment': '#fefcaf', - 'pastel blue': '#a2bffe', - 'pastel green': '#b0ff9d', - 'pastel orange': '#ff964f', - 'pastel pink': '#ffbacd', - 'pastel purple': '#caa0ff', - 'pastel red': '#db5856', - 'pastel yellow': '#fffe71', - 'pea': '#a4bf20', - 'pea green': '#8eab12', - 'pea soup': '#929901', - 'pea soup green': '#94a617', - 'peach': '#ffb07c', - 'peachy pink': '#ff9a8a', - 'peacock blue': '#016795', - 'pear': '#cbf85f', - 'periwinkle': '#8e82fe', - 'periwinkle blue': '#8f99fb', - 'perrywinkle': '#8f8ce7', - 'petrol': '#005f6a', - 
'pig pink': '#e78ea5', - 'pine': '#2b5d34', - 'pine green': '#0a481e', - 'pink': '#ff81c0', - 'pink purple': '#db4bda', - 'pink red': '#f5054f', - 'pink/purple': '#ef1de7', - 'pinkish': '#d46a7e', - 'pinkish brown': '#b17261', - 'pinkish grey': '#c8aca9', - 'pinkish orange': '#ff724c', - 'pinkish purple': '#d648d7', - 'pinkish red': '#f10c45', - 'pinkish tan': '#d99b82', - 'pinky': '#fc86aa', - 'pinky purple': '#c94cbe', - 'pinky red': '#fc2647', - 'piss yellow': '#ddd618', - 'pistachio': '#c0fa8b', - 'plum': '#580f41', - 'plum purple': '#4e0550', - 'poison green': '#40fd14', - 'poo': '#8f7303', - 'poo brown': '#885f01', - 'poop': '#7f5e00', - 'poop brown': '#7a5901', - 'poop green': '#6f7c00', - 'powder blue': '#b1d1fc', - 'powder pink': '#ffb2d0', - 'primary blue': '#0804f9', - 'prussian blue': '#004577', - 'puce': '#a57e52', - 'puke': '#a5a502', - 'puke brown': '#947706', - 'puke green': '#9aae07', - 'puke yellow': '#c2be0e', - 'pumpkin': '#e17701', - 'pumpkin orange': '#fb7d07', - 'pure blue': '#0203e2', - 'purple': '#7e1e9c', - 'purple blue': '#632de9', - 'purple brown': '#673a3f', - 'purple grey': '#866f85', - 'purple pink': '#e03fd8', - 'purple red': '#990147', - 'purple/blue': '#5d21d0', - 'purple/pink': '#d725de', - 'purpleish': '#98568d', - 'purpleish blue': '#6140ef', - 'purpleish pink': '#df4ec8', - 'purpley': '#8756e4', - 'purpley blue': '#5f34e7', - 'purpley grey': '#947e94', - 'purpley pink': '#c83cb9', - 'purplish': '#94568c', - 'purplish blue': '#601ef9', - 'purplish brown': '#6b4247', - 'purplish grey': '#7a687f', - 'purplish pink': '#ce5dae', - 'purplish red': '#b0054b', - 'purply': '#983fb2', - 'purply blue': '#661aee', - 'purply pink': '#f075e6', - 'putty': '#beae8a', - 'racing green': '#014600', - 'radioactive green': '#2cfa1f', - 'raspberry': '#b00149', - 'raw sienna': '#9a6200', - 'raw umber': '#a75e09', - 'really light blue': '#d4ffff', - 'red': '#e50000', - 'red brown': '#8b2e16', - 'red orange': '#fd3c06', - 'red pink': '#fa2a55', - 'red purple': '#820747', - 'red violet': '#9e0168', - 'red wine': '#8c0034', - 'reddish': '#c44240', - 'reddish brown': '#7f2b0a', - 'reddish grey': '#997570', - 'reddish orange': '#f8481c', - 'reddish pink': '#fe2c54', - 'reddish purple': '#910951', - 'reddy brown': '#6e1005', - 'rich blue': '#021bf9', - 'rich purple': '#720058', - 'robin egg blue': '#8af1fe', - "robin's egg": '#6dedfd', - "robin's egg blue": '#98eff9', - 'rosa': '#fe86a4', - 'rose': '#cf6275', - 'rose pink': '#f7879a', - 'rose red': '#be013c', - 'rosy pink': '#f6688e', - 'rouge': '#ab1239', - 'royal': '#0c1793', - 'royal blue': '#0504aa', - 'royal purple': '#4b006e', - 'ruby': '#ca0147', - 'russet': '#a13905', - 'rust': '#a83c09', - 'rust brown': '#8b3103', - 'rust orange': '#c45508', - 'rust red': '#aa2704', - 'rusty orange': '#cd5909', - 'rusty red': '#af2f0d', - 'saffron': '#feb209', - 'sage': '#87ae73', - 'sage green': '#88b378', - 'salmon': '#ff796c', - 'salmon pink': '#fe7b7c', - 'sand': '#e2ca76', - 'sand brown': '#cba560', - 'sand yellow': '#fce166', - 'sandstone': '#c9ae74', - 'sandy': '#f1da7a', - 'sandy brown': '#c4a661', - 'sandy yellow': '#fdee73', - 'sap green': '#5c8b15', - 'sapphire': '#2138ab', - 'scarlet': '#be0119', - 'sea': '#3c9992', - 'sea blue': '#047495', - 'sea green': '#53fca1', - 'seafoam': '#80f9ad', - 'seafoam blue': '#78d1b6', - 'seafoam green': '#7af9ab', - 'seaweed': '#18d17b', - 'seaweed green': '#35ad6b', - 'sepia': '#985e2b', - 'shamrock': '#01b44c', - 'shamrock green': '#02c14d', - 'shit': '#7f5f00', - 'shit brown': '#7b5804', - 
'shit green': '#758000', - 'shocking pink': '#fe02a2', - 'sick green': '#9db92c', - 'sickly green': '#94b21c', - 'sickly yellow': '#d0e429', - 'sienna': '#a9561e', - 'silver': '#c5c9c7', - 'sky': '#82cafc', - 'sky blue': '#75bbfd', - 'slate': '#516572', - 'slate blue': '#5b7c99', - 'slate green': '#658d6d', - 'slate grey': '#59656d', - 'slime green': '#99cc04', - 'snot': '#acbb0d', - 'snot green': '#9dc100', - 'soft blue': '#6488ea', - 'soft green': '#6fc276', - 'soft pink': '#fdb0c0', - 'soft purple': '#a66fb5', - 'spearmint': '#1ef876', - 'spring green': '#a9f971', - 'spruce': '#0a5f38', - 'squash': '#f2ab15', - 'steel': '#738595', - 'steel blue': '#5a7d9a', - 'steel grey': '#6f828a', - 'stone': '#ada587', - 'stormy blue': '#507b9c', - 'straw': '#fcf679', - 'strawberry': '#fb2943', - 'strong blue': '#0c06f7', - 'strong pink': '#ff0789', - 'sun yellow': '#ffdf22', - 'sunflower': '#ffc512', - 'sunflower yellow': '#ffda03', - 'sunny yellow': '#fff917', - 'sunshine yellow': '#fffd37', - 'swamp': '#698339', - 'swamp green': '#748500', - 'tan': '#d1b26f', - 'tan brown': '#ab7e4c', - 'tan green': '#a9be70', - 'tangerine': '#ff9408', - 'taupe': '#b9a281', - 'tea': '#65ab7c', - 'tea green': '#bdf8a3', - 'teal': '#029386', - 'teal blue': '#01889f', - 'teal green': '#25a36f', - 'tealish': '#24bca8', - 'tealish green': '#0cdc73', - 'terra cotta': '#c9643b', - 'terracota': '#cb6843', - 'terracotta': '#ca6641', - 'tiffany blue': '#7bf2da', - 'tomato': '#ef4026', - 'tomato red': '#ec2d01', - 'topaz': '#13bbaf', - 'toupe': '#c7ac7d', - 'toxic green': '#61de2a', - 'tree green': '#2a7e19', - 'true blue': '#010fcc', - 'true green': '#089404', - 'turquoise': '#06c2ac', - 'turquoise blue': '#06b1c4', - 'turquoise green': '#04f489', - 'turtle green': '#75b84f', - 'twilight': '#4e518b', - 'twilight blue': '#0a437a', - 'ugly blue': '#31668a', - 'ugly brown': '#7d7103', - 'ugly green': '#7a9703', - 'ugly pink': '#cd7584', - 'ugly purple': '#a442a0', - 'ugly yellow': '#d0c101', - 'ultramarine': '#2000b1', - 'ultramarine blue': '#1805db', - 'umber': '#b26400', - 'velvet': '#750851', - 'vermillion': '#f4320c', - 'very dark blue': '#000133', - 'very dark brown': '#1d0200', - 'very dark green': '#062e03', - 'very dark purple': '#2a0134', - 'very light blue': '#d5ffff', - 'very light brown': '#d3b683', - 'very light green': '#d1ffbd', - 'very light pink': '#fff4f2', - 'very light purple': '#f6cefc', - 'very pale blue': '#d6fffe', - 'very pale green': '#cffdbc', - 'vibrant blue': '#0339f8', - 'vibrant green': '#0add08', - 'vibrant purple': '#ad03de', - 'violet': '#9a0eea', - 'violet blue': '#510ac9', - 'violet pink': '#fb5ffc', - 'violet red': '#a50055', - 'viridian': '#1e9167', - 'vivid blue': '#152eff', - 'vivid green': '#2fef10', - 'vivid purple': '#9900fa', - 'vomit': '#a2a415', - 'vomit green': '#89a203', - 'vomit yellow': '#c7c10c', - 'warm blue': '#4b57db', - 'warm brown': '#964e02', - 'warm grey': '#978a84', - 'warm pink': '#fb5581', - 'warm purple': '#952e8f', - 'washed out green': '#bcf5a6', - 'water blue': '#0e87cc', - 'watermelon': '#fd4659', - 'weird green': '#3ae57f', - 'wheat': '#fbdd7e', - 'white': '#ffffff', - 'windows blue': '#3778bf', - 'wine': '#80013f', - 'wine red': '#7b0323', - 'wintergreen': '#20f986', - 'wisteria': '#a87dc2', - 'yellow': '#ffff14', - 'yellow brown': '#b79400', - 'yellow green': '#c0fb2d', - 'yellow ochre': '#cb9d06', - 'yellow orange': '#fcb001', - 'yellow tan': '#ffe36e', - 'yellow/green': '#c8fd3d', - 'yellowgreen': '#bbf90f', - 'yellowish': '#faee66', - 'yellowish brown': 
'#9b7a01', - 'yellowish green': '#b0dd16', - 'yellowish orange': '#ffab0f', - 'yellowish tan': '#fcfc81', - 'yellowy brown': '#ae8b0c', - 'yellowy green': '#bff128'} diff --git a/seaborn/distributions.py b/seaborn/distributions.py deleted file mode 100644 index f8ec166cf4b346417e1cb7e43f591e9dd0cc970f..0000000000000000000000000000000000000000 --- a/seaborn/distributions.py +++ /dev/null @@ -1,2531 +0,0 @@ -"""Plotting functions for visualizing distributions.""" -from numbers import Number -from functools import partial -import math -import textwrap -import warnings - -import numpy as np -import pandas as pd -import matplotlib as mpl -import matplotlib.pyplot as plt -import matplotlib.transforms as tx -from matplotlib.cbook import normalize_kwargs -from matplotlib.colors import to_rgba -from matplotlib.collections import LineCollection - -from ._base import VectorPlotter - -# We have moved univariate histogram computation over to the new Hist class, -# but still use the older Histogram for bivariate computation. -from ._statistics import ECDF, Histogram, KDE -from ._stats.counting import Hist - -from .axisgrid import ( - FacetGrid, - _facet_docs, -) -from .utils import ( - remove_na, - _get_transform_functions, - _kde_support, - _check_argument, - _assign_default_kwargs, - _default_color, -) -from .palettes import color_palette -from .external import husl -from .external.kde import gaussian_kde -from ._docstrings import ( - DocstringComponents, - _core_docs, -) - - -__all__ = ["displot", "histplot", "kdeplot", "ecdfplot", "rugplot", "distplot"] - -# ==================================================================================== # -# Module documentation -# ==================================================================================== # - -_dist_params = dict( - - multiple=""" -multiple : {{"layer", "stack", "fill"}} - Method for drawing multiple elements when semantic mapping creates subsets. - Only relevant with univariate data. - """, - log_scale=""" -log_scale : bool or number, or pair of bools or numbers - Set axis scale(s) to log. A single value sets the data axis for any numeric - axes in the plot. A pair of values sets each axis independently. - Numeric values are interpreted as the desired base (default 10). - When `None` or `False`, seaborn defers to the existing Axes scale. - """, - legend=""" -legend : bool - If False, suppress the legend for semantic variables. - """, - cbar=""" -cbar : bool - If True, add a colorbar to annotate the color mapping in a bivariate plot. - Note: Does not currently support plots with a ``hue`` variable well. - """, - cbar_ax=""" -cbar_ax : :class:`matplotlib.axes.Axes` - Pre-existing axes for the colorbar. - """, - cbar_kws=""" -cbar_kws : dict - Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`. 
- """, -) - -_param_docs = DocstringComponents.from_nested_components( - core=_core_docs["params"], - facets=DocstringComponents(_facet_docs), - dist=DocstringComponents(_dist_params), - kde=DocstringComponents.from_function_params(KDE.__init__), - hist=DocstringComponents.from_function_params(Histogram.__init__), - ecdf=DocstringComponents.from_function_params(ECDF.__init__), -) - - -# ==================================================================================== # -# Internal API -# ==================================================================================== # - - -class _DistributionPlotter(VectorPlotter): - - wide_structure = {"x": "@values", "hue": "@columns"} - flat_structure = {"x": "@values"} - - def __init__( - self, - data=None, - variables={}, - ): - - super().__init__(data=data, variables=variables) - - @property - def univariate(self): - """Return True if only x or y are used.""" - # TODO this could go down to core, but putting it here now. - # We'd want to be conceptually clear that univariate only applies - # to x/y and not to other semantics, which can exist. - # We haven't settled on a good conceptual name for x/y. - return bool({"x", "y"} - set(self.variables)) - - @property - def data_variable(self): - """Return the variable with data for univariate plots.""" - # TODO This could also be in core, but it should have a better name. - if not self.univariate: - raise AttributeError("This is not a univariate plot") - return {"x", "y"}.intersection(self.variables).pop() - - @property - def has_xy_data(self): - """Return True at least one of x or y is defined.""" - # TODO see above points about where this should go - return bool({"x", "y"} & set(self.variables)) - - def _add_legend( - self, - ax_obj, artist, fill, element, multiple, alpha, artist_kws, legend_kws, - ): - """Add artists that reflect semantic mappings and put then in a legend.""" - # TODO note that this doesn't handle numeric mappings like the relational plots - handles = [] - labels = [] - for level in self._hue_map.levels: - color = self._hue_map(level) - - kws = self._artist_kws( - artist_kws, fill, element, multiple, color, alpha - ) - - # color gets added to the kws to workaround an issue with barplot's color - # cycle integration but it causes problems in this context where we are - # setting artist properties directly, so pop it off here - if "facecolor" in kws: - kws.pop("color", None) - - handles.append(artist(**kws)) - labels.append(level) - - if isinstance(ax_obj, mpl.axes.Axes): - ax_obj.legend(handles, labels, title=self.variables["hue"], **legend_kws) - else: # i.e. a FacetGrid. 
TODO make this better - legend_data = dict(zip(labels, handles)) - ax_obj.add_legend( - legend_data, - title=self.variables["hue"], - label_order=self.var_levels["hue"], - **legend_kws - ) - - def _artist_kws(self, kws, fill, element, multiple, color, alpha): - """Handle differences between artists in filled/unfilled plots.""" - kws = kws.copy() - if fill: - kws = normalize_kwargs(kws, mpl.collections.PolyCollection) - kws.setdefault("facecolor", to_rgba(color, alpha)) - - if element == "bars": - # Make bar() interface with property cycle correctly - # https://github.com/matplotlib/matplotlib/issues/19385 - kws["color"] = "none" - - if multiple in ["stack", "fill"] or element == "bars": - kws.setdefault("edgecolor", mpl.rcParams["patch.edgecolor"]) - else: - kws.setdefault("edgecolor", to_rgba(color, 1)) - elif element == "bars": - kws["facecolor"] = "none" - kws["edgecolor"] = to_rgba(color, alpha) - else: - kws["color"] = to_rgba(color, alpha) - return kws - - def _quantile_to_level(self, data, quantile): - """Return data levels corresponding to quantile cuts of mass.""" - isoprop = np.asarray(quantile) - values = np.ravel(data) - sorted_values = np.sort(values)[::-1] - normalized_values = np.cumsum(sorted_values) / values.sum() - idx = np.searchsorted(normalized_values, 1 - isoprop) - levels = np.take(sorted_values, idx, mode="clip") - return levels - - def _cmap_from_color(self, color): - """Return a sequential colormap given a color seed.""" - # Like so much else here, this is broadly useful, but keeping it - # in this class to signify that I haven't thought overly hard about it... - r, g, b, _ = to_rgba(color) - h, s, _ = husl.rgb_to_husl(r, g, b) - xx = np.linspace(-1, 1, int(1.15 * 256))[:256] - ramp = np.zeros((256, 3)) - ramp[:, 0] = h - ramp[:, 1] = s * np.cos(xx) - ramp[:, 2] = np.linspace(35, 80, 256) - colors = np.clip([husl.husl_to_rgb(*hsl) for hsl in ramp], 0, 1) - return mpl.colors.ListedColormap(colors[::-1]) - - def _default_discrete(self): - """Find default values for discrete hist estimation based on variable type.""" - if self.univariate: - discrete = self.var_types[self.data_variable] == "categorical" - else: - discrete_x = self.var_types["x"] == "categorical" - discrete_y = self.var_types["y"] == "categorical" - discrete = discrete_x, discrete_y - return discrete - - def _resolve_multiple(self, curves, multiple): - """Modify the density data structure to handle multiple densities.""" - - # Default baselines have all densities starting at 0 - baselines = {k: np.zeros_like(v) for k, v in curves.items()} - - # TODO we should have some central clearinghouse for checking if any - # "grouping" (terminology?) 
semantics have been assigned - if "hue" not in self.variables: - return curves, baselines - - if multiple in ("stack", "fill"): - - # Setting stack or fill means that the curves share a - # support grid / set of bin edges, so we can make a dataframe - # Reverse the column order to plot from top to bottom - curves = pd.DataFrame(curves).iloc[:, ::-1] - - # Find column groups that are nested within col/row variables - column_groups = {} - for i, keyd in enumerate(map(dict, curves.columns)): - facet_key = keyd.get("col", None), keyd.get("row", None) - column_groups.setdefault(facet_key, []) - column_groups[facet_key].append(i) - - baselines = curves.copy() - - for col_idxs in column_groups.values(): - cols = curves.columns[col_idxs] - - norm_constant = curves[cols].sum(axis="columns") - - # Take the cumulative sum to stack - curves[cols] = curves[cols].cumsum(axis="columns") - - # Normalize by row sum to fill - if multiple == "fill": - curves[cols] = curves[cols].div(norm_constant, axis="index") - - # Define where each segment starts - baselines[cols] = curves[cols].shift(1, axis=1).fillna(0) - - if multiple == "dodge": - - # Account for the unique semantic (non-faceting) levels - # This will require rethinking if we add other semantics! - hue_levels = self.var_levels["hue"] - n = len(hue_levels) - f_fwd, f_inv = self._get_scale_transforms(self.data_variable) - for key in curves: - - level = dict(key)["hue"] - hist = curves[key].reset_index(name="heights") - level_idx = hue_levels.index(level) - - a = f_fwd(hist["edges"]) - b = f_fwd(hist["edges"] + hist["widths"]) - w = (b - a) / n - new_min = f_inv(a + level_idx * w) - new_max = f_inv(a + (level_idx + 1) * w) - hist["widths"] = new_max - new_min - hist["edges"] = new_min - - curves[key] = hist.set_index(["edges", "widths"])["heights"] - - return curves, baselines - - # -------------------------------------------------------------------------------- # - # Computation - # -------------------------------------------------------------------------------- # - - def _compute_univariate_density( - self, - data_variable, - common_norm, - common_grid, - estimate_kws, - warn_singular=True, - ): - - # Initialize the estimator object - estimator = KDE(**estimate_kws) - - if set(self.variables) - {"x", "y"}: - if common_grid: - all_observations = self.comp_data.dropna() - estimator.define_support(all_observations[data_variable]) - else: - common_norm = False - - all_data = self.plot_data.dropna() - if common_norm and "weights" in all_data: - whole_weight = all_data["weights"].sum() - else: - whole_weight = len(all_data) - - densities = {} - - for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True): - - # Extract the data points from this subset and remove nulls - observations = sub_data[data_variable] - - # Extract the weights for this subset of observations - if "weights" in self.variables: - weights = sub_data["weights"] - part_weight = weights.sum() - else: - weights = None - part_weight = len(sub_data) - - # Estimate the density of observations at this level - variance = np.nan_to_num(observations.var()) - singular = len(observations) < 2 or math.isclose(variance, 0) - try: - if not singular: - # Convoluted approach needed because numerical failures - # can manifest in a few different ways. - density, support = estimator(observations, weights=weights) - except np.linalg.LinAlgError: - singular = True - - if singular: - msg = ( - "Dataset has 0 variance; skipping density estimate. 
" - "Pass `warn_singular=False` to disable this warning." - ) - if warn_singular: - warnings.warn(msg, UserWarning, stacklevel=4) - continue - - # Invert the scaling of the support points - _, f_inv = self._get_scale_transforms(self.data_variable) - support = f_inv(support) - - # Apply a scaling factor so that the integral over all subsets is 1 - if common_norm: - density *= part_weight / whole_weight - - # Store the density for this level - key = tuple(sub_vars.items()) - densities[key] = pd.Series(density, index=support) - - return densities - - # -------------------------------------------------------------------------------- # - # Plotting - # -------------------------------------------------------------------------------- # - - def plot_univariate_histogram( - self, - multiple, - element, - fill, - common_norm, - common_bins, - shrink, - kde, - kde_kws, - color, - legend, - line_kws, - estimate_kws, - **plot_kws, - ): - - # -- Default keyword dicts - kde_kws = {} if kde_kws is None else kde_kws.copy() - line_kws = {} if line_kws is None else line_kws.copy() - estimate_kws = {} if estimate_kws is None else estimate_kws.copy() - - # -- Input checking - _check_argument("multiple", ["layer", "stack", "fill", "dodge"], multiple) - _check_argument("element", ["bars", "step", "poly"], element) - - auto_bins_with_weights = ( - "weights" in self.variables - and estimate_kws["bins"] == "auto" - and estimate_kws["binwidth"] is None - and not estimate_kws["discrete"] - ) - if auto_bins_with_weights: - msg = ( - "`bins` cannot be 'auto' when using weights. " - "Setting `bins=10`, but you will likely want to adjust." - ) - warnings.warn(msg, UserWarning) - estimate_kws["bins"] = 10 - - # Simplify downstream code if we are not normalizing - if estimate_kws["stat"] == "count": - common_norm = False - - orient = self.data_variable - - # Now initialize the Histogram estimator - estimator = Hist(**estimate_kws) - histograms = {} - - # Do pre-compute housekeeping related to multiple groups - all_data = self.comp_data.dropna() - all_weights = all_data.get("weights", None) - - multiple_histograms = set(self.variables) - {"x", "y"} - if multiple_histograms: - if common_bins: - bin_kws = estimator._define_bin_params(all_data, orient, None) - else: - common_norm = False - - if common_norm and all_weights is not None: - whole_weight = all_weights.sum() - else: - whole_weight = len(all_data) - - # Estimate the smoothed kernel densities, for use later - if kde: - # TODO alternatively, clip at min/max bins? 
- kde_kws.setdefault("cut", 0) - kde_kws["cumulative"] = estimate_kws["cumulative"] - densities = self._compute_univariate_density( - self.data_variable, - common_norm, - common_bins, - kde_kws, - warn_singular=False, - ) - - # First pass through the data to compute the histograms - for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True): - - # Prepare the relevant data - key = tuple(sub_vars.items()) - orient = self.data_variable - - if "weights" in self.variables: - sub_data["weight"] = sub_data.pop("weights") - part_weight = sub_data["weight"].sum() - else: - part_weight = len(sub_data) - - # Do the histogram computation - if not (multiple_histograms and common_bins): - bin_kws = estimator._define_bin_params(sub_data, orient, None) - res = estimator._normalize(estimator._eval(sub_data, orient, bin_kws)) - heights = res[estimator.stat].to_numpy() - widths = res["space"].to_numpy() - edges = res[orient].to_numpy() - widths / 2 - - # Rescale the smoothed curve to match the histogram - if kde and key in densities: - density = densities[key] - if estimator.cumulative: - hist_norm = heights.max() - else: - hist_norm = (heights * widths).sum() - densities[key] *= hist_norm - - # Convert edges back to original units for plotting - ax = self._get_axes(sub_vars) - _, inv = _get_transform_functions(ax, self.data_variable) - widths = inv(edges + widths) - inv(edges) - edges = inv(edges) - - # Pack the histogram data and metadata together - edges = edges + (1 - shrink) / 2 * widths - widths *= shrink - index = pd.MultiIndex.from_arrays([ - pd.Index(edges, name="edges"), - pd.Index(widths, name="widths"), - ]) - hist = pd.Series(heights, index=index, name="heights") - - # Apply scaling to normalize across groups - if common_norm: - hist *= part_weight / whole_weight - - # Store the finalized histogram data for future plotting - histograms[key] = hist - - # Modify the histogram and density data to resolve multiple groups - histograms, baselines = self._resolve_multiple(histograms, multiple) - if kde: - densities, _ = self._resolve_multiple( - densities, None if multiple == "dodge" else multiple - ) - - # Set autoscaling-related meta - sticky_stat = (0, 1) if multiple == "fill" else (0, np.inf) - if multiple == "fill": - # Filled plots should not have any margins - bin_vals = histograms.index.to_frame() - edges = bin_vals["edges"] - widths = bin_vals["widths"] - sticky_data = ( - edges.min(), - edges.max() + widths.loc[edges.idxmax()] - ) - else: - sticky_data = [] - - # --- Handle default visual attributes - - # Note: default linewidth is determined after plotting - - # Default alpha should depend on other parameters - if fill: - # Note: will need to account for other grouping semantics if added - if "hue" in self.variables and multiple == "layer": - default_alpha = .5 if element == "bars" else .25 - elif kde: - default_alpha = .5 - else: - default_alpha = .75 - else: - default_alpha = 1 - alpha = plot_kws.pop("alpha", default_alpha) # TODO make parameter? 
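The `_resolve_multiple` call above implements stacking as a cumulative sum across hue levels, with `multiple="fill"` additionally normalizing by the per-bin total. A standalone sketch of that arithmetic (editor's illustration in plain pandas with made-up curve values; not seaborn API):

import pandas as pd

# Two per-level curves that share one support grid / set of bin edges.
curves = pd.DataFrame({"a": [1.0, 2.0, 1.0], "b": [2.0, 2.0, 4.0]})

# multiple="stack": cumulative sum across levels.
stacked = curves.cumsum(axis="columns")

# multiple="fill": also divide by the per-point total so the bands
# partition the [0, 1] interval.
filled = stacked.div(curves.sum(axis="columns"), axis="index")

# Each band is drawn from the previous level's top, i.e. its baseline.
baselines = stacked.shift(1, axis="columns").fillna(0)
print(filled)     # "b" column is all 1.0: the topmost band
print(baselines)  # "a" starts at 0; "b" starts at the top of "a"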
- - hist_artists = [] - - # Go back through the dataset and draw the plots - for sub_vars, _ in self.iter_data("hue", reverse=True): - - key = tuple(sub_vars.items()) - hist = histograms[key].rename("heights").reset_index() - bottom = np.asarray(baselines[key]) - - ax = self._get_axes(sub_vars) - - # Define the matplotlib attributes that depend on semantic mapping - if "hue" in self.variables: - sub_color = self._hue_map(sub_vars["hue"]) - else: - sub_color = color - - artist_kws = self._artist_kws( - plot_kws, fill, element, multiple, sub_color, alpha - ) - - if element == "bars": - - # Use matplotlib bar plotting - - plot_func = ax.bar if self.data_variable == "x" else ax.barh - artists = plot_func( - hist["edges"], - hist["heights"] - bottom, - hist["widths"], - bottom, - align="edge", - **artist_kws, - ) - - for bar in artists: - if self.data_variable == "x": - bar.sticky_edges.x[:] = sticky_data - bar.sticky_edges.y[:] = sticky_stat - else: - bar.sticky_edges.x[:] = sticky_stat - bar.sticky_edges.y[:] = sticky_data - - hist_artists.extend(artists) - - else: - - # Use either fill_between or plot to draw hull of histogram - if element == "step": - - final = hist.iloc[-1] - x = np.append(hist["edges"], final["edges"] + final["widths"]) - y = np.append(hist["heights"], final["heights"]) - b = np.append(bottom, bottom[-1]) - - if self.data_variable == "x": - step = "post" - drawstyle = "steps-post" - else: - step = "post" # fillbetweenx handles mapping internally - drawstyle = "steps-pre" - - elif element == "poly": - - x = hist["edges"] + hist["widths"] / 2 - y = hist["heights"] - b = bottom - - step = None - drawstyle = None - - if self.data_variable == "x": - if fill: - artist = ax.fill_between(x, b, y, step=step, **artist_kws) - else: - artist, = ax.plot(x, y, drawstyle=drawstyle, **artist_kws) - artist.sticky_edges.x[:] = sticky_data - artist.sticky_edges.y[:] = sticky_stat - else: - if fill: - artist = ax.fill_betweenx(x, b, y, step=step, **artist_kws) - else: - artist, = ax.plot(y, x, drawstyle=drawstyle, **artist_kws) - artist.sticky_edges.x[:] = sticky_stat - artist.sticky_edges.y[:] = sticky_data - - hist_artists.append(artist) - - if kde: - - # Add in the density curves - - try: - density = densities[key] - except KeyError: - continue - support = density.index - - if "x" in self.variables: - line_args = support, density - sticky_x, sticky_y = None, (0, np.inf) - else: - line_args = density, support - sticky_x, sticky_y = (0, np.inf), None - - line_kws["color"] = to_rgba(sub_color, 1) - line, = ax.plot( - *line_args, **line_kws, - ) - - if sticky_x is not None: - line.sticky_edges.x[:] = sticky_x - if sticky_y is not None: - line.sticky_edges.y[:] = sticky_y - - if element == "bars" and "linewidth" not in plot_kws: - - # Now we handle linewidth, which depends on the scaling of the plot - - # We will base everything on the minimum bin width - hist_metadata = pd.concat([ - # Use .items for generality over dict or df - h.index.to_frame() for _, h in histograms.items() - ]).reset_index(drop=True) - thin_bar_idx = hist_metadata["widths"].idxmin() - binwidth = hist_metadata.loc[thin_bar_idx, "widths"] - left_edge = hist_metadata.loc[thin_bar_idx, "edges"] - - # Set initial value - default_linewidth = math.inf - - # Loop through subsets based only on facet variables - for sub_vars, _ in self.iter_data(): - - ax = self._get_axes(sub_vars) - - # Needed in some cases to get valid transforms. - # Innocuous in other cases? 
- ax.autoscale_view() - - # Convert binwidth from data coordinates to pixels - pts_x, pts_y = 72 / ax.figure.dpi * abs( - ax.transData.transform([left_edge + binwidth] * 2) - - ax.transData.transform([left_edge] * 2) - ) - if self.data_variable == "x": - binwidth_points = pts_x - else: - binwidth_points = pts_y - - # The relative size of the lines depends on the appearance - # This is a provisional value and may need more tweaking - default_linewidth = min(.1 * binwidth_points, default_linewidth) - - # Set the attributes - for bar in hist_artists: - - # Don't let the lines get too thick - max_linewidth = bar.get_linewidth() - if not fill: - max_linewidth *= 1.5 - - linewidth = min(default_linewidth, max_linewidth) - - # If not filling, don't let lines disappear - if not fill: - min_linewidth = .5 - linewidth = max(linewidth, min_linewidth) - - bar.set_linewidth(linewidth) - - # --- Finalize the plot ---- - - # Axis labels - ax = self.ax if self.ax is not None else self.facets.axes.flat[0] - default_x = default_y = "" - if self.data_variable == "x": - default_y = estimator.stat.capitalize() - if self.data_variable == "y": - default_x = estimator.stat.capitalize() - self._add_axis_labels(ax, default_x, default_y) - - # Legend for semantic variables - if "hue" in self.variables and legend: - - if fill or element == "bars": - artist = partial(mpl.patches.Patch) - else: - artist = partial(mpl.lines.Line2D, [], []) - - ax_obj = self.ax if self.ax is not None else self.facets - self._add_legend( - ax_obj, artist, fill, element, multiple, alpha, plot_kws, {}, - ) - - def plot_bivariate_histogram( - self, - common_bins, common_norm, - thresh, pthresh, pmax, - color, legend, - cbar, cbar_ax, cbar_kws, - estimate_kws, - **plot_kws, - ): - - # Default keyword dicts - cbar_kws = {} if cbar_kws is None else cbar_kws.copy() - - # Now initialize the Histogram estimator - estimator = Histogram(**estimate_kws) - - # Do pre-compute housekeeping related to multiple groups - if set(self.variables) - {"x", "y"}: - all_data = self.comp_data.dropna() - if common_bins: - estimator.define_bin_params( - all_data["x"], - all_data["y"], - all_data.get("weights", None), - ) - else: - common_norm = False - - # -- Determine colormap threshold and norm based on the full data - - full_heights = [] - for _, sub_data in self.iter_data(from_comp_data=True): - sub_heights, _ = estimator( - sub_data["x"], sub_data["y"], sub_data.get("weights", None) - ) - full_heights.append(sub_heights) - - common_color_norm = not set(self.variables) - {"x", "y"} or common_norm - - if pthresh is not None and common_color_norm: - thresh = self._quantile_to_level(full_heights, pthresh) - - plot_kws.setdefault("vmin", 0) - if common_color_norm: - if pmax is not None: - vmax = self._quantile_to_level(full_heights, pmax) - else: - vmax = plot_kws.pop("vmax", max(map(np.max, full_heights))) - else: - vmax = None - - # Get a default color - # (We won't follow the color cycle here, as multiple plots are unlikely) - if color is None: - color = "C0" - - # --- Loop over data (subsets) and draw the histograms - for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True): - - if sub_data.empty: - continue - - # Do the histogram computation - heights, (x_edges, y_edges) = estimator( - sub_data["x"], - sub_data["y"], - weights=sub_data.get("weights", None), - ) - - # Get the axes for this plot - ax = self._get_axes(sub_vars) - - # Invert the scale for the edges - _, inv_x = _get_transform_functions(ax, "x") - _, inv_y = _get_transform_functions(ax, 
"y") - x_edges = inv_x(x_edges) - y_edges = inv_y(y_edges) - - # Apply scaling to normalize across groups - if estimator.stat != "count" and common_norm: - heights *= len(sub_data) / len(all_data) - - # Define the specific kwargs for this artist - artist_kws = plot_kws.copy() - if "hue" in self.variables: - color = self._hue_map(sub_vars["hue"]) - cmap = self._cmap_from_color(color) - artist_kws["cmap"] = cmap - else: - cmap = artist_kws.pop("cmap", None) - if isinstance(cmap, str): - cmap = color_palette(cmap, as_cmap=True) - elif cmap is None: - cmap = self._cmap_from_color(color) - artist_kws["cmap"] = cmap - - # Set the upper norm on the colormap - if not common_color_norm and pmax is not None: - vmax = self._quantile_to_level(heights, pmax) - if vmax is not None: - artist_kws["vmax"] = vmax - - # Make cells at or below the threshold transparent - if not common_color_norm and pthresh: - thresh = self._quantile_to_level(heights, pthresh) - if thresh is not None: - heights = np.ma.masked_less_equal(heights, thresh) - - # pcolormesh is going to turn the grid off, but we want to keep it - # I'm not sure if there's a better way to get the grid state - x_grid = any([l.get_visible() for l in ax.xaxis.get_gridlines()]) - y_grid = any([l.get_visible() for l in ax.yaxis.get_gridlines()]) - - mesh = ax.pcolormesh( - x_edges, - y_edges, - heights.T, - **artist_kws, - ) - - # pcolormesh sets sticky edges, but we only want them if not thresholding - if thresh is not None: - mesh.sticky_edges.x[:] = [] - mesh.sticky_edges.y[:] = [] - - # Add an optional colorbar - # Note, we want to improve this. When hue is used, it will stack - # multiple colorbars with redundant ticks in an ugly way. - # But it's going to take some work to have multiple colorbars that - # share ticks nicely. 
- if cbar: - ax.figure.colorbar(mesh, cbar_ax, ax, **cbar_kws) - - # Reset the grid state - if x_grid: - ax.grid(True, axis="x") - if y_grid: - ax.grid(True, axis="y") - - # --- Finalize the plot - - ax = self.ax if self.ax is not None else self.facets.axes.flat[0] - self._add_axis_labels(ax) - - if "hue" in self.variables and legend: - - # TODO if possible, I would like to move the contour - # intensity information into the legend too and label the - # iso proportions rather than the raw density values - - artist_kws = {} - artist = partial(mpl.patches.Patch) - ax_obj = self.ax if self.ax is not None else self.facets - self._add_legend( - ax_obj, artist, True, False, "layer", 1, artist_kws, {}, - ) - - def plot_univariate_density( - self, - multiple, - common_norm, - common_grid, - warn_singular, - fill, - color, - legend, - estimate_kws, - **plot_kws, - ): - - # Handle conditional defaults - if fill is None: - fill = multiple in ("stack", "fill") - - # Preprocess the matplotlib keyword dictionaries - if fill: - artist = mpl.collections.PolyCollection - else: - artist = mpl.lines.Line2D - plot_kws = normalize_kwargs(plot_kws, artist) - - # Input checking - _check_argument("multiple", ["layer", "stack", "fill"], multiple) - - # Always share the evaluation grid when stacking - subsets = bool(set(self.variables) - {"x", "y"}) - if subsets and multiple in ("stack", "fill"): - common_grid = True - - # Do the computation - densities = self._compute_univariate_density( - self.data_variable, - common_norm, - common_grid, - estimate_kws, - warn_singular, - ) - - # Adjust densities based on the `multiple` rule - densities, baselines = self._resolve_multiple(densities, multiple) - - # Control the interaction with autoscaling by defining sticky_edges - # i.e. we don't want autoscale margins below the density curve - sticky_density = (0, 1) if multiple == "fill" else (0, np.inf) - - if multiple == "fill": - # Filled plots should not have any margins - sticky_support = densities.index.min(), densities.index.max() - else: - sticky_support = [] - - if fill: - if multiple == "layer": - default_alpha = .25 - else: - default_alpha = .75 - else: - default_alpha = 1 - alpha = plot_kws.pop("alpha", default_alpha) # TODO make parameter? 
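When `common_norm=True`, `_compute_univariate_density` above multiplies each level's curve by `part_weight / whole_weight`, so each subset's density integrates to its share of the data and the subsets jointly integrate to 1. A rough standalone sketch of that scaling (editor's illustration; `scipy.stats.gaussian_kde` stands in for seaborn's internal KDE wrapper):

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
groups = {"a": rng.normal(0, 1, 300), "b": rng.normal(3, 1, 100)}
whole_weight = sum(len(obs) for obs in groups.values())

grid = np.linspace(-5, 8, 500)
step = grid[1] - grid[0]
for name, obs in groups.items():
    density = gaussian_kde(obs)(grid)
    density *= len(obs) / whole_weight  # the common_norm scaling factor
    # Each scaled curve integrates to its group's share of the data
    # (~0.75 and ~0.25 here); the shares sum to ~1 across groups.
    print(name, round((density * step).sum(), 3))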
- - # Now iterate through the subsets and draw the densities - # We go backwards so stacked densities read from top-to-bottom - for sub_vars, _ in self.iter_data("hue", reverse=True): - - # Extract the support grid and density curve for this level - key = tuple(sub_vars.items()) - try: - density = densities[key] - except KeyError: - continue - support = density.index - fill_from = baselines[key] - - ax = self._get_axes(sub_vars) - - if "hue" in self.variables: - sub_color = self._hue_map(sub_vars["hue"]) - else: - sub_color = color - - artist_kws = self._artist_kws( - plot_kws, fill, False, multiple, sub_color, alpha - ) - - # Either plot a curve with observation values on the x axis - if "x" in self.variables: - - if fill: - artist = ax.fill_between(support, fill_from, density, **artist_kws) - - else: - artist, = ax.plot(support, density, **artist_kws) - - artist.sticky_edges.x[:] = sticky_support - artist.sticky_edges.y[:] = sticky_density - - # Or plot a curve with observation values on the y axis - else: - if fill: - artist = ax.fill_betweenx(support, fill_from, density, **artist_kws) - else: - artist, = ax.plot(density, support, **artist_kws) - - artist.sticky_edges.x[:] = sticky_density - artist.sticky_edges.y[:] = sticky_support - - # --- Finalize the plot ---- - - ax = self.ax if self.ax is not None else self.facets.axes.flat[0] - default_x = default_y = "" - if self.data_variable == "x": - default_y = "Density" - if self.data_variable == "y": - default_x = "Density" - self._add_axis_labels(ax, default_x, default_y) - - if "hue" in self.variables and legend: - - if fill: - artist = partial(mpl.patches.Patch) - else: - artist = partial(mpl.lines.Line2D, [], []) - - ax_obj = self.ax if self.ax is not None else self.facets - self._add_legend( - ax_obj, artist, fill, False, multiple, alpha, plot_kws, {}, - ) - - def plot_bivariate_density( - self, - common_norm, - fill, - levels, - thresh, - color, - legend, - cbar, - warn_singular, - cbar_ax, - cbar_kws, - estimate_kws, - **contour_kws, - ): - - contour_kws = contour_kws.copy() - - estimator = KDE(**estimate_kws) - - if not set(self.variables) - {"x", "y"}: - common_norm = False - - all_data = self.plot_data.dropna() - - # Loop through the subsets and estimate the KDEs - densities, supports = {}, {} - - for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True): - - # Extract the data points from this sub set - observations = sub_data[["x", "y"]] - min_variance = observations.var().fillna(0).min() - observations = observations["x"], observations["y"] - - # Extract the weights for this subset of observations - if "weights" in self.variables: - weights = sub_data["weights"] - else: - weights = None - - # Estimate the density of observations at this level - singular = math.isclose(min_variance, 0) - try: - if not singular: - density, support = estimator(*observations, weights=weights) - except np.linalg.LinAlgError: - # Testing for 0 variance doesn't catch all cases where scipy raises, - # but we can also get a ValueError, so we need this convoluted approach - singular = True - - if singular: - msg = ( - "KDE cannot be estimated (0 variance or perfect covariance). " - "Pass `warn_singular=False` to disable this warning." 
- ) - if warn_singular: - warnings.warn(msg, UserWarning, stacklevel=3) - continue - - # Transform the support grid back to the original scale - ax = self._get_axes(sub_vars) - _, inv_x = _get_transform_functions(ax, "x") - _, inv_y = _get_transform_functions(ax, "y") - support = inv_x(support[0]), inv_y(support[1]) - - # Apply a scaling factor so that the integral over all subsets is 1 - if common_norm: - density *= len(sub_data) / len(all_data) - - key = tuple(sub_vars.items()) - densities[key] = density - supports[key] = support - - # Define a grid of iso-proportion levels - if thresh is None: - thresh = 0 - if isinstance(levels, Number): - levels = np.linspace(thresh, 1, levels) - else: - if min(levels) < 0 or max(levels) > 1: - raise ValueError("levels must be in [0, 1]") - - # Transform from iso-proportions to iso-densities - if common_norm: - common_levels = self._quantile_to_level( - list(densities.values()), levels, - ) - draw_levels = {k: common_levels for k in densities} - else: - draw_levels = { - k: self._quantile_to_level(d, levels) - for k, d in densities.items() - } - - # Define the coloring of the contours - if "hue" in self.variables: - for param in ["cmap", "colors"]: - if param in contour_kws: - msg = f"{param} parameter ignored when using hue mapping." - warnings.warn(msg, UserWarning) - contour_kws.pop(param) - else: - - # Work out a default coloring of the contours - coloring_given = set(contour_kws) & {"cmap", "colors"} - if fill and not coloring_given: - cmap = self._cmap_from_color(color) - contour_kws["cmap"] = cmap - if not fill and not coloring_given: - contour_kws["colors"] = [color] - - # Use our internal colormap lookup - cmap = contour_kws.pop("cmap", None) - if isinstance(cmap, str): - cmap = color_palette(cmap, as_cmap=True) - if cmap is not None: - contour_kws["cmap"] = cmap - - # Loop through the subsets again and plot the data - for sub_vars, _ in self.iter_data("hue"): - - if "hue" in sub_vars: - color = self._hue_map(sub_vars["hue"]) - if fill: - contour_kws["cmap"] = self._cmap_from_color(color) - else: - contour_kws["colors"] = [color] - - ax = self._get_axes(sub_vars) - - # Choose the function to plot with - # TODO could add a pcolormesh based option as well - # Which would look something like element="raster" - if fill: - contour_func = ax.contourf - else: - contour_func = ax.contour - - key = tuple(sub_vars.items()) - if key not in densities: - continue - density = densities[key] - xx, yy = supports[key] - - # Pop the label kwarg which is unused by contour_func (but warns) - contour_kws.pop("label", None) - - cset = contour_func( - xx, yy, density, - levels=draw_levels[key], - **contour_kws, - ) - - # Add a color bar representing the contour heights - # Note: this shows iso densities, not iso proportions - # See more notes in histplot about how this could be improved - if cbar: - cbar_kws = {} if cbar_kws is None else cbar_kws - ax.figure.colorbar(cset, cbar_ax, ax, **cbar_kws) - - # --- Finalize the plot - ax = self.ax if self.ax is not None else self.facets.axes.flat[0] - self._add_axis_labels(ax) - - if "hue" in self.variables and legend: - - # TODO if possible, I would like to move the contour - # intensity information into the legend too and label the - # iso proportions rather than the raw density values - - artist_kws = {} - if fill: - artist = partial(mpl.patches.Patch) - else: - artist = partial(mpl.lines.Line2D, [], []) - - ax_obj = self.ax if self.ax is not None else self.facets - self._add_legend( - ax_obj, artist, fill, False, 
"layer", 1, artist_kws, {}, - ) - - def plot_univariate_ecdf(self, estimate_kws, legend, **plot_kws): - - estimator = ECDF(**estimate_kws) - - # Set the draw style to step the right way for the data variable - drawstyles = dict(x="steps-post", y="steps-pre") - plot_kws["drawstyle"] = drawstyles[self.data_variable] - - # Loop through the subsets, transform and plot the data - for sub_vars, sub_data in self.iter_data( - "hue", reverse=True, from_comp_data=True, - ): - - # Compute the ECDF - if sub_data.empty: - continue - - observations = sub_data[self.data_variable] - weights = sub_data.get("weights", None) - stat, vals = estimator(observations, weights=weights) - - # Assign attributes based on semantic mapping - artist_kws = plot_kws.copy() - if "hue" in self.variables: - artist_kws["color"] = self._hue_map(sub_vars["hue"]) - - # Return the data variable to the linear domain - ax = self._get_axes(sub_vars) - _, inv = _get_transform_functions(ax, self.data_variable) - vals = inv(vals) - - # Manually set the minimum value on a "log" scale - if isinstance(inv.__self__, mpl.scale.LogTransform): - vals[0] = -np.inf - - # Work out the orientation of the plot - if self.data_variable == "x": - plot_args = vals, stat - stat_variable = "y" - else: - plot_args = stat, vals - stat_variable = "x" - - if estimator.stat == "count": - top_edge = len(observations) - else: - top_edge = 1 - - # Draw the line for this subset - artist, = ax.plot(*plot_args, **artist_kws) - sticky_edges = getattr(artist.sticky_edges, stat_variable) - sticky_edges[:] = 0, top_edge - - # --- Finalize the plot ---- - ax = self.ax if self.ax is not None else self.facets.axes.flat[0] - stat = estimator.stat.capitalize() - default_x = default_y = "" - if self.data_variable == "x": - default_y = stat - if self.data_variable == "y": - default_x = stat - self._add_axis_labels(ax, default_x, default_y) - - if "hue" in self.variables and legend: - artist = partial(mpl.lines.Line2D, [], []) - alpha = plot_kws.get("alpha", 1) - ax_obj = self.ax if self.ax is not None else self.facets - self._add_legend( - ax_obj, artist, False, False, None, alpha, plot_kws, {}, - ) - - def plot_rug(self, height, expand_margins, legend, **kws): - - for sub_vars, sub_data, in self.iter_data(from_comp_data=True): - - ax = self._get_axes(sub_vars) - - kws.setdefault("linewidth", 1) - - if expand_margins: - xmarg, ymarg = ax.margins() - if "x" in self.variables: - ymarg += height * 2 - if "y" in self.variables: - xmarg += height * 2 - ax.margins(x=xmarg, y=ymarg) - - if "hue" in self.variables: - kws.pop("c", None) - kws.pop("color", None) - - if "x" in self.variables: - self._plot_single_rug(sub_data, "x", height, ax, kws) - if "y" in self.variables: - self._plot_single_rug(sub_data, "y", height, ax, kws) - - # --- Finalize the plot - self._add_axis_labels(ax) - if "hue" in self.variables and legend: - # TODO ideally i'd like the legend artist to look like a rug - legend_artist = partial(mpl.lines.Line2D, [], []) - self._add_legend( - ax, legend_artist, False, False, None, 1, {}, {}, - ) - - def _plot_single_rug(self, sub_data, var, height, ax, kws): - """Draw a rugplot along one axis of the plot.""" - vector = sub_data[var] - n = len(vector) - - # Return data to linear domain - _, inv = _get_transform_functions(ax, var) - vector = inv(vector) - - # We'll always add a single collection with varying colors - if "hue" in self.variables: - colors = self._hue_map(sub_data["hue"]) - else: - colors = None - - # Build the array of values for the LineCollection - if 
var == "x": - - trans = tx.blended_transform_factory(ax.transData, ax.transAxes) - xy_pairs = np.column_stack([ - np.repeat(vector, 2), np.tile([0, height], n) - ]) - - if var == "y": - - trans = tx.blended_transform_factory(ax.transAxes, ax.transData) - xy_pairs = np.column_stack([ - np.tile([0, height], n), np.repeat(vector, 2) - ]) - - # Draw the lines on the plot - line_segs = xy_pairs.reshape([n, 2, 2]) - ax.add_collection(LineCollection( - line_segs, transform=trans, colors=colors, **kws - )) - - ax.autoscale_view(scalex=var == "x", scaley=var == "y") - - -# ==================================================================================== # -# External API -# ==================================================================================== # - -def histplot( - data=None, *, - # Vector variables - x=None, y=None, hue=None, weights=None, - # Histogram computation parameters - stat="count", bins="auto", binwidth=None, binrange=None, - discrete=None, cumulative=False, common_bins=True, common_norm=True, - # Histogram appearance parameters - multiple="layer", element="bars", fill=True, shrink=1, - # Histogram smoothing with a kernel density estimate - kde=False, kde_kws=None, line_kws=None, - # Bivariate histogram parameters - thresh=0, pthresh=None, pmax=None, cbar=False, cbar_ax=None, cbar_kws=None, - # Hue mapping parameters - palette=None, hue_order=None, hue_norm=None, color=None, - # Axes information - log_scale=None, legend=True, ax=None, - # Other appearance keywords - **kwargs, -): - - p = _DistributionPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, weights=weights), - ) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - - if ax is None: - ax = plt.gca() - - p._attach(ax, log_scale=log_scale) - - if p.univariate: # Note, bivariate plots won't cycle - if fill: - method = ax.bar if element == "bars" else ax.fill_between - else: - method = ax.plot - color = _default_color(method, hue, color, kwargs) - - if not p.has_xy_data: - return ax - - # Default to discrete bins for categorical variables - if discrete is None: - discrete = p._default_discrete() - - estimate_kws = dict( - stat=stat, - bins=bins, - binwidth=binwidth, - binrange=binrange, - discrete=discrete, - cumulative=cumulative, - ) - - if p.univariate: - - p.plot_univariate_histogram( - multiple=multiple, - element=element, - fill=fill, - shrink=shrink, - common_norm=common_norm, - common_bins=common_bins, - kde=kde, - kde_kws=kde_kws, - color=color, - legend=legend, - estimate_kws=estimate_kws, - line_kws=line_kws, - **kwargs, - ) - - else: - - p.plot_bivariate_histogram( - common_bins=common_bins, - common_norm=common_norm, - thresh=thresh, - pthresh=pthresh, - pmax=pmax, - color=color, - legend=legend, - cbar=cbar, - cbar_ax=cbar_ax, - cbar_kws=cbar_kws, - estimate_kws=estimate_kws, - **kwargs, - ) - - return ax - - -histplot.__doc__ = """\ -Plot univariate or bivariate histograms to show distributions of datasets. - -A histogram is a classic visualization tool that represents the distribution -of one or more variables by counting the number of observations that fall within -discrete bins. - -This function can normalize the statistic computed within each bin to estimate -frequency, density or probability mass, and it can add a smooth curve obtained -using a kernel density estimate, similar to :func:`kdeplot`. - -More information is provided in the :ref:`user guide <tutorial_hist>`. 
- -Parameters ----------- -{params.core.data} -{params.core.xy} -{params.core.hue} -weights : vector or key in ``data`` - If provided, weight the contribution of the corresponding data points - towards the count in each bin by these factors. -{params.hist.stat} -{params.hist.bins} -{params.hist.binwidth} -{params.hist.binrange} -discrete : bool - If True, default to ``binwidth=1`` and draw the bars so that they are - centered on their corresponding data points. This avoids "gaps" that may - otherwise appear when using discrete (integer) data. -cumulative : bool - If True, plot the cumulative counts as bins increase. -common_bins : bool - If True, use the same bins when semantic variables produce multiple - plots. If using a reference rule to determine the bins, it will be computed - with the full dataset. -common_norm : bool - If True and using a normalized statistic, the normalization will apply over - the full dataset. Otherwise, normalize each histogram independently. -multiple : {{"layer", "dodge", "stack", "fill"}} - Approach to resolving multiple elements when semantic mapping creates subsets. - Only relevant with univariate data. -element : {{"bars", "step", "poly"}} - Visual representation of the histogram statistic. - Only relevant with univariate data. -fill : bool - If True, fill in the space under the histogram. - Only relevant with univariate data. -shrink : number - Scale the width of each bar relative to the binwidth by this factor. - Only relevant with univariate data. -kde : bool - If True, compute a kernel density estimate to smooth the distribution - and show on the plot as (one or more) line(s). - Only relevant with univariate data. -kde_kws : dict - Parameters that control the KDE computation, as in :func:`kdeplot`. -line_kws : dict - Parameters that control the KDE visualization, passed to - :meth:`matplotlib.axes.Axes.plot`. -thresh : number or None - Cells with a statistic less than or equal to this value will be transparent. - Only relevant with bivariate data. -pthresh : number or None - Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts - (or other statistics, when used) up to this proportion of the total will be - transparent. -pmax : number or None - A value in [0, 1] that sets the saturation point for the colormap at a value - such that cells below constitute this proportion of the total count (or - other statistic, when used). -{params.dist.cbar} -{params.dist.cbar_ax} -{params.dist.cbar_kws} -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.core.color} -{params.dist.log_scale} -{params.dist.legend} -{params.core.ax} -kwargs - Other keyword arguments are passed to one of the following matplotlib - functions: - - - :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars") - - :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True) - - :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False) - - :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate) - -Returns -------- -{returns.ax} - -See Also --------- -{seealso.displot} -{seealso.kdeplot} -{seealso.rugplot} -{seealso.ecdfplot} -{seealso.jointplot} - -Notes ------ - -The choice of bins for computing and plotting a histogram can exert -substantial influence on the insights that one is able to draw from the -visualization. If the bins are too large, they may erase important features. 
-On the other hand, bins that are too small may be dominated by random -variability, obscuring the shape of the true underlying distribution. The -default bin size is determined using a reference rule that depends on the -sample size and variance. This works well in many cases (i.e., with -"well-behaved" data) but it fails in others. It is always a good idea to try -different bin sizes to be sure that you are not missing something important. -This function allows you to specify bins in several different ways, such as -by setting the total number of bins to use, the width of each bin, or the -specific locations where the bins should break. - -Examples --------- - -.. include:: ../docstrings/histplot.rst - -""".format( - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) - - -def kdeplot( - data=None, *, x=None, y=None, hue=None, weights=None, - palette=None, hue_order=None, hue_norm=None, color=None, fill=None, - multiple="layer", common_norm=True, common_grid=False, cumulative=False, - bw_method="scott", bw_adjust=1, warn_singular=True, log_scale=None, - levels=10, thresh=.05, gridsize=200, cut=3, clip=None, - legend=True, cbar=False, cbar_ax=None, cbar_kws=None, ax=None, - **kwargs, -): - - # --- Start with backwards compatibility for versions < 0.11.0 ---------------- - - # Handle (past) deprecation of `data2` - if "data2" in kwargs: - msg = "`data2` has been removed (replaced by `y`); please update your code." - raise TypeError(msg) - - # Handle deprecation of `vertical` - vertical = kwargs.pop("vertical", None) - if vertical is not None: - if vertical: - action_taken = "assigning data to `y`." - if x is None: - data, y = y, data - else: - x, y = y, x - else: - action_taken = "assigning data to `x`." - msg = textwrap.dedent(f"""\n - The `vertical` parameter is deprecated; {action_taken} - This will become an error in seaborn v0.14.0; please update your code. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - # Handle deprecation of `bw` - bw = kwargs.pop("bw", None) - if bw is not None: - msg = textwrap.dedent(f"""\n - The `bw` parameter is deprecated in favor of `bw_method` and `bw_adjust`. - Setting `bw_method={bw}`, but please see the docs for the new parameters - and update your code. This will become an error in seaborn v0.14.0. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - bw_method = bw - - # Handle deprecation of `kernel` - if kwargs.pop("kernel", None) is not None: - msg = textwrap.dedent("""\n - Support for alternate kernels has been removed; using Gaussian kernel. - This will become an error in seaborn v0.14.0; please update your code. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - # Handle deprecation of shade_lowest - shade_lowest = kwargs.pop("shade_lowest", None) - if shade_lowest is not None: - if shade_lowest: - thresh = 0 - msg = textwrap.dedent(f"""\n - `shade_lowest` has been replaced by `thresh`; setting `thresh={thresh}`. - This will become an error in seaborn v0.14.0; please update your code. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - # Handle "soft" deprecation of shade. `shade` is not really the right - # terminology here, but unlike some of the other deprecated parameters it - # is probably very commonly used and much harder to remove. This is therefore - # going to be a longer process where, first, `fill` will be introduced and - # be used throughout the documentation. 
In 0.12, when kwarg-only - # enforcement hits, we can remove the shade/shade_lowest out of the - # function signature all together and pull them out of the kwargs. Then we - # can actually fire a FutureWarning, and eventually remove. - shade = kwargs.pop("shade", None) - if shade is not None: - fill = shade - msg = textwrap.dedent(f"""\n - `shade` is now deprecated in favor of `fill`; setting `fill={shade}`. - This will become an error in seaborn v0.14.0; please update your code. - """) - warnings.warn(msg, FutureWarning, stacklevel=2) - - # Handle `n_levels` - # This was never in the formal API but it was processed, and appeared in an - # example. We can treat as an alias for `levels` now and deprecate later. - levels = kwargs.pop("n_levels", levels) - - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # - - p = _DistributionPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, weights=weights), - ) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - - if ax is None: - ax = plt.gca() - - p._attach(ax, allowed_types=["numeric", "datetime"], log_scale=log_scale) - - method = ax.fill_between if fill else ax.plot - color = _default_color(method, hue, color, kwargs) - - if not p.has_xy_data: - return ax - - # Pack the kwargs for statistics.KDE - estimate_kws = dict( - bw_method=bw_method, - bw_adjust=bw_adjust, - gridsize=gridsize, - cut=cut, - clip=clip, - cumulative=cumulative, - ) - - if p.univariate: - - plot_kws = kwargs.copy() - - p.plot_univariate_density( - multiple=multiple, - common_norm=common_norm, - common_grid=common_grid, - fill=fill, - color=color, - legend=legend, - warn_singular=warn_singular, - estimate_kws=estimate_kws, - **plot_kws, - ) - - else: - - p.plot_bivariate_density( - common_norm=common_norm, - fill=fill, - levels=levels, - thresh=thresh, - legend=legend, - color=color, - warn_singular=warn_singular, - cbar=cbar, - cbar_ax=cbar_ax, - cbar_kws=cbar_kws, - estimate_kws=estimate_kws, - **kwargs, - ) - - return ax - - -kdeplot.__doc__ = """\ -Plot univariate or bivariate distributions using kernel density estimation. - -A kernel density estimate (KDE) plot is a method for visualizing the -distribution of observations in a dataset, analogous to a histogram. KDE -represents the data using a continuous probability density curve in one or -more dimensions. - -The approach is explained further in the :ref:`user guide <tutorial_kde>`. - -Relative to a histogram, KDE can produce a plot that is less cluttered and -more interpretable, especially when drawing multiple distributions. But it -has the potential to introduce distortions if the underlying distribution is -bounded or not smooth. Like a histogram, the quality of the representation -also depends on the selection of good smoothing parameters. - -Parameters ----------- -{params.core.data} -{params.core.xy} -{params.core.hue} -weights : vector or key in ``data`` - If provided, weight the kernel density estimation using these values. -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.core.color} -fill : bool or None - If True, fill in the area under univariate density curves or between - bivariate contours. If None, the default depends on ``multiple``. -{params.dist.multiple} -common_norm : bool - If True, scale each conditional density by the number of observations - such that the total area under all densities sums to 1. Otherwise, - normalize each density independently. 
-common_grid : bool - If True, use the same evaluation grid for each kernel density estimate. - Only relevant with univariate data. -{params.kde.cumulative} -{params.kde.bw_method} -{params.kde.bw_adjust} -warn_singular : bool - If True, issue a warning when trying to estimate the density of data - with zero variance. -{params.dist.log_scale} -levels : int or vector - Number of contour levels or values to draw contours at. A vector argument - must have increasing values in [0, 1]. Levels correspond to iso-proportions - of the density: e.g., 20% of the probability mass will lie below the - contour drawn for 0.2. Only relevant with bivariate data. -thresh : number in [0, 1] - Lowest iso-proportion level at which to draw a contour line. Ignored when - ``levels`` is a vector. Only relevant with bivariate data. -gridsize : int - Number of points on each dimension of the evaluation grid. -{params.kde.cut} -{params.kde.clip} -{params.dist.legend} -{params.dist.cbar} -{params.dist.cbar_ax} -{params.dist.cbar_kws} -{params.core.ax} -kwargs - Other keyword arguments are passed to one of the following matplotlib - functions: - - - :meth:`matplotlib.axes.Axes.plot` (univariate, ``fill=False``), - - :meth:`matplotlib.axes.Axes.fill_between` (univariate, ``fill=True``), - - :meth:`matplotlib.axes.Axes.contour` (bivariate, ``fill=False``), - - :meth:`matplotlib.axes.Axes.contourf` (bivariate, ``fill=True``). - -Returns -------- -{returns.ax} - -See Also --------- -{seealso.displot} -{seealso.histplot} -{seealso.ecdfplot} -{seealso.jointplot} -{seealso.violinplot} - -Notes ----- - -The *bandwidth*, or standard deviation of the smoothing kernel, is an -important parameter. Misspecification of the bandwidth can produce a -distorted representation of the data. Much like the choice of bin width in a -histogram, an over-smoothed curve can erase true features of a -distribution, while an under-smoothed curve can create false features out of -random variability. The rule-of-thumb that sets the default bandwidth works -best when the true distribution is smooth, unimodal, and roughly bell-shaped. -It is always a good idea to check the default behavior by using ``bw_adjust`` -to increase or decrease the amount of smoothing. - -Because the smoothing algorithm uses a Gaussian kernel, the estimated density -curve can extend to values that do not make sense for a particular dataset. -For example, the curve may be drawn over negative values when smoothing data -that are naturally positive. The ``cut`` and ``clip`` parameters can be used -to control the extent of the curve, but datasets that have many observations -close to a natural boundary may be better served by a different visualization -method. - -Similar considerations apply when a dataset is naturally discrete or "spiky" -(containing many repeated observations of the same value). Kernel density -estimation will always produce a smooth curve, which would be misleading -in these situations. - -The units on the density axis are a common source of confusion. While kernel -density estimation produces a probability distribution, the height of the curve -at each point gives a density, not a probability. A probability can be obtained -only by integrating the density across a range. The curve is normalized so -that the integral over all possible values is 1, meaning that the scale of -the density axis depends on the data values. - -Examples --------- - -..
include:: ../docstrings/kdeplot.rst - -""".format( - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) - - -def ecdfplot( - data=None, *, - # Vector variables - x=None, y=None, hue=None, weights=None, - # Computation parameters - stat="proportion", complementary=False, - # Hue mapping parameters - palette=None, hue_order=None, hue_norm=None, - # Axes information - log_scale=None, legend=True, ax=None, - # Other appearance keywords - **kwargs, -): - - p = _DistributionPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, weights=weights), - ) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - - # We could support other semantics (size, style) here fairly easily - # But it would make distplot a bit more complicated. - # It's always possible to add features like that later, so I am going to defer. - # It will be even easier to wait until after there is a more general/abstract - # way to go from semantic specs to artist attributes. - - if ax is None: - ax = plt.gca() - - p._attach(ax, log_scale=log_scale) - - color = kwargs.pop("color", kwargs.pop("c", None)) - kwargs["color"] = _default_color(ax.plot, hue, color, kwargs) - - if not p.has_xy_data: - return ax - - # We could add this one day, but it's of dubious value - if not p.univariate: - raise NotImplementedError("Bivariate ECDF plots are not implemented") - - estimate_kws = dict( - stat=stat, - complementary=complementary, - ) - - p.plot_univariate_ecdf( - estimate_kws=estimate_kws, - legend=legend, - **kwargs, - ) - - return ax - - -ecdfplot.__doc__ = """\ -Plot empirical cumulative distribution functions. - -An ECDF represents the proportion or count of observations falling below each -unique value in a dataset. Compared to a histogram or density plot, it has the -advantage that each observation is visualized directly, meaning that there are -no binning or smoothing parameters that need to be adjusted. It also aids direct -comparisons between multiple distributions. A downside is that the relationship -between the appearance of the plot and the basic properties of the distribution -(such as its central tendency, variance, and the presence of any bimodality) -may not be as intuitive. - -More information is provided in the :ref:`user guide <tutorial_ecdf>`. - -Parameters ----------- -{params.core.data} -{params.core.xy} -{params.core.hue} -weights : vector or key in ``data`` - If provided, weight the contribution of the corresponding data points - towards the cumulative distribution using these values. -{params.ecdf.stat} -{params.ecdf.complementary} -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.dist.log_scale} -{params.dist.legend} -{params.core.ax} -kwargs - Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.plot`. - -Returns -------- -{returns.ax} - -See Also --------- -{seealso.displot} -{seealso.histplot} -{seealso.kdeplot} -{seealso.rugplot} - -Examples --------- - -.. include:: ../docstrings/ecdfplot.rst - -""".format( - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) - - -def rugplot( - data=None, *, x=None, y=None, hue=None, height=.025, expand_margins=True, - palette=None, hue_order=None, hue_norm=None, legend=True, ax=None, **kwargs -): - - # A note: I think it would make sense to add multiple= to rugplot and allow - # rugs for different hue variables to be shifted orthogonal to the data axis - # But is this stacking, or dodging? 
- - # A note: if we want to add a style semantic to rugplot, - # we could make an option that draws the rug using scatterplot - - # A note: it would also be nice to offer some kind of histogram/density - # rugplot, since alpha blending doesn't work great in the large n regime - - # --- Start with backwards compatibility for versions < 0.11.0 ---------------- - - a = kwargs.pop("a", None) - axis = kwargs.pop("axis", None) - - if a is not None: - data = a - msg = textwrap.dedent("""\n - The `a` parameter has been replaced; use `x`, `y`, and/or `data` instead. - Please update your code; this will become an error in seaborn v0.14.0. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - if axis is not None: - if axis == "x": - x = data - elif axis == "y": - y = data - data = None - msg = textwrap.dedent(f"""\n - The `axis` parameter has been deprecated; use the `{axis}` parameter instead. - Please update your code; this will become an error in seaborn v0.14.0. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - vertical = kwargs.pop("vertical", None) - if vertical is not None: - if vertical: - action_taken = "assigning data to `y`." - if x is None: - data, y = y, data - else: - x, y = y, x - else: - action_taken = "assigning data to `x`." - msg = textwrap.dedent(f"""\n - The `vertical` parameter is deprecated; {action_taken} - This will become an error in seaborn v0.14.0; please update your code. - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # - - p = _DistributionPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue), - ) - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - - if ax is None: - ax = plt.gca() - - p._attach(ax) - - color = kwargs.pop("color", kwargs.pop("c", None)) - kwargs["color"] = _default_color(ax.plot, hue, color, kwargs) - - if not p.has_xy_data: - return ax - - p.plot_rug(height, expand_margins, legend, **kwargs) - - return ax - - -rugplot.__doc__ = """\ -Plot marginal distributions by drawing ticks along the x and y axes. - -This function is intended to complement other plots by showing the location -of individual observations in an unobtrusive way. - -Parameters ----------- -{params.core.data} -{params.core.xy} -{params.core.hue} -height : float - Proportion of axes extent covered by each rug element. Can be negative. -expand_margins : bool - If True, increase the axes margins by the height of the rug to avoid - overlap with other elements. -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -legend : bool - If False, do not add a legend for semantic variables. -{params.core.ax} -kwargs - Other keyword arguments are passed to - :class:`matplotlib.collections.LineCollection`. - -Returns -------- -{returns.ax} - -Examples --------- - -..
include:: ../docstrings/rugplot.rst - -""".format( - params=_param_docs, - returns=_core_docs["returns"], -) - - -def displot( - data=None, *, - # Vector variables - x=None, y=None, hue=None, row=None, col=None, weights=None, - # Other plot parameters - kind="hist", rug=False, rug_kws=None, log_scale=None, legend=True, - # Hue-mapping parameters - palette=None, hue_order=None, hue_norm=None, color=None, - # Faceting parameters - col_wrap=None, row_order=None, col_order=None, - height=5, aspect=1, facet_kws=None, - **kwargs, -): - - p = _DistributionPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, weights=weights, row=row, col=col), - ) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - - _check_argument("kind", ["hist", "kde", "ecdf"], kind) - - # --- Initialize the FacetGrid object - - # Check for attempt to plot onto specific axes and warn - if "ax" in kwargs: - msg = ( - "`displot` is a figure-level function and does not accept " - "the ax= parameter. You may wish to try {}plot.".format(kind) - ) - warnings.warn(msg, UserWarning) - kwargs.pop("ax") - - for var in ["row", "col"]: - # Handle faceting variables that lack name information - if var in p.variables and p.variables[var] is None: - p.variables[var] = f"_{var}_" - - # Adapt the plot_data dataframe for use with FacetGrid - grid_data = p.plot_data.rename(columns=p.variables) - grid_data = grid_data.loc[:, ~grid_data.columns.duplicated()] - - col_name = p.variables.get("col") - row_name = p.variables.get("row") - - if facet_kws is None: - facet_kws = {} - - g = FacetGrid( - data=grid_data, row=row_name, col=col_name, - col_wrap=col_wrap, row_order=row_order, - col_order=col_order, height=height, - aspect=aspect, - **facet_kws, - ) - - # Now attach the axes object to the plotter object - if kind == "kde": - allowed_types = ["numeric", "datetime"] - else: - allowed_types = None - p._attach(g, allowed_types=allowed_types, log_scale=log_scale) - - # Check for a specification that lacks x/y data and return early - if not p.has_xy_data: - return g - - if color is None and hue is None: - color = "C0" - # XXX else warn if hue is not None? 
- - kwargs["legend"] = legend - - # --- Draw the plots - - if kind == "hist": - - hist_kws = kwargs.copy() - - # Extract the parameters that will go directly to Histogram - estimate_defaults = {} - _assign_default_kwargs(estimate_defaults, Histogram.__init__, histplot) - - estimate_kws = {} - for key, default_val in estimate_defaults.items(): - estimate_kws[key] = hist_kws.pop(key, default_val) - - # Handle derivative defaults - if estimate_kws["discrete"] is None: - estimate_kws["discrete"] = p._default_discrete() - - hist_kws["estimate_kws"] = estimate_kws - - hist_kws.setdefault("color", color) - - if p.univariate: - - _assign_default_kwargs(hist_kws, p.plot_univariate_histogram, histplot) - p.plot_univariate_histogram(**hist_kws) - - else: - - _assign_default_kwargs(hist_kws, p.plot_bivariate_histogram, histplot) - p.plot_bivariate_histogram(**hist_kws) - - elif kind == "kde": - - kde_kws = kwargs.copy() - - # Extract the parameters that will go directly to KDE - estimate_defaults = {} - _assign_default_kwargs(estimate_defaults, KDE.__init__, kdeplot) - - estimate_kws = {} - for key, default_val in estimate_defaults.items(): - estimate_kws[key] = kde_kws.pop(key, default_val) - - kde_kws["estimate_kws"] = estimate_kws - kde_kws["color"] = color - - if p.univariate: - - _assign_default_kwargs(kde_kws, p.plot_univariate_density, kdeplot) - p.plot_univariate_density(**kde_kws) - - else: - - _assign_default_kwargs(kde_kws, p.plot_bivariate_density, kdeplot) - p.plot_bivariate_density(**kde_kws) - - elif kind == "ecdf": - - ecdf_kws = kwargs.copy() - - # Extract the parameters that will go directly to the estimator - estimate_kws = {} - estimate_defaults = {} - _assign_default_kwargs(estimate_defaults, ECDF.__init__, ecdfplot) - for key, default_val in estimate_defaults.items(): - estimate_kws[key] = ecdf_kws.pop(key, default_val) - - ecdf_kws["estimate_kws"] = estimate_kws - ecdf_kws["color"] = color - - if p.univariate: - - _assign_default_kwargs(ecdf_kws, p.plot_univariate_ecdf, ecdfplot) - p.plot_univariate_ecdf(**ecdf_kws) - - else: - - raise NotImplementedError("Bivariate ECDF plots are not implemented") - - # All plot kinds can include a rug - if rug: - # TODO with expand_margins=True, each facet expands margins... annoying! - if rug_kws is None: - rug_kws = {} - _assign_default_kwargs(rug_kws, p.plot_rug, rugplot) - rug_kws["legend"] = False - if color is not None: - rug_kws["color"] = color - p.plot_rug(**rug_kws) - - # Call FacetGrid annotation methods - # Note that the legend is currently set inside the plotting method - g.set_axis_labels( - x_var=p.variables.get("x", g.axes.flat[0].get_xlabel()), - y_var=p.variables.get("y", g.axes.flat[0].get_ylabel()), - ) - g.set_titles() - g.tight_layout() - - if data is not None and (x is not None or y is not None): - if not isinstance(data, pd.DataFrame): - data = pd.DataFrame(data) - g.data = pd.merge( - data, - g.data[g.data.columns.difference(data.columns)], - left_index=True, - right_index=True, - ) - else: - wide_cols = { - k: f"_{k}_" if v is None else v for k, v in p.variables.items() - } - g.data = p.plot_data.rename(columns=wide_cols) - - return g - - -displot.__doc__ = """\ -Figure-level interface for drawing distribution plots onto a FacetGrid. - -This function provides access to several approaches for visualizing the -univariate or bivariate distribution of data, including subsets of data -defined by semantic mapping and faceting across multiple subplots. 
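For example, an illustrative sketch of one such call (an editorial aside; assumes the bundled "penguins" demo dataset can be fetched by ``load_dataset``):

    import seaborn as sns

    penguins = sns.load_dataset("penguins")
    # One call maps species to hue and facets the figure by sex
    g = sns.displot(
        data=penguins, x="flipper_length_mm",
        hue="species", col="sex", kind="kde",
    )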
The -``kind`` parameter selects the approach to use: - -- :func:`histplot` (with ``kind="hist"``; the default) -- :func:`kdeplot` (with ``kind="kde"``) -- :func:`ecdfplot` (with ``kind="ecdf"``; univariate-only) - -Additionally, a :func:`rugplot` can be added to any kind of plot to show -individual observations. - -Extra keyword arguments are passed to the underlying function, so you should -refer to the documentation for each to understand the complete set of options -for making plots with this interface. - -See the :doc:`distribution plots tutorial <../tutorial/distributions>` for a more -in-depth discussion of the relative strengths and weaknesses of each approach. -The distinction between figure-level and axes-level functions is explained -further in the :doc:`user guide <../tutorial/function_overview>`. - -Parameters ----------- -{params.core.data} -{params.core.xy} -{params.core.hue} -{params.facets.rowcol} -weights : vector or key in ``data`` - Observation weights used for computing the distribution function. -kind : {{"hist", "kde", "ecdf"}} - Approach for visualizing the data. Selects the underlying plotting function - and determines the additional set of valid parameters. -rug : bool - If True, show each observation with marginal ticks (as in :func:`rugplot`). -rug_kws : dict - Parameters to control the appearance of the rug plot. -{params.dist.log_scale} -{params.dist.legend} -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.core.color} -{params.facets.col_wrap} -{params.facets.rowcol_order} -{params.facets.height} -{params.facets.aspect} -{params.facets.facet_kws} -kwargs - Other keyword arguments are documented with the relevant axes-level function: - - - :func:`histplot` (with ``kind="hist"``) - - :func:`kdeplot` (with ``kind="kde"``) - - :func:`ecdfplot` (with ``kind="ecdf"``) - -Returns -------- -{returns.facetgrid} - -See Also --------- -{seealso.histplot} -{seealso.kdeplot} -{seealso.rugplot} -{seealso.ecdfplot} -{seealso.jointplot} - -Examples --------- - -See the API documentation for the axes-level functions for more details -about the breadth of options available for each plot kind. - -.. include:: ../docstrings/displot.rst - -""".format( - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) - - -# =========================================================================== # -# DEPRECATED FUNCTIONS LIVE BELOW HERE -# =========================================================================== # - - -def _freedman_diaconis_bins(a): - """Calculate number of hist bins using Freedman-Diaconis rule.""" - # From https://stats.stackexchange.com/questions/798/ - a = np.asarray(a) - if len(a) < 2: - return 1 - iqr = np.subtract.reduce(np.nanpercentile(a, [75, 25])) - h = 2 * iqr / (len(a) ** (1 / 3)) - # fall back to sqrt(a) bins if iqr is 0 - if h == 0: - return int(np.sqrt(a.size)) - else: - return int(np.ceil((a.max() - a.min()) / h)) - - -def distplot(a=None, bins=None, hist=True, kde=True, rug=False, fit=None, - hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None, - color=None, vertical=False, norm_hist=False, axlabel=None, - label=None, ax=None, x=None): - """ - DEPRECATED - - This function has been deprecated and will be removed in seaborn v0.14.0. - It has been replaced by :func:`histplot` and :func:`displot`, two functions - with a modern API and many more capabilities. 
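A minimal before/after sketch (editorial, and hedged: the replacement calls below only approximate distplot's default density-normalized histogram with a KDE overlay):

    import numpy as np
    import seaborn as sns

    a = np.random.default_rng(1).gamma(2.0, size=200)

    # sns.distplot(a)                            # deprecated call
    sns.histplot(x=a, kde=True, stat="density")  # axes-level replacement
    sns.displot(x=a, kde=True, stat="density")   # figure-level replacement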
- - For a guide to updating, please see this notebook: - - https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 - - """ - - if kde and not hist: - axes_level_suggestion = ( - "`kdeplot` (an axes-level function for kernel density plots)" - ) - else: - axes_level_suggestion = ( - "`histplot` (an axes-level function for histograms)" - ) - - msg = textwrap.dedent(f""" - - `distplot` is a deprecated function and will be removed in seaborn v0.14.0. - - Please adapt your code to use either `displot` (a figure-level function with - similar flexibility) or {axes_level_suggestion}. - - For a guide to updating your code to use the new functions, please see - https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 - """) - warnings.warn(msg, UserWarning, stacklevel=2) - - if ax is None: - ax = plt.gca() - - # Intelligently label the support axis - label_ax = bool(axlabel) - if axlabel is None and hasattr(a, "name"): - axlabel = a.name - if axlabel is not None: - label_ax = True - - # Support new-style API - if x is not None: - a = x - - # Make a a 1-d float array - a = np.asarray(a, float) - if a.ndim > 1: - a = a.squeeze() - - # Drop null values from array - a = remove_na(a) - - # Decide if the hist is normed - norm_hist = norm_hist or kde or (fit is not None) - - # Handle dictionary defaults - hist_kws = {} if hist_kws is None else hist_kws.copy() - kde_kws = {} if kde_kws is None else kde_kws.copy() - rug_kws = {} if rug_kws is None else rug_kws.copy() - fit_kws = {} if fit_kws is None else fit_kws.copy() - - # Get the color from the current color cycle - if color is None: - if vertical: - line, = ax.plot(0, a.mean()) - else: - line, = ax.plot(a.mean(), 0) - color = line.get_color() - line.remove() - - # Plug the label into the right kwarg dictionary - if label is not None: - if hist: - hist_kws["label"] = label - elif kde: - kde_kws["label"] = label - elif rug: - rug_kws["label"] = label - elif fit: - fit_kws["label"] = label - - if hist: - if bins is None: - bins = min(_freedman_diaconis_bins(a), 50) - hist_kws.setdefault("alpha", 0.4) - hist_kws.setdefault("density", norm_hist) - - orientation = "horizontal" if vertical else "vertical" - hist_color = hist_kws.pop("color", color) - ax.hist(a, bins, orientation=orientation, - color=hist_color, **hist_kws) - if hist_color != color: - hist_kws["color"] = hist_color - - axis = "y" if vertical else "x" - - if kde: - kde_color = kde_kws.pop("color", color) - kdeplot(**{axis: a}, ax=ax, color=kde_color, **kde_kws) - if kde_color != color: - kde_kws["color"] = kde_color - - if rug: - rug_color = rug_kws.pop("color", color) - rugplot(**{axis: a}, ax=ax, color=rug_color, **rug_kws) - if rug_color != color: - rug_kws["color"] = rug_color - - if fit is not None: - - def pdf(x): - return fit.pdf(x, *params) - - fit_color = fit_kws.pop("color", "#282828") - gridsize = fit_kws.pop("gridsize", 200) - cut = fit_kws.pop("cut", 3) - clip = fit_kws.pop("clip", (-np.inf, np.inf)) - bw = gaussian_kde(a).scotts_factor() * a.std(ddof=1) - x = _kde_support(a, bw, gridsize, cut, clip) - params = fit.fit(a) - y = pdf(x) - if vertical: - x, y = y, x - ax.plot(x, y, color=fit_color, **fit_kws) - if fit_color != "#282828": - fit_kws["color"] = fit_color - - if label_ax: - if vertical: - ax.set_ylabel(axlabel) - else: - ax.set_xlabel(axlabel) - - return ax diff --git a/seaborn/external/__init__.py b/seaborn/external/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/seaborn/external/appdirs.py b/seaborn/external/appdirs.py deleted file mode 100644 index 70c382964824fe0fce175f44cf6061b44cd4f922..0000000000000000000000000000000000000000 --- a/seaborn/external/appdirs.py +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) 2005-2010 ActiveState Software Inc. -# Copyright (c) 2013 Eddy Petrișor - -# flake8: noqa - -""" -This file is directly from -https://github.com/ActiveState/appdirs/blob/3fe6a83776843a46f20c2e5587afcffe05e03b39/appdirs.py - -The license of https://github.com/ActiveState/appdirs copied below: - - -# This is the MIT license - -Copyright (c) 2010 ActiveState Software Inc. - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -"""Utilities for determining application-specific dirs. - -See <https://github.com/ActiveState/appdirs> for details and usage. -""" -# Dev Notes: -# - MSDN on where to store app data files: -# http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120 -# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html -# - XDG spec for Un*x: https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html - -__version__ = "1.4.4" -__version_info__ = tuple(int(segment) for segment in __version__.split(".")) - - -import sys -import os - -unicode = str - -if sys.platform.startswith('java'): - import platform - os_name = platform.java_ver()[3][0] - if os_name.startswith('Windows'): # "Windows XP", "Windows 7", etc. - system = 'win32' - elif os_name.startswith('Mac'): # "Mac OS X", etc. - system = 'darwin' - else: # "Linux", "SunOS", "FreeBSD", etc. - # Setting this to "linux2" is not ideal, but only Windows or Mac - # are actually checked for and the rest of the module expects - # *sys.platform* style strings. - system = 'linux2' -else: - system = sys.platform - - -def user_cache_dir(appname=None, appauthor=None, version=None, opinion=True): - r"""Return full path to the user-specific cache dir for this application. - - "appname" is the name of application. - If None, just the system directory is returned. - "appauthor" (only used on Windows) is the name of the - appauthor or distributing body for this application. Typically - it is the owning company name. This falls back to appname. You may - pass False to disable it. - "version" is an optional version path element to append to the - path. You might want to use this if you want multiple versions - of your app to be able to run independently. If used, this - would typically be "<major>.<minor>". 
- Only applied when appname is present. - "opinion" (boolean) can be False to disable the appending of - "Cache" to the base app data dir for Windows. See - discussion below. - - Typical user cache directories are: - Mac OS X: ~/Library/Caches/<AppName> - Unix: ~/.cache/<AppName> (XDG default) - Win XP: C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Cache - Vista: C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Cache - - On Windows the only suggestion in the MSDN docs is that local settings go in - the `CSIDL_LOCAL_APPDATA` directory. This is identical to the non-roaming - app data dir (the default returned by `user_data_dir` above). Apps typically - put cache data somewhere *under* the given dir here. Some examples: - ...\Mozilla\Firefox\Profiles\<ProfileName>\Cache - ...\Acme\SuperApp\Cache\1.0 - OPINION: This function appends "Cache" to the `CSIDL_LOCAL_APPDATA` value. - This can be disabled with the `opinion=False` option. - """ - if system == "win32": - if appauthor is None: - appauthor = appname - path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA")) - if appname: - if appauthor is not False: - path = os.path.join(path, appauthor, appname) - else: - path = os.path.join(path, appname) - if opinion: - path = os.path.join(path, "Cache") - elif system == 'darwin': - path = os.path.expanduser('~/Library/Caches') - if appname: - path = os.path.join(path, appname) - else: - path = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache')) - if appname: - path = os.path.join(path, appname) - if appname and version: - path = os.path.join(path, version) - return path - - -#---- internal support stuff - -def _get_win_folder_from_registry(csidl_name): - """This is a fallback technique at best. I'm not sure if using the - registry for this guarantees us the correct answer for all CSIDL_* - names. - """ - import winreg as _winreg - - shell_folder_name = { - "CSIDL_APPDATA": "AppData", - "CSIDL_COMMON_APPDATA": "Common AppData", - "CSIDL_LOCAL_APPDATA": "Local AppData", - }[csidl_name] - - key = _winreg.OpenKey( - _winreg.HKEY_CURRENT_USER, - r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders" - ) - dir, type = _winreg.QueryValueEx(key, shell_folder_name) - return dir - - -def _get_win_folder_with_pywin32(csidl_name): - from win32com.shell import shellcon, shell - dir = shell.SHGetFolderPath(0, getattr(shellcon, csidl_name), 0, 0) - # Try to make this a unicode path because SHGetFolderPath does - # not return unicode strings when there is unicode data in the - # path. - try: - dir = unicode(dir) - - # Downgrade to short path name if have highbit chars. See - # <http://bugs.activestate.com/show_bug.cgi?id=85099>. - has_high_char = False - for c in dir: - if ord(c) > 255: - has_high_char = True - break - if has_high_char: - try: - import win32api - dir = win32api.GetShortPathName(dir) - except ImportError: - pass - except UnicodeError: - pass - return dir - - -def _get_win_folder_with_ctypes(csidl_name): - import ctypes - - csidl_const = { - "CSIDL_APPDATA": 26, - "CSIDL_COMMON_APPDATA": 35, - "CSIDL_LOCAL_APPDATA": 28, - }[csidl_name] - - buf = ctypes.create_unicode_buffer(1024) - ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf) - - # Downgrade to short path name if have highbit chars. See - # <http://bugs.activestate.com/show_bug.cgi?id=85099>. 
- has_high_char = False - for c in buf: - if ord(c) > 255: - has_high_char = True - break - if has_high_char: - buf2 = ctypes.create_unicode_buffer(1024) - if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024): - buf = buf2 - - return buf.value - -def _get_win_folder_with_jna(csidl_name): - import array - from com.sun import jna - from com.sun.jna.platform import win32 - - buf_size = win32.WinDef.MAX_PATH * 2 - buf = array.zeros('c', buf_size) - shell = win32.Shell32.INSTANCE - shell.SHGetFolderPath(None, getattr(win32.ShlObj, csidl_name), None, win32.ShlObj.SHGFP_TYPE_CURRENT, buf) - dir = jna.Native.toString(buf.tostring()).rstrip("\0") - - # Downgrade to short path name if have highbit chars. See - # <http://bugs.activestate.com/show_bug.cgi?id=85099>. - has_high_char = False - for c in dir: - if ord(c) > 255: - has_high_char = True - break - if has_high_char: - buf = array.zeros('c', buf_size) - kernel = win32.Kernel32.INSTANCE - if kernel.GetShortPathName(dir, buf, buf_size): - dir = jna.Native.toString(buf.tostring()).rstrip("\0") - - return dir - -if system == "win32": - try: - import win32com.shell - _get_win_folder = _get_win_folder_with_pywin32 - except ImportError: - try: - from ctypes import windll - _get_win_folder = _get_win_folder_with_ctypes - except ImportError: - try: - import com.sun.jna - _get_win_folder = _get_win_folder_with_jna - except ImportError: - _get_win_folder = _get_win_folder_from_registry diff --git a/seaborn/external/docscrape.py b/seaborn/external/docscrape.py deleted file mode 100644 index 99dc3ff797f5faf21ec4f53bf0c6cd036e38c9c9..0000000000000000000000000000000000000000 --- a/seaborn/external/docscrape.py +++ /dev/null @@ -1,715 +0,0 @@ -"""Extract reference documentation from the NumPy source tree. - -Copyright (C) 2008 Stefan van der Walt <stefan@mentat.za.net>, Pauli Virtanen <pav@iki.fi> - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - -THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR -IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, -INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING -IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -""" -import inspect -import textwrap -import re -import pydoc -from warnings import warn -from collections import namedtuple -from collections.abc import Callable, Mapping -import copy -import sys - - -def strip_blank_lines(l): - "Remove leading and trailing blank lines from a list of lines" - while l and not l[0].strip(): - del l[0] - while l and not l[-1].strip(): - del l[-1] - return l - - -class Reader: - """A line-based string reader. 
- - """ - def __init__(self, data): - """ - Parameters - ---------- - data : str - String with lines separated by '\n'. - - """ - if isinstance(data, list): - self._str = data - else: - self._str = data.split('\n') # store string as list of lines - - self.reset() - - def __getitem__(self, n): - return self._str[n] - - def reset(self): - self._l = 0 # current line nr - - def read(self): - if not self.eof(): - out = self[self._l] - self._l += 1 - return out - else: - return '' - - def seek_next_non_empty_line(self): - for l in self[self._l:]: - if l.strip(): - break - else: - self._l += 1 - - def eof(self): - return self._l >= len(self._str) - - def read_to_condition(self, condition_func): - start = self._l - for line in self[start:]: - if condition_func(line): - return self[start:self._l] - self._l += 1 - if self.eof(): - return self[start:self._l+1] - return [] - - def read_to_next_empty_line(self): - self.seek_next_non_empty_line() - - def is_empty(line): - return not line.strip() - - return self.read_to_condition(is_empty) - - def read_to_next_unindented_line(self): - def is_unindented(line): - return (line.strip() and (len(line.lstrip()) == len(line))) - return self.read_to_condition(is_unindented) - - def peek(self, n=0): - if self._l + n < len(self._str): - return self[self._l + n] - else: - return '' - - def is_empty(self): - return not ''.join(self._str).strip() - - -class ParseError(Exception): - def __str__(self): - message = self.args[0] - if hasattr(self, 'docstring'): - message = f"{message} in {self.docstring!r}" - return message - - -Parameter = namedtuple('Parameter', ['name', 'type', 'desc']) - - -class NumpyDocString(Mapping): - """Parses a numpydoc string to an abstract representation - - Instances define a mapping from section title to structured data. - - """ - - sections = { - 'Signature': '', - 'Summary': [''], - 'Extended Summary': [], - 'Parameters': [], - 'Returns': [], - 'Yields': [], - 'Receives': [], - 'Raises': [], - 'Warns': [], - 'Other Parameters': [], - 'Attributes': [], - 'Methods': [], - 'See Also': [], - 'Notes': [], - 'Warnings': [], - 'References': '', - 'Examples': '', - 'index': {} - } - - def __init__(self, docstring, config={}): - orig_docstring = docstring - docstring = textwrap.dedent(docstring).split('\n') - - self._doc = Reader(docstring) - self._parsed_data = copy.deepcopy(self.sections) - - try: - self._parse() - except ParseError as e: - e.docstring = orig_docstring - raise - - def __getitem__(self, key): - return self._parsed_data[key] - - def __setitem__(self, key, val): - if key not in self._parsed_data: - self._error_location(f"Unknown section {key}", error=False) - else: - self._parsed_data[key] = val - - def __iter__(self): - return iter(self._parsed_data) - - def __len__(self): - return len(self._parsed_data) - - def _is_at_section(self): - self._doc.seek_next_non_empty_line() - - if self._doc.eof(): - return False - - l1 = self._doc.peek().strip() # e.g. Parameters - - if l1.startswith('.. 
index::'): - return True - - l2 = self._doc.peek(1).strip() # ---------- or ========== - return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) - - def _strip(self, doc): - i = 0 - j = 0 - for i, line in enumerate(doc): - if line.strip(): - break - - for j, line in enumerate(doc[::-1]): - if line.strip(): - break - - return doc[i:len(doc)-j] - - def _read_to_next_section(self): - section = self._doc.read_to_next_empty_line() - - while not self._is_at_section() and not self._doc.eof(): - if not self._doc.peek(-1).strip(): # previous line was empty - section += [''] - - section += self._doc.read_to_next_empty_line() - - return section - - def _read_sections(self): - while not self._doc.eof(): - data = self._read_to_next_section() - name = data[0].strip() - - if name.startswith('..'): # index section - yield name, data[1:] - elif len(data) < 2: - yield StopIteration - else: - yield name, self._strip(data[2:]) - - def _parse_param_list(self, content, single_element_is_type=False): - r = Reader(content) - params = [] - while not r.eof(): - header = r.read().strip() - if ' : ' in header: - arg_name, arg_type = header.split(' : ')[:2] - else: - if single_element_is_type: - arg_name, arg_type = '', header - else: - arg_name, arg_type = header, '' - - desc = r.read_to_next_unindented_line() - desc = dedent_lines(desc) - desc = strip_blank_lines(desc) - - params.append(Parameter(arg_name, arg_type, desc)) - - return params - - # See also supports the following formats. - # - # <FUNCNAME> - # <FUNCNAME> SPACE* COLON SPACE+ <DESC> SPACE* - # <FUNCNAME> ( COMMA SPACE+ <FUNCNAME>)+ (COMMA | PERIOD)? SPACE* - # <FUNCNAME> ( COMMA SPACE+ <FUNCNAME>)* SPACE* COLON SPACE+ <DESC> SPACE* - - # <FUNCNAME> is one of - # <PLAIN_FUNCNAME> - # COLON <ROLE> COLON BACKTICK <PLAIN_FUNCNAME> BACKTICK - # where - # <PLAIN_FUNCNAME> is a legal function name, and - # <ROLE> is any nonempty sequence of word characters. - # Examples: func_f1 :meth:`func_h1` :obj:`~baz.obj_r` :class:`class_j` - # <DESC> is a string describing the function. - - _role = r":(?P<role>\w+):" - _funcbacktick = r"`(?P<name>(?:~\w+\.)?[a-zA-Z0-9_\.-]+)`" - _funcplain = r"(?P<name2>[a-zA-Z0-9_\.-]+)" - _funcname = r"(" + _role + _funcbacktick + r"|" + _funcplain + r")" - _funcnamenext = _funcname.replace('role', 'rolenext') - _funcnamenext = _funcnamenext.replace('name', 'namenext') - _description = r"(?P<description>\s*:(\s+(?P<desc>\S+.*))?)?\s*$" - _func_rgx = re.compile(r"^\s*" + _funcname + r"\s*") - _line_rgx = re.compile( - r"^\s*" + - r"(?P<allfuncs>" + # group for all function names - _funcname + - r"(?P<morefuncs>([,]\s+" + _funcnamenext + r")*)" + - r")" + # end of "allfuncs" - r"(?P<trailing>[,\.])?" + # Some function lists have a trailing comma (or period) '\s*' - _description) - - # Empty <DESC> elements are replaced with '..' - empty_description = '..' 
- - def _parse_see_also(self, content): - """ - func_name : Descriptive text - continued text - another_func_name : Descriptive text - func_name1, func_name2, :meth:`func_name`, func_name3 - - """ - - items = [] - - def parse_item_name(text): - """Match ':role:`name`' or 'name'.""" - m = self._func_rgx.match(text) - if not m: - raise ParseError(f"{text} is not an item name") - role = m.group('role') - name = m.group('name') if role else m.group('name2') - return name, role, m.end() - - rest = [] - for line in content: - if not line.strip(): - continue - - line_match = self._line_rgx.match(line) - description = None - if line_match: - description = line_match.group('desc') - if line_match.group('trailing') and description: - self._error_location( - 'Unexpected comma or period after function list at index %d of ' - 'line "%s"' % (line_match.end('trailing'), line), - error=False) - if not description and line.startswith(' '): - rest.append(line.strip()) - elif line_match: - funcs = [] - text = line_match.group('allfuncs') - while True: - if not text.strip(): - break - name, role, match_end = parse_item_name(text) - funcs.append((name, role)) - text = text[match_end:].strip() - if text and text[0] == ',': - text = text[1:].strip() - rest = list(filter(None, [description])) - items.append((funcs, rest)) - else: - raise ParseError(f"{line} is not an item name") - return items - - def _parse_index(self, section, content): - """ - .. index: default - :refguide: something, else, and more - - """ - def strip_each_in(lst): - return [s.strip() for s in lst] - - out = {} - section = section.split('::') - if len(section) > 1: - out['default'] = strip_each_in(section[1].split(','))[0] - for line in content: - line = line.split(':') - if len(line) > 2: - out[line[1]] = strip_each_in(line[2].split(',')) - return out - - def _parse_summary(self): - """Grab signature (if given) and summary""" - if self._is_at_section(): - return - - # If several signatures present, take the last one - while True: - summary = self._doc.read_to_next_empty_line() - summary_str = " ".join([s.strip() for s in summary]).strip() - compiled = re.compile(r'^([\w., ]+=)?\s*[\w\.]+\(.*\)$') - if compiled.match(summary_str): - self['Signature'] = summary_str - if not self._is_at_section(): - continue - break - - if summary is not None: - self['Summary'] = summary - - if not self._is_at_section(): - self['Extended Summary'] = self._read_to_next_section() - - def _parse(self): - self._doc.reset() - self._parse_summary() - - sections = list(self._read_sections()) - section_names = {section for section, content in sections} - - has_returns = 'Returns' in section_names - has_yields = 'Yields' in section_names - # We could do more tests, but we are not. Arbitrarily. - if has_returns and has_yields: - msg = 'Docstring contains both a Returns and Yields section.' - raise ValueError(msg) - if not has_yields and 'Receives' in section_names: - msg = 'Docstring contains a Receives section but not Yields.'
- raise ValueError(msg) - - for (section, content) in sections: - if not section.startswith('..'): - section = (s.capitalize() for s in section.split(' ')) - section = ' '.join(section) - if self.get(section): - self._error_location(f"The section {section} appears twice") - - if section in ('Parameters', 'Other Parameters', 'Attributes', - 'Methods'): - self[section] = self._parse_param_list(content) - elif section in ('Returns', 'Yields', 'Raises', 'Warns', 'Receives'): - self[section] = self._parse_param_list( - content, single_element_is_type=True) - elif section.startswith('.. index::'): - self['index'] = self._parse_index(section, content) - elif section == 'See Also': - self['See Also'] = self._parse_see_also(content) - else: - self[section] = content - - def _error_location(self, msg, error=True): - if hasattr(self, '_obj'): - # we know where the docs came from: - try: - filename = inspect.getsourcefile(self._obj) - except TypeError: - filename = None - msg = msg + f" in the docstring of {self._obj} in {filename}." - if error: - raise ValueError(msg) - else: - warn(msg) - - # string conversion routines - - def _str_header(self, name, symbol='-'): - return [name, len(name)*symbol] - - def _str_indent(self, doc, indent=4): - out = [] - for line in doc: - out += [' '*indent + line] - return out - - def _str_signature(self): - if self['Signature']: - return [self['Signature'].replace('*', r'\*')] + [''] - else: - return [''] - - def _str_summary(self): - if self['Summary']: - return self['Summary'] + [''] - else: - return [] - - def _str_extended_summary(self): - if self['Extended Summary']: - return self['Extended Summary'] + [''] - else: - return [] - - def _str_param_list(self, name): - out = [] - if self[name]: - out += self._str_header(name) - for param in self[name]: - parts = [] - if param.name: - parts.append(param.name) - if param.type: - parts.append(param.type) - out += [' : '.join(parts)] - if param.desc and ''.join(param.desc).strip(): - out += self._str_indent(param.desc) - out += [''] - return out - - def _str_section(self, name): - out = [] - if self[name]: - out += self._str_header(name) - out += self[name] - out += [''] - return out - - def _str_see_also(self, func_role): - if not self['See Also']: - return [] - out = [] - out += self._str_header("See Also") - out += [''] - last_had_desc = True - for funcs, desc in self['See Also']: - assert isinstance(funcs, list) - links = [] - for func, role in funcs: - if role: - link = f':{role}:`{func}`' - elif func_role: - link = f':{func_role}:`{func}`' - else: - link = f"`{func}`_" - links.append(link) - link = ', '.join(links) - out += [link] - if desc: - out += self._str_indent([' '.join(desc)]) - last_had_desc = True - else: - last_had_desc = False - out += self._str_indent([self.empty_description]) - - if last_had_desc: - out += [''] - out += [''] - return out - - def _str_index(self): - idx = self['index'] - out = [] - output_index = False - default_index = idx.get('default', '') - if default_index: - output_index = True - out += [f'.. 
index:: {default_index}'] - for section, references in idx.items(): - if section == 'default': - continue - output_index = True - out += [f" :{section}: {', '.join(references)}"] - if output_index: - return out - else: - return '' - - def __str__(self, func_role=''): - out = [] - out += self._str_signature() - out += self._str_summary() - out += self._str_extended_summary() - for param_list in ('Parameters', 'Returns', 'Yields', 'Receives', - 'Other Parameters', 'Raises', 'Warns'): - out += self._str_param_list(param_list) - out += self._str_section('Warnings') - out += self._str_see_also(func_role) - for s in ('Notes', 'References', 'Examples'): - out += self._str_section(s) - for param_list in ('Attributes', 'Methods'): - out += self._str_param_list(param_list) - out += self._str_index() - return '\n'.join(out) - - -def indent(str, indent=4): - indent_str = ' '*indent - if str is None: - return indent_str - lines = str.split('\n') - return '\n'.join(indent_str + l for l in lines) - - -def dedent_lines(lines): - """Deindent a list of lines maximally""" - return textwrap.dedent("\n".join(lines)).split("\n") - - -def header(text, style='-'): - return text + '\n' + style*len(text) + '\n' - - -class FunctionDoc(NumpyDocString): - def __init__(self, func, role='func', doc=None, config={}): - self._f = func - self._role = role # e.g. "func" or "meth" - - if doc is None: - if func is None: - raise ValueError("No function or docstring given") - doc = inspect.getdoc(func) or '' - NumpyDocString.__init__(self, doc, config) - - if not self['Signature'] and func is not None: - func, func_name = self.get_func() - try: - try: - signature = str(inspect.signature(func)) - except (AttributeError, ValueError): - # try to read signature, backward compat for older Python - if sys.version_info[0] >= 3: - argspec = inspect.getfullargspec(func) - else: - argspec = inspect.getargspec(func) - signature = inspect.formatargspec(*argspec) - signature = f'{func_name}{signature}' - except TypeError: - signature = f'{func_name}()' - self['Signature'] = signature - - def get_func(self): - func_name = getattr(self._f, '__name__', self.__class__.__name__) - if inspect.isclass(self._f): - func = getattr(self._f, '__call__', self._f.__init__) - else: - func = self._f - return func, func_name - - def __str__(self): - out = '' - - func, func_name = self.get_func() - - roles = {'func': 'function', - 'meth': 'method'} - - if self._role: - if self._role not in roles: - print(f"Warning: invalid role {self._role}") - out += f".. {roles.get(self._role, '')}:: {func_name}\n \n\n" - - out += super().__str__(func_role=self._role) - return out - - -class ClassDoc(NumpyDocString): - - extra_public_methods = ['__call__'] - - def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, - config={}): - if not inspect.isclass(cls) and cls is not None: - raise ValueError(f"Expected a class or None, but got {cls!r}") - self._cls = cls - - if 'sphinx' in sys.modules: - from sphinx.ext.autodoc import ALL - else: - ALL = object() - - self.show_inherited_members = config.get( - 'show_inherited_class_members', True) - - if modulename and not modulename.endswith('.'): - modulename += '.' 
- self._mod = modulename - - if doc is None: - if cls is None: - raise ValueError("No class or documentation string given") - doc = pydoc.getdoc(cls) - - NumpyDocString.__init__(self, doc) - - _members = config.get('members', []) - if _members is ALL: - _members = None - _exclude = config.get('exclude-members', []) - - if config.get('show_class_members', True) and _exclude is not ALL: - def splitlines_x(s): - if not s: - return [] - else: - return s.splitlines() - for field, items in [('Methods', self.methods), - ('Attributes', self.properties)]: - if not self[field]: - doc_list = [] - for name in sorted(items): - if (name in _exclude or - (_members and name not in _members)): - continue - try: - doc_item = pydoc.getdoc(getattr(self._cls, name)) - doc_list.append( - Parameter(name, '', splitlines_x(doc_item))) - except AttributeError: - pass # method doesn't exist - self[field] = doc_list - - @property - def methods(self): - if self._cls is None: - return [] - return [name for name, func in inspect.getmembers(self._cls) - if ((not name.startswith('_') - or name in self.extra_public_methods) - and isinstance(func, Callable) - and self._is_show_member(name))] - - @property - def properties(self): - if self._cls is None: - return [] - return [name for name, func in inspect.getmembers(self._cls) - if (not name.startswith('_') and - (func is None or isinstance(func, property) or - inspect.isdatadescriptor(func)) - and self._is_show_member(name))] - - def _is_show_member(self, name): - if self.show_inherited_members: - return True # show all class members - if name not in self._cls.__dict__: - return False # class member is inherited, we do not show it - return True diff --git a/seaborn/external/husl.py b/seaborn/external/husl.py deleted file mode 100644 index 63e98cbb71640f24a0d5e0eda697bc97d12ffc5b..0000000000000000000000000000000000000000 --- a/seaborn/external/husl.py +++ /dev/null @@ -1,313 +0,0 @@ -import operator -import math - -__version__ = "2.1.0" - - -m = [ - [3.2406, -1.5372, -0.4986], - [-0.9689, 1.8758, 0.0415], - [0.0557, -0.2040, 1.0570] -] - -m_inv = [ - [0.4124, 0.3576, 0.1805], - [0.2126, 0.7152, 0.0722], - [0.0193, 0.1192, 0.9505] -] - -# Hard-coded D65 illuminant -refX = 0.95047 -refY = 1.00000 -refZ = 1.08883 -refU = 0.19784 -refV = 0.46834 -lab_e = 0.008856 -lab_k = 903.3 - - -# Public API - -def husl_to_rgb(h, s, l): - return lch_to_rgb(*husl_to_lch([h, s, l])) - - -def husl_to_hex(h, s, l): - return rgb_to_hex(husl_to_rgb(h, s, l)) - - -def rgb_to_husl(r, g, b): - return lch_to_husl(rgb_to_lch(r, g, b)) - - -def hex_to_husl(hex): - return rgb_to_husl(*hex_to_rgb(hex)) - - -def huslp_to_rgb(h, s, l): - return lch_to_rgb(*huslp_to_lch([h, s, l])) - - -def huslp_to_hex(h, s, l): - return rgb_to_hex(huslp_to_rgb(h, s, l)) - - -def rgb_to_huslp(r, g, b): - return lch_to_huslp(rgb_to_lch(r, g, b)) - - -def hex_to_huslp(hex): - return rgb_to_huslp(*hex_to_rgb(hex)) - - -def lch_to_rgb(l, c, h): - return xyz_to_rgb(luv_to_xyz(lch_to_luv([l, c, h]))) - - -def rgb_to_lch(r, g, b): - return luv_to_lch(xyz_to_luv(rgb_to_xyz([r, g, b]))) - - -def max_chroma(L, H): - hrad = math.radians(H) - sinH = (math.sin(hrad)) - cosH = (math.cos(hrad)) - sub1 = (math.pow(L + 16, 3.0) / 1560896.0) - sub2 = sub1 if sub1 > 0.008856 else (L / 903.3) - result = float("inf") - for row in m: - m1 = row[0] - m2 = row[1] - m3 = row[2] - top = ((0.99915 * m1 + 1.05122 * m2 + 1.14460 * m3) * sub2) - rbottom = (0.86330 * m3 - 0.17266 * m2) - lbottom = (0.12949 * m3 - 0.38848 * m1) - bottom = (rbottom * sinH 
+ lbottom * cosH) * sub2 - - for t in (0.0, 1.0): - C = (L * (top - 1.05122 * t) / (bottom + 0.17266 * sinH * t)) - if C > 0.0 and C < result: - result = C - return result - - -def _hrad_extremum(L): - lhs = (math.pow(L, 3.0) + 48.0 * math.pow(L, 2.0) + 768.0 * L + 4096.0) / 1560896.0 - rhs = 1107.0 / 125000.0 - sub = lhs if lhs > rhs else 10.0 * L / 9033.0 - chroma = float("inf") - result = None - for row in m: - for limit in (0.0, 1.0): - [m1, m2, m3] = row - top = -3015466475.0 * m3 * sub + 603093295.0 * m2 * sub - 603093295.0 * limit - bottom = 1356959916.0 * m1 * sub - 452319972.0 * m3 * sub - hrad = math.atan2(top, bottom) - # This is a math hack to deal with tan quadrants, I'm too lazy to figure - # out how to do this properly - if limit == 0.0: - hrad += math.pi - test = max_chroma(L, math.degrees(hrad)) - if test < chroma: - chroma = test - result = hrad - return result - - -def max_chroma_pastel(L): - H = math.degrees(_hrad_extremum(L)) - return max_chroma(L, H) - - -def dot_product(a, b): - return sum(map(operator.mul, a, b)) - - -def f(t): - if t > lab_e: - return (math.pow(t, 1.0 / 3.0)) - else: - return (7.787 * t + 16.0 / 116.0) - - -def f_inv(t): - if math.pow(t, 3.0) > lab_e: - return (math.pow(t, 3.0)) - else: - return (116.0 * t - 16.0) / lab_k - - -def from_linear(c): - if c <= 0.0031308: - return 12.92 * c - else: - return (1.055 * math.pow(c, 1.0 / 2.4) - 0.055) - - -def to_linear(c): - a = 0.055 - - if c > 0.04045: - return (math.pow((c + a) / (1.0 + a), 2.4)) - else: - return (c / 12.92) - - -def rgb_prepare(triple): - ret = [] - for ch in triple: - ch = round(ch, 3) - - if ch < -0.0001 or ch > 1.0001: - raise Exception(f"Illegal RGB value {ch:f}") - - if ch < 0: - ch = 0 - if ch > 1: - ch = 1 - - # Fix for Python 3 which by default rounds 4.5 down to 4.0 - # instead of Python 2 which is rounded to 5.0 which caused - # a couple off by one errors in the tests. 
Tests now all pass - # in Python 2 and Python 3 - ret.append(int(round(ch * 255 + 0.001, 0))) - - return ret - - -def hex_to_rgb(hex): - if hex.startswith('#'): - hex = hex[1:] - r = int(hex[0:2], 16) / 255.0 - g = int(hex[2:4], 16) / 255.0 - b = int(hex[4:6], 16) / 255.0 - return [r, g, b] - - -def rgb_to_hex(triple): - [r, g, b] = triple - return '#%02x%02x%02x' % tuple(rgb_prepare([r, g, b])) - - -def xyz_to_rgb(triple): - xyz = map(lambda row: dot_product(row, triple), m) - return list(map(from_linear, xyz)) - - -def rgb_to_xyz(triple): - rgbl = list(map(to_linear, triple)) - return list(map(lambda row: dot_product(row, rgbl), m_inv)) - - -def xyz_to_luv(triple): - X, Y, Z = triple - - if X == Y == Z == 0.0: - return [0.0, 0.0, 0.0] - - varU = (4.0 * X) / (X + (15.0 * Y) + (3.0 * Z)) - varV = (9.0 * Y) / (X + (15.0 * Y) + (3.0 * Z)) - L = 116.0 * f(Y / refY) - 16.0 - - # Black will create a divide-by-zero error - if L == 0.0: - return [0.0, 0.0, 0.0] - - U = 13.0 * L * (varU - refU) - V = 13.0 * L * (varV - refV) - - return [L, U, V] - - -def luv_to_xyz(triple): - L, U, V = triple - - if L == 0: - return [0.0, 0.0, 0.0] - - varY = f_inv((L + 16.0) / 116.0) - varU = U / (13.0 * L) + refU - varV = V / (13.0 * L) + refV - Y = varY * refY - X = 0.0 - (9.0 * Y * varU) / ((varU - 4.0) * varV - varU * varV) - Z = (9.0 * Y - (15.0 * varV * Y) - (varV * X)) / (3.0 * varV) - - return [X, Y, Z] - - -def luv_to_lch(triple): - L, U, V = triple - - C = (math.pow(math.pow(U, 2) + math.pow(V, 2), (1.0 / 2.0))) - hrad = (math.atan2(V, U)) - H = math.degrees(hrad) - if H < 0.0: - H = 360.0 + H - - return [L, C, H] - - -def lch_to_luv(triple): - L, C, H = triple - - Hrad = math.radians(H) - U = (math.cos(Hrad) * C) - V = (math.sin(Hrad) * C) - - return [L, U, V] - - -def husl_to_lch(triple): - H, S, L = triple - - if L > 99.9999999: - return [100, 0.0, H] - if L < 0.00000001: - return [0.0, 0.0, H] - - mx = max_chroma(L, H) - C = mx / 100.0 * S - - return [L, C, H] - - -def lch_to_husl(triple): - L, C, H = triple - - if L > 99.9999999: - return [H, 0.0, 100.0] - if L < 0.00000001: - return [H, 0.0, 0.0] - - mx = max_chroma(L, H) - S = C / mx * 100.0 - - return [H, S, L] - - -def huslp_to_lch(triple): - H, S, L = triple - - if L > 99.9999999: - return [100, 0.0, H] - if L < 0.00000001: - return [0.0, 0.0, H] - - mx = max_chroma_pastel(L) - C = mx / 100.0 * S - - return [L, C, H] - - -def lch_to_huslp(triple): - L, C, H = triple - - if L > 99.9999999: - return [H, 0.0, 100.0] - if L < 0.00000001: - return [H, 0.0, 0.0] - - mx = max_chroma_pastel(L) - S = C / mx * 100.0 - - return [H, S, L] diff --git a/seaborn/external/kde.py b/seaborn/external/kde.py deleted file mode 100644 index 6add4e19127895817b42f8602b5deb43ba3b725d..0000000000000000000000000000000000000000 --- a/seaborn/external/kde.py +++ /dev/null @@ -1,380 +0,0 @@ -""" -This module was copied from the scipy project. - -In the process of copying, some methods were removed because they depended on -other parts of scipy (especially on compiled components), allowing seaborn to -have a simple and pure Python implementation. These include: - -- integrate_gaussian -- integrate_box -- integrate_box_1d -- integrate_kde -- logpdf -- resample - -Additionally, the numpy.linalg module was substituted for scipy.linalg, -and the examples section (with doctests) was removed from the docstring - -The original scipy license is copied below: - -Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -""" - -# ------------------------------------------------------------------------------- -# -# Define classes for (uni/multi)-variate kernel density estimation. -# -# Currently, only Gaussian kernels are implemented. -# -# Written by: Robert Kern -# -# Date: 2004-08-09 -# -# Modified: 2005-02-10 by Robert Kern. -# Contributed to SciPy -# 2005-10-07 by Robert Kern. -# Some fixes to match the new scipy_core -# -# Copyright 2004-2005 by Enthought, Inc. -# -# ------------------------------------------------------------------------------- - -import numpy as np -from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, dot, exp, pi, - sqrt, power, atleast_1d, sum, ones, cov) -from numpy import linalg - - -__all__ = ['gaussian_kde'] - - -class gaussian_kde: - """Representation of a kernel-density estimate using Gaussian kernels. - - Kernel density estimation is a way to estimate the probability density - function (PDF) of a random variable in a non-parametric way. - `gaussian_kde` works for both uni-variate and multi-variate data. It - includes automatic bandwidth determination. The estimation works best for - a unimodal distribution; bimodal or multi-modal distributions tend to be - oversmoothed. - - Parameters - ---------- - dataset : array_like - Datapoints to estimate from. In case of univariate data this is a 1-D - array, otherwise a 2-D array with shape (# of dims, # of data). - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. If a scalar, - this will be used directly as `kde.factor`. If a callable, it should - take a `gaussian_kde` instance as only parameter and return a scalar. - If None (default), 'scott' is used. See Notes for more details. - weights : array_like, optional - weights of datapoints. This must be the same shape as dataset. - If None (default), the samples are assumed to be equally weighted - - Attributes - ---------- - dataset : ndarray - The dataset with which `gaussian_kde` was initialized. 
- d : int - Number of dimensions. - n : int - Number of datapoints. - neff : int - Effective number of datapoints. - - .. versionadded:: 1.2.0 - factor : float - The bandwidth factor, obtained from `kde.covariance_factor`, with which - the covariance matrix is multiplied. - covariance : ndarray - The covariance matrix of `dataset`, scaled by the calculated bandwidth - (`kde.factor`). - inv_cov : ndarray - The inverse of `covariance`. - - Methods - ------- - evaluate - __call__ - integrate_gaussian - integrate_box_1d - integrate_box - integrate_kde - pdf - logpdf - resample - set_bandwidth - covariance_factor - - Notes - ----- - Bandwidth selection strongly influences the estimate obtained from the KDE - (much more so than the actual shape of the kernel). Bandwidth selection - can be done by a "rule of thumb", by cross-validation, by "plug-in - methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde` - uses a rule of thumb, the default is Scott's Rule. - - Scott's Rule [1]_, implemented as `scotts_factor`, is:: - - n**(-1./(d+4)), - - with ``n`` the number of data points and ``d`` the number of dimensions. - In the case of unequally weighted points, `scotts_factor` becomes:: - - neff**(-1./(d+4)), - - with ``neff`` the effective number of datapoints. - Silverman's Rule [2]_, implemented as `silverman_factor`, is:: - - (n * (d + 2) / 4.)**(-1. / (d + 4)). - - or in the case of unequally weighted points:: - - (neff * (d + 2) / 4.)**(-1. / (d + 4)). - - Good general descriptions of kernel density estimation can be found in [1]_ - and [2]_, the mathematics for this multi-dimensional implementation can be - found in [1]_. - - With a set of weighted samples, the effective number of datapoints ``neff`` - is defined by:: - - neff = sum(weights)^2 / sum(weights^2) - - as detailed in [5]_. - - References - ---------- - .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and - Visualization", John Wiley & Sons, New York, Chicester, 1992. - .. [2] B.W. Silverman, "Density Estimation for Statistics and Data - Analysis", Vol. 26, Monographs on Statistics and Applied Probability, - Chapman and Hall, London, 1986. - .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A - Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993. - .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel - conditional density estimation", Computational Statistics & Data - Analysis, Vol. 36, pp. 279-298, 2001. - .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society. - Series A (General), 132, 272 - - """ - def __init__(self, dataset, bw_method=None, weights=None): - self.dataset = atleast_2d(asarray(dataset)) - if not self.dataset.size > 1: - raise ValueError("`dataset` input should have multiple elements.") - - self.d, self.n = self.dataset.shape - - if weights is not None: - self._weights = atleast_1d(weights).astype(float) - self._weights /= sum(self._weights) - if self.weights.ndim != 1: - raise ValueError("`weights` input should be one-dimensional.") - if len(self._weights) != self.n: - raise ValueError("`weights` input should be of length n") - self._neff = 1/sum(self._weights**2) - - self.set_bandwidth(bw_method=bw_method) - - def evaluate(self, points): - """Evaluate the estimated pdf on a set of points. - - Parameters - ---------- - points : (# of dimensions, # of points)-array - Alternatively, a (# of dimensions,) vector can be passed in and - treated as a single point. 
- - Returns - ------- - values : (# of points,)-array - The values at each point. - - Raises - ------ - ValueError : if the dimensionality of the input points is different than - the dimensionality of the KDE. - - """ - points = atleast_2d(asarray(points)) - - d, m = points.shape - if d != self.d: - if d == 1 and m == self.d: - # points was passed in as a row vector - points = reshape(points, (self.d, 1)) - m = 1 - else: - msg = f"points have dimension {d}, dataset has dimension {self.d}" - raise ValueError(msg) - - output_dtype = np.common_type(self.covariance, points) - result = zeros((m,), dtype=output_dtype) - - whitening = linalg.cholesky(self.inv_cov) - scaled_dataset = dot(whitening, self.dataset) - scaled_points = dot(whitening, points) - - if m >= self.n: - # there are more points than data, so loop over data - for i in range(self.n): - diff = scaled_dataset[:, i, newaxis] - scaled_points - energy = sum(diff * diff, axis=0) / 2.0 - result += self.weights[i]*exp(-energy) - else: - # loop over points - for i in range(m): - diff = scaled_dataset - scaled_points[:, i, newaxis] - energy = sum(diff * diff, axis=0) / 2.0 - result[i] = sum(exp(-energy)*self.weights, axis=0) - - result = result / self._norm_factor - - return result - - __call__ = evaluate - - def scotts_factor(self): - """Compute Scott's factor. - - Returns - ------- - s : float - Scott's factor. - """ - return power(self.neff, -1./(self.d+4)) - - def silverman_factor(self): - """Compute the Silverman factor. - - Returns - ------- - s : float - The silverman factor. - """ - return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4)) - - # Default method to calculate bandwidth, can be overwritten by subclass - covariance_factor = scotts_factor - covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that - multiplies the data covariance matrix to obtain the kernel covariance - matrix. The default is `scotts_factor`. A subclass can overwrite this - method to provide a different method, or set it through a call to - `kde.set_bandwidth`.""" - - def set_bandwidth(self, bw_method=None): - """Compute the estimator bandwidth with given method. - - The new bandwidth calculated after a call to `set_bandwidth` is used - for subsequent evaluations of the estimated density. - - Parameters - ---------- - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. If a - scalar, this will be used directly as `kde.factor`. If a callable, - it should take a `gaussian_kde` instance as only parameter and - return a scalar. If None (default), nothing happens; the current - `kde.covariance_factor` method is kept. - - Notes - ----- - .. versionadded:: 0.11 - - """ - if bw_method is None: - pass - elif bw_method == 'scott': - self.covariance_factor = self.scotts_factor - elif bw_method == 'silverman': - self.covariance_factor = self.silverman_factor - elif np.isscalar(bw_method) and not isinstance(bw_method, str): - self._bw_method = 'use constant' - self.covariance_factor = lambda: bw_method - elif callable(bw_method): - self._bw_method = bw_method - self.covariance_factor = lambda: self._bw_method(self) - else: - msg = "`bw_method` should be 'scott', 'silverman', a scalar " \ - "or a callable." - raise ValueError(msg) - - self._compute_covariance() - - def _compute_covariance(self): - """Computes the covariance matrix for each Gaussian kernel using - covariance_factor(). 
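        For reference, the relation implemented below: with
        ``factor = covariance_factor()``, the kernel covariance is
        ``covariance = cov(dataset, aweights=weights) * factor**2`` and
        ``inv_cov = inv(cov(dataset, aweights=weights)) / factor**2``.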
- """ - self.factor = self.covariance_factor() - # Cache covariance and inverse covariance of the data - if not hasattr(self, '_data_inv_cov'): - self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1, - bias=False, - aweights=self.weights)) - self._data_inv_cov = linalg.inv(self._data_covariance) - - self.covariance = self._data_covariance * self.factor**2 - self.inv_cov = self._data_inv_cov / self.factor**2 - self._norm_factor = sqrt(linalg.det(2*pi*self.covariance)) - - def pdf(self, x): - """ - Evaluate the estimated pdf on a provided set of points. - - Notes - ----- - This is an alias for `gaussian_kde.evaluate`. See the ``evaluate`` - docstring for more details. - - """ - return self.evaluate(x) - - @property - def weights(self): - try: - return self._weights - except AttributeError: - self._weights = ones(self.n)/self.n - return self._weights - - @property - def neff(self): - try: - return self._neff - except AttributeError: - self._neff = 1/sum(self.weights**2) - return self._neff diff --git a/seaborn/external/version.py b/seaborn/external/version.py deleted file mode 100644 index 7eb57d32ce3e811d4460b1b9a93513a986347e25..0000000000000000000000000000000000000000 --- a/seaborn/external/version.py +++ /dev/null @@ -1,461 +0,0 @@ -"""Extract reference documentation from the pypa/packaging source tree. - -In the process of copying, some unused methods / classes were removed. -These include: - -- parse() -- anything involving LegacyVersion - -This software is made available under the terms of *either* of the licenses -found in LICENSE.APACHE or LICENSE.BSD. Contributions to this software is made -under the terms of *both* these licenses. - -Vendored from: -- https://github.com/pypa/packaging/ -- commit ba07d8287b4554754ac7178d177033ea3f75d489 (09/09/2021) -""" - - -# This file is dual licensed under the terms of the Apache License, Version -# 2.0, and the BSD License. See the LICENSE file in the root of this repository -# for complete details. 
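For orientation, a minimal usage sketch of the `Version` class vendored
below (the `seaborn.external.version` import path is our assumption; any
module exposing this class behaves the same way):

    from seaborn.external.version import Version

    # PEP 440 ordering: dev-releases sort before pre-releases, pre-releases
    # before the final release, and post-releases after it.
    assert Version("1.0.dev0") < Version("1.0a1") < Version("1.0") < Version("1.0.post1")

    # Epochs dominate the rest of the key; trailing zeros are insignificant.
    assert Version("1!0.5") > Version("2.0")
    assert Version("1.0") == Version("1.0.0")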
- - -import collections -import itertools -import re -from typing import Callable, Optional, SupportsInt, Tuple, Union - -__all__ = ["Version", "InvalidVersion", "VERSION_PATTERN"] - - -# Vendored from https://github.com/pypa/packaging/blob/main/packaging/_structures.py - -class InfinityType: - def __repr__(self) -> str: - return "Infinity" - - def __hash__(self) -> int: - return hash(repr(self)) - - def __lt__(self, other: object) -> bool: - return False - - def __le__(self, other: object) -> bool: - return False - - def __eq__(self, other: object) -> bool: - return isinstance(other, self.__class__) - - def __ne__(self, other: object) -> bool: - return not isinstance(other, self.__class__) - - def __gt__(self, other: object) -> bool: - return True - - def __ge__(self, other: object) -> bool: - return True - - def __neg__(self: object) -> "NegativeInfinityType": - return NegativeInfinity - - -Infinity = InfinityType() - - -class NegativeInfinityType: - def __repr__(self) -> str: - return "-Infinity" - - def __hash__(self) -> int: - return hash(repr(self)) - - def __lt__(self, other: object) -> bool: - return True - - def __le__(self, other: object) -> bool: - return True - - def __eq__(self, other: object) -> bool: - return isinstance(other, self.__class__) - - def __ne__(self, other: object) -> bool: - return not isinstance(other, self.__class__) - - def __gt__(self, other: object) -> bool: - return False - - def __ge__(self, other: object) -> bool: - return False - - def __neg__(self: object) -> InfinityType: - return Infinity - - -NegativeInfinity = NegativeInfinityType() - - -# Vendored from https://github.com/pypa/packaging/blob/main/packaging/version.py - -InfiniteTypes = Union[InfinityType, NegativeInfinityType] -PrePostDevType = Union[InfiniteTypes, Tuple[str, int]] -SubLocalType = Union[InfiniteTypes, int, str] -LocalType = Union[ - NegativeInfinityType, - Tuple[ - Union[ - SubLocalType, - Tuple[SubLocalType, str], - Tuple[NegativeInfinityType, SubLocalType], - ], - ..., - ], -] -CmpKey = Tuple[ - int, Tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType -] -LegacyCmpKey = Tuple[int, Tuple[str, ...]] -VersionComparisonMethod = Callable[ - [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool -] - -_Version = collections.namedtuple( - "_Version", ["epoch", "release", "dev", "pre", "post", "local"] -) - - - -class InvalidVersion(ValueError): - """ - An invalid version was found, users should refer to PEP 440. - """ - - -class _BaseVersion: - _key: Union[CmpKey, LegacyCmpKey] - - def __hash__(self) -> int: - return hash(self._key) - - # Please keep the duplicated `isinstance` check - # in the six comparisons hereunder - # unless you find a way to avoid adding overhead function calls. 
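    # Illustration (a hedged sketch, not part of the vendored file): each
    # operator below reduces to a single tuple comparison on the
    # precomputed `_key`; e.g. Version("1.2") < Version("1.10") compares
    #     (0, (1, 2), Infinity, -Infinity, Infinity, -Infinity)
    # against
    #     (0, (1, 10), Infinity, -Infinity, Infinity, -Infinity).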
- def __lt__(self, other: "_BaseVersion") -> bool: - if not isinstance(other, _BaseVersion): - return NotImplemented - - return self._key < other._key - - def __le__(self, other: "_BaseVersion") -> bool: - if not isinstance(other, _BaseVersion): - return NotImplemented - - return self._key <= other._key - - def __eq__(self, other: object) -> bool: - if not isinstance(other, _BaseVersion): - return NotImplemented - - return self._key == other._key - - def __ge__(self, other: "_BaseVersion") -> bool: - if not isinstance(other, _BaseVersion): - return NotImplemented - - return self._key >= other._key - - def __gt__(self, other: "_BaseVersion") -> bool: - if not isinstance(other, _BaseVersion): - return NotImplemented - - return self._key > other._key - - def __ne__(self, other: object) -> bool: - if not isinstance(other, _BaseVersion): - return NotImplemented - - return self._key != other._key - - -# Deliberately not anchored to the start and end of the string, to make it -# easier for 3rd party code to reuse -VERSION_PATTERN = r""" - v? - (?: - (?:(?P<epoch>[0-9]+)!)? # epoch - (?P<release>[0-9]+(?:\.[0-9]+)*) # release segment - (?P<pre> # pre-release - [-_\.]? - (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview)) - [-_\.]? - (?P<pre_n>[0-9]+)? - )? - (?P<post> # post release - (?:-(?P<post_n1>[0-9]+)) - | - (?: - [-_\.]? - (?P<post_l>post|rev|r) - [-_\.]? - (?P<post_n2>[0-9]+)? - ) - )? - (?P<dev> # dev release - [-_\.]? - (?P<dev_l>dev) - [-_\.]? - (?P<dev_n>[0-9]+)? - )? - ) - (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))? # local version -""" - - -class Version(_BaseVersion): - - _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE) - - def __init__(self, version: str) -> None: - - # Validate the version and parse it into pieces - match = self._regex.search(version) - if not match: - raise InvalidVersion(f"Invalid version: '{version}'") - - # Store the parsed out pieces of the version - self._version = _Version( - epoch=int(match.group("epoch")) if match.group("epoch") else 0, - release=tuple(int(i) for i in match.group("release").split(".")), - pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")), - post=_parse_letter_version( - match.group("post_l"), match.group("post_n1") or match.group("post_n2") - ), - dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")), - local=_parse_local_version(match.group("local")), - ) - - # Generate a key which will be used for sorting - self._key = _cmpkey( - self._version.epoch, - self._version.release, - self._version.pre, - self._version.post, - self._version.dev, - self._version.local, - ) - - def __repr__(self) -> str: - return f"<Version('{self}')>" - - def __str__(self) -> str: - parts = [] - - # Epoch - if self.epoch != 0: - parts.append(f"{self.epoch}!") - - # Release segment - parts.append(".".join(str(x) for x in self.release)) - - # Pre-release - if self.pre is not None: - parts.append("".join(str(x) for x in self.pre)) - - # Post-release - if self.post is not None: - parts.append(f".post{self.post}") - - # Development release - if self.dev is not None: - parts.append(f".dev{self.dev}") - - # Local version segment - if self.local is not None: - parts.append(f"+{self.local}") - - return "".join(parts) - - @property - def epoch(self) -> int: - _epoch: int = self._version.epoch - return _epoch - - @property - def release(self) -> Tuple[int, ...]: - _release: Tuple[int, ...] 
= self._version.release - return _release - - @property - def pre(self) -> Optional[Tuple[str, int]]: - _pre: Optional[Tuple[str, int]] = self._version.pre - return _pre - - @property - def post(self) -> Optional[int]: - return self._version.post[1] if self._version.post else None - - @property - def dev(self) -> Optional[int]: - return self._version.dev[1] if self._version.dev else None - - @property - def local(self) -> Optional[str]: - if self._version.local: - return ".".join(str(x) for x in self._version.local) - else: - return None - - @property - def public(self) -> str: - return str(self).split("+", 1)[0] - - @property - def base_version(self) -> str: - parts = [] - - # Epoch - if self.epoch != 0: - parts.append(f"{self.epoch}!") - - # Release segment - parts.append(".".join(str(x) for x in self.release)) - - return "".join(parts) - - @property - def is_prerelease(self) -> bool: - return self.dev is not None or self.pre is not None - - @property - def is_postrelease(self) -> bool: - return self.post is not None - - @property - def is_devrelease(self) -> bool: - return self.dev is not None - - @property - def major(self) -> int: - return self.release[0] if len(self.release) >= 1 else 0 - - @property - def minor(self) -> int: - return self.release[1] if len(self.release) >= 2 else 0 - - @property - def micro(self) -> int: - return self.release[2] if len(self.release) >= 3 else 0 - - -def _parse_letter_version( - letter: str, number: Union[str, bytes, SupportsInt] -) -> Optional[Tuple[str, int]]: - - if letter: - # We consider there to be an implicit 0 in a pre-release if there is - # not a numeral associated with it. - if number is None: - number = 0 - - # We normalize any letters to their lower case form - letter = letter.lower() - - # We consider some words to be alternate spellings of other words and - # in those cases we want to normalize the spellings to our preferred - # spelling. - if letter == "alpha": - letter = "a" - elif letter == "beta": - letter = "b" - elif letter in ["c", "pre", "preview"]: - letter = "rc" - elif letter in ["rev", "r"]: - letter = "post" - - return letter, int(number) - if not letter and number: - # We assume if we are given a number, but we are not given a letter - # then this is using the implicit post release syntax (e.g. 1.0-1) - letter = "post" - - return letter, int(number) - - return None - - -_local_version_separators = re.compile(r"[\._-]") - - -def _parse_local_version(local: str) -> Optional[LocalType]: - """ - Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve"). - """ - if local is not None: - return tuple( - part.lower() if not part.isdigit() else int(part) - for part in _local_version_separators.split(local) - ) - return None - - -def _cmpkey( - epoch: int, - release: Tuple[int, ...], - pre: Optional[Tuple[str, int]], - post: Optional[Tuple[str, int]], - dev: Optional[Tuple[str, int]], - local: Optional[Tuple[SubLocalType]], -) -> CmpKey: - - # When we compare a release version, we want to compare it with all of the - # trailing zeros removed. So we'll use a reverse the list, drop all the now - # leading zeros until we come to something non zero, then take the rest - # re-reverse it back into the correct order and make it a tuple and use - # that for our sorting key. - _release = tuple( - reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release)))) - ) - - # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0. 
- # We'll do this by abusing the pre segment, but we _only_ want to do this - # if there is not a pre or a post segment. If we have one of those then - # the normal sorting rules will handle this case correctly. - if pre is None and post is None and dev is not None: - _pre: PrePostDevType = NegativeInfinity - # Versions without a pre-release (except as noted above) should sort after - # those with one. - elif pre is None: - _pre = Infinity - else: - _pre = pre - - # Versions without a post segment should sort before those with one. - if post is None: - _post: PrePostDevType = NegativeInfinity - - else: - _post = post - - # Versions without a development segment should sort after those with one. - if dev is None: - _dev: PrePostDevType = Infinity - - else: - _dev = dev - - if local is None: - # Versions without a local segment should sort before those with one. - _local: LocalType = NegativeInfinity - else: - # Versions with a local segment need that segment parsed to implement - # the sorting rules in PEP440. - # - Alpha numeric segments sort before numeric segments - # - Alpha numeric segments sort lexicographically - # - Numeric segments sort numerically - # - Shorter versions sort before longer versions when the prefixes - # match exactly - _local = tuple( - (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local - ) - - return epoch, _release, _pre, _post, _dev, _local diff --git a/seaborn/matrix.py b/seaborn/matrix.py deleted file mode 100644 index 6b99c118b62858c31b1c7db499fb0af4b4d99cb3..0000000000000000000000000000000000000000 --- a/seaborn/matrix.py +++ /dev/null @@ -1,1262 +0,0 @@ -"""Functions to visualize matrices of data.""" -import warnings - -import matplotlib as mpl -from matplotlib.collections import LineCollection -import matplotlib.pyplot as plt -from matplotlib import gridspec -import numpy as np -import pandas as pd -try: - from scipy.cluster import hierarchy - _no_scipy = False -except ImportError: - _no_scipy = True - -from . import cm -from .axisgrid import Grid -from ._compat import get_colormap -from .utils import ( - despine, - axis_ticklabels_overlap, - relative_luminance, - to_utf8, - _draw_figure, -) - - -__all__ = ["heatmap", "clustermap"] - - -def _index_to_label(index): - """Convert a pandas index or multiindex to an axis label.""" - if isinstance(index, pd.MultiIndex): - return "-".join(map(to_utf8, index.names)) - else: - return index.name - - -def _index_to_ticklabels(index): - """Convert a pandas index or multiindex into ticklabels.""" - if isinstance(index, pd.MultiIndex): - return ["-".join(map(to_utf8, i)) for i in index.values] - else: - return index.values - - -def _convert_colors(colors): - """Convert either a list of colors or nested lists of colors to RGB.""" - to_rgb = mpl.colors.to_rgb - - try: - to_rgb(colors[0]) - # If this works, there is only one level of colors - return list(map(to_rgb, colors)) - except ValueError: - # If we get here, we have nested lists - return [list(map(to_rgb, color_list)) for color_list in colors] - - -def _matrix_mask(data, mask): - """Ensure that data and mask are compatible and add missing values. - - Values will be plotted for cells where ``mask`` is ``False``. - - ``data`` is expected to be a DataFrame; ``mask`` can be an array or - a DataFrame. 
- - """ - if mask is None: - mask = np.zeros(data.shape, bool) - - if isinstance(mask, np.ndarray): - # For array masks, ensure that shape matches data then convert - if mask.shape != data.shape: - raise ValueError("Mask must have the same shape as data.") - - mask = pd.DataFrame(mask, - index=data.index, - columns=data.columns, - dtype=bool) - - elif isinstance(mask, pd.DataFrame): - # For DataFrame masks, ensure that semantic labels match data - if not mask.index.equals(data.index) \ - and mask.columns.equals(data.columns): - err = "Mask must have the same index and columns as data." - raise ValueError(err) - - # Add any cells with missing data to the mask - # This works around an issue where `plt.pcolormesh` doesn't represent - # missing data properly - mask = mask | pd.isnull(data) - - return mask - - -class _HeatMapper: - """Draw a heatmap plot of a matrix with nice labels and colormaps.""" - - def __init__(self, data, vmin, vmax, cmap, center, robust, annot, fmt, - annot_kws, cbar, cbar_kws, - xticklabels=True, yticklabels=True, mask=None): - """Initialize the plotting object.""" - # We always want to have a DataFrame with semantic information - # and an ndarray to pass to matplotlib - if isinstance(data, pd.DataFrame): - plot_data = data.values - else: - plot_data = np.asarray(data) - data = pd.DataFrame(plot_data) - - # Validate the mask and convert to DataFrame - mask = _matrix_mask(data, mask) - - plot_data = np.ma.masked_where(np.asarray(mask), plot_data) - - # Get good names for the rows and columns - xtickevery = 1 - if isinstance(xticklabels, int): - xtickevery = xticklabels - xticklabels = _index_to_ticklabels(data.columns) - elif xticklabels is True: - xticklabels = _index_to_ticklabels(data.columns) - elif xticklabels is False: - xticklabels = [] - - ytickevery = 1 - if isinstance(yticklabels, int): - ytickevery = yticklabels - yticklabels = _index_to_ticklabels(data.index) - elif yticklabels is True: - yticklabels = _index_to_ticklabels(data.index) - elif yticklabels is False: - yticklabels = [] - - if not len(xticklabels): - self.xticks = [] - self.xticklabels = [] - elif isinstance(xticklabels, str) and xticklabels == "auto": - self.xticks = "auto" - self.xticklabels = _index_to_ticklabels(data.columns) - else: - self.xticks, self.xticklabels = self._skip_ticks(xticklabels, - xtickevery) - - if not len(yticklabels): - self.yticks = [] - self.yticklabels = [] - elif isinstance(yticklabels, str) and yticklabels == "auto": - self.yticks = "auto" - self.yticklabels = _index_to_ticklabels(data.index) - else: - self.yticks, self.yticklabels = self._skip_ticks(yticklabels, - ytickevery) - - # Get good names for the axis labels - xlabel = _index_to_label(data.columns) - ylabel = _index_to_label(data.index) - self.xlabel = xlabel if xlabel is not None else "" - self.ylabel = ylabel if ylabel is not None else "" - - # Determine good default values for the colormapping - self._determine_cmap_params(plot_data, vmin, vmax, - cmap, center, robust) - - # Sort out the annotations - if annot is None or annot is False: - annot = False - annot_data = None - else: - if isinstance(annot, bool): - annot_data = plot_data - else: - annot_data = np.asarray(annot) - if annot_data.shape != plot_data.shape: - err = "`data` and `annot` must have same shape." 
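                    # (Hedged example of the check above, not from the original
                    # source: for a DataFrame `df`, annot=np.round(df.to_numpy(), 2)
                    # matches `data` in shape and is accepted; an array of any
                    # other shape triggers the ValueError below.)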
- raise ValueError(err) - annot = True - - # Save other attributes to the object - self.data = data - self.plot_data = plot_data - - self.annot = annot - self.annot_data = annot_data - - self.fmt = fmt - self.annot_kws = {} if annot_kws is None else annot_kws.copy() - self.cbar = cbar - self.cbar_kws = {} if cbar_kws is None else cbar_kws.copy() - - def _determine_cmap_params(self, plot_data, vmin, vmax, - cmap, center, robust): - """Use some heuristics to set good defaults for colorbar and range.""" - - # plot_data is a np.ma.array instance - calc_data = plot_data.astype(float).filled(np.nan) - if vmin is None: - if robust: - vmin = np.nanpercentile(calc_data, 2) - else: - vmin = np.nanmin(calc_data) - if vmax is None: - if robust: - vmax = np.nanpercentile(calc_data, 98) - else: - vmax = np.nanmax(calc_data) - self.vmin, self.vmax = vmin, vmax - - # Choose default colormaps if not provided - if cmap is None: - if center is None: - self.cmap = cm.rocket - else: - self.cmap = cm.icefire - elif isinstance(cmap, str): - self.cmap = get_colormap(cmap) - elif isinstance(cmap, list): - self.cmap = mpl.colors.ListedColormap(cmap) - else: - self.cmap = cmap - - # Recenter a divergent colormap - if center is not None: - - # Copy bad values - # in mpl<3.2 only masked values are honored with "bad" color spec - # (see https://github.com/matplotlib/matplotlib/pull/14257) - bad = self.cmap(np.ma.masked_invalid([np.nan]))[0] - - # under/over values are set for sure when cmap extremes - # do not map to the same color as +-inf - under = self.cmap(-np.inf) - over = self.cmap(np.inf) - under_set = under != self.cmap(0) - over_set = over != self.cmap(self.cmap.N - 1) - - vrange = max(vmax - center, center - vmin) - normlize = mpl.colors.Normalize(center - vrange, center + vrange) - cmin, cmax = normlize([vmin, vmax]) - cc = np.linspace(cmin, cmax, 256) - self.cmap = mpl.colors.ListedColormap(self.cmap(cc)) - self.cmap.set_bad(bad) - if under_set: - self.cmap.set_under(under) - if over_set: - self.cmap.set_over(over) - - def _annotate_heatmap(self, ax, mesh): - """Add textual labels with the value in each cell.""" - mesh.update_scalarmappable() - height, width = self.annot_data.shape - xpos, ypos = np.meshgrid(np.arange(width) + .5, np.arange(height) + .5) - for x, y, m, color, val in zip(xpos.flat, ypos.flat, - mesh.get_array().flat, mesh.get_facecolors(), - self.annot_data.flat): - if m is not np.ma.masked: - lum = relative_luminance(color) - text_color = ".15" if lum > .408 else "w" - annotation = ("{:" + self.fmt + "}").format(val) - text_kwargs = dict(color=text_color, ha="center", va="center") - text_kwargs.update(self.annot_kws) - ax.text(x, y, annotation, **text_kwargs) - - def _skip_ticks(self, labels, tickevery): - """Return ticks and labels at evenly spaced intervals.""" - n = len(labels) - if tickevery == 0: - ticks, labels = [], [] - elif tickevery == 1: - ticks, labels = np.arange(n) + .5, labels - else: - start, end, step = 0, n, tickevery - ticks = np.arange(start, end, step) + .5 - labels = labels[start:end:step] - return ticks, labels - - def _auto_ticks(self, ax, labels, axis): - """Determine ticks and ticklabels that minimize overlap.""" - transform = ax.figure.dpi_scale_trans.inverted() - bbox = ax.get_window_extent().transformed(transform) - size = [bbox.width, bbox.height][axis] - axis = [ax.xaxis, ax.yaxis][axis] - tick, = axis.set_ticks([0]) - fontsize = tick.label1.get_size() - max_ticks = int(size // (fontsize / 72)) - if max_ticks < 1: - return [], [] - tick_every = len(labels) // 
max_ticks + 1 - tick_every = 1 if tick_every == 0 else tick_every - ticks, labels = self._skip_ticks(labels, tick_every) - return ticks, labels - - def plot(self, ax, cax, kws): - """Draw the heatmap on the provided Axes.""" - # Remove all the Axes spines - despine(ax=ax, left=True, bottom=True) - - # setting vmin/vmax in addition to norm is deprecated - # so avoid setting if norm is set - if kws.get("norm") is None: - kws.setdefault("vmin", self.vmin) - kws.setdefault("vmax", self.vmax) - - # Draw the heatmap - mesh = ax.pcolormesh(self.plot_data, cmap=self.cmap, **kws) - - # Set the axis limits - ax.set(xlim=(0, self.data.shape[1]), ylim=(0, self.data.shape[0])) - - # Invert the y axis to show the plot in matrix form - ax.invert_yaxis() - - # Possibly add a colorbar - if self.cbar: - cb = ax.figure.colorbar(mesh, cax, ax, **self.cbar_kws) - cb.outline.set_linewidth(0) - # If rasterized is passed to pcolormesh, also rasterize the - # colorbar to avoid white lines on the PDF rendering - if kws.get('rasterized', False): - cb.solids.set_rasterized(True) - - # Add row and column labels - if isinstance(self.xticks, str) and self.xticks == "auto": - xticks, xticklabels = self._auto_ticks(ax, self.xticklabels, 0) - else: - xticks, xticklabels = self.xticks, self.xticklabels - - if isinstance(self.yticks, str) and self.yticks == "auto": - yticks, yticklabels = self._auto_ticks(ax, self.yticklabels, 1) - else: - yticks, yticklabels = self.yticks, self.yticklabels - - ax.set(xticks=xticks, yticks=yticks) - xtl = ax.set_xticklabels(xticklabels) - ytl = ax.set_yticklabels(yticklabels, rotation="vertical") - plt.setp(ytl, va="center") # GH2484 - - # Possibly rotate them if they overlap - _draw_figure(ax.figure) - - if axis_ticklabels_overlap(xtl): - plt.setp(xtl, rotation="vertical") - if axis_ticklabels_overlap(ytl): - plt.setp(ytl, rotation="horizontal") - - # Add the axis labels - ax.set(xlabel=self.xlabel, ylabel=self.ylabel) - - # Annotate the cells with the formatted values - if self.annot: - self._annotate_heatmap(ax, mesh) - - -def heatmap( - data, *, - vmin=None, vmax=None, cmap=None, center=None, robust=False, - annot=None, fmt=".2g", annot_kws=None, - linewidths=0, linecolor="white", - cbar=True, cbar_kws=None, cbar_ax=None, - square=False, xticklabels="auto", yticklabels="auto", - mask=None, ax=None, - **kwargs -): - """Plot rectangular data as a color-encoded matrix. - - This is an Axes-level function and will draw the heatmap into the - currently-active Axes if none is provided to the ``ax`` argument. Part of - this Axes space will be taken and used to plot a colormap, unless ``cbar`` - is False or a separate Axes is provided to ``cbar_ax``. - - Parameters - ---------- - data : rectangular dataset - 2D dataset that can be coerced into an ndarray. If a Pandas DataFrame - is provided, the index/column information will be used to label the - columns and rows. - vmin, vmax : floats, optional - Values to anchor the colormap, otherwise they are inferred from the - data and other keyword arguments. - cmap : matplotlib colormap name or object, or list of colors, optional - The mapping from data values to color space. If not provided, the - default will depend on whether ``center`` is set. - center : float, optional - The value at which to center the colormap when plotting divergent data. - Using this parameter will change the default ``cmap`` if none is - specified. 
- robust : bool, optional - If True and ``vmin`` or ``vmax`` are absent, the colormap range is - computed with robust quantiles instead of the extreme values. - annot : bool or rectangular dataset, optional - If True, write the data value in each cell. If an array-like with the - same shape as ``data``, then use this to annotate the heatmap instead - of the data. Note that DataFrames will match on position, not index. - fmt : str, optional - String formatting code to use when adding annotations. - annot_kws : dict of key, value mappings, optional - Keyword arguments for :meth:`matplotlib.axes.Axes.text` when ``annot`` - is True. - linewidths : float, optional - Width of the lines that will divide each cell. - linecolor : color, optional - Color of the lines that will divide each cell. - cbar : bool, optional - Whether to draw a colorbar. - cbar_kws : dict of key, value mappings, optional - Keyword arguments for :meth:`matplotlib.figure.Figure.colorbar`. - cbar_ax : matplotlib Axes, optional - Axes in which to draw the colorbar, otherwise take space from the - main Axes. - square : bool, optional - If True, set the Axes aspect to "equal" so each cell will be - square-shaped. - xticklabels, yticklabels : "auto", bool, list-like, or int, optional - If True, plot the column names of the dataframe. If False, don't plot - the column names. If list-like, plot these alternate labels as the - xticklabels. If an integer, use the column names but plot only every - n label. If "auto", try to densely plot non-overlapping labels. - mask : bool array or DataFrame, optional - If passed, data will not be shown in cells where ``mask`` is True. - Cells with missing values are automatically masked. - ax : matplotlib Axes, optional - Axes in which to draw the plot, otherwise use the currently-active - Axes. - kwargs : other keyword arguments - All other keyword arguments are passed to - :meth:`matplotlib.axes.Axes.pcolormesh`. - - Returns - ------- - ax : matplotlib Axes - Axes object with the heatmap. - - See Also - -------- - clustermap : Plot a matrix using hierarchical clustering to arrange the - rows and columns. - - Examples - -------- - - .. 
include:: ../docstrings/heatmap.rst - - """ - # Initialize the plotter object - plotter = _HeatMapper(data, vmin, vmax, cmap, center, robust, annot, fmt, - annot_kws, cbar, cbar_kws, xticklabels, - yticklabels, mask) - - # Add the pcolormesh kwargs here - kwargs["linewidths"] = linewidths - kwargs["edgecolor"] = linecolor - - # Draw the plot and return the Axes - if ax is None: - ax = plt.gca() - if square: - ax.set_aspect("equal") - plotter.plot(ax, cbar_ax, kwargs) - return ax - - -class _DendrogramPlotter: - """Object for drawing tree of similarities between data rows/columns""" - - def __init__(self, data, linkage, metric, method, axis, label, rotate): - """Plot a dendrogram of the relationships between the columns of data - - Parameters - ---------- - data : pandas.DataFrame - Rectangular data - """ - self.axis = axis - if self.axis == 1: - data = data.T - - if isinstance(data, pd.DataFrame): - array = data.values - else: - array = np.asarray(data) - data = pd.DataFrame(array) - - self.array = array - self.data = data - - self.shape = self.data.shape - self.metric = metric - self.method = method - self.axis = axis - self.label = label - self.rotate = rotate - - if linkage is None: - self.linkage = self.calculated_linkage - else: - self.linkage = linkage - self.dendrogram = self.calculate_dendrogram() - - # Dendrogram ends are always at multiples of 5, who knows why - ticks = 10 * np.arange(self.data.shape[0]) + 5 - - if self.label: - ticklabels = _index_to_ticklabels(self.data.index) - ticklabels = [ticklabels[i] for i in self.reordered_ind] - if self.rotate: - self.xticks = [] - self.yticks = ticks - self.xticklabels = [] - - self.yticklabels = ticklabels - self.ylabel = _index_to_label(self.data.index) - self.xlabel = '' - else: - self.xticks = ticks - self.yticks = [] - self.xticklabels = ticklabels - self.yticklabels = [] - self.ylabel = '' - self.xlabel = _index_to_label(self.data.index) - else: - self.xticks, self.yticks = [], [] - self.yticklabels, self.xticklabels = [], [] - self.xlabel, self.ylabel = '', '' - - self.dependent_coord = self.dendrogram['dcoord'] - self.independent_coord = self.dendrogram['icoord'] - - def _calculate_linkage_scipy(self): - linkage = hierarchy.linkage(self.array, method=self.method, - metric=self.metric) - return linkage - - def _calculate_linkage_fastcluster(self): - import fastcluster - # Fastcluster has a memory-saving vectorized version, but only - # with certain linkage methods, and mostly with euclidean metric - # vector_methods = ('single', 'centroid', 'median', 'ward') - euclidean_methods = ('centroid', 'median', 'ward') - euclidean = self.metric == 'euclidean' and self.method in \ - euclidean_methods - if euclidean or self.method == 'single': - return fastcluster.linkage_vector(self.array, - method=self.method, - metric=self.metric) - else: - linkage = fastcluster.linkage(self.array, method=self.method, - metric=self.metric) - return linkage - - @property - def calculated_linkage(self): - - try: - return self._calculate_linkage_fastcluster() - except ImportError: - if np.prod(self.shape) >= 10000: - msg = ("Clustering large matrix with scipy. Installing " - "`fastcluster` may give better performance.") - warnings.warn(msg) - - return self._calculate_linkage_scipy() - - def calculate_dendrogram(self): - """Calculates a dendrogram based on the linkage matrix - - Made a separate function, not a property because don't want to - recalculate the dendrogram every time it is accessed. 
- - Returns - ------- - dendrogram : dict - Dendrogram dictionary as returned by scipy.cluster.hierarchy - .dendrogram. The important key-value pairing is - "reordered_ind" which indicates the re-ordering of the matrix - """ - return hierarchy.dendrogram(self.linkage, no_plot=True, - color_threshold=-np.inf) - - @property - def reordered_ind(self): - """Indices of the matrix, reordered by the dendrogram""" - return self.dendrogram['leaves'] - - def plot(self, ax, tree_kws): - """Plots a dendrogram of the similarities between data on the axes - - Parameters - ---------- - ax : matplotlib.axes.Axes - Axes object upon which the dendrogram is plotted - - """ - tree_kws = {} if tree_kws is None else tree_kws.copy() - tree_kws.setdefault("linewidths", .5) - tree_kws.setdefault("colors", tree_kws.pop("color", (.2, .2, .2))) - - if self.rotate and self.axis == 0: - coords = zip(self.dependent_coord, self.independent_coord) - else: - coords = zip(self.independent_coord, self.dependent_coord) - lines = LineCollection([list(zip(x, y)) for x, y in coords], - **tree_kws) - - ax.add_collection(lines) - number_of_leaves = len(self.reordered_ind) - max_dependent_coord = max(map(max, self.dependent_coord)) - - if self.rotate: - ax.yaxis.set_ticks_position('right') - - # Constants 10 and 1.05 come from - # `scipy.cluster.hierarchy._plot_dendrogram` - ax.set_ylim(0, number_of_leaves * 10) - ax.set_xlim(0, max_dependent_coord * 1.05) - - ax.invert_xaxis() - ax.invert_yaxis() - else: - # Constants 10 and 1.05 come from - # `scipy.cluster.hierarchy._plot_dendrogram` - ax.set_xlim(0, number_of_leaves * 10) - ax.set_ylim(0, max_dependent_coord * 1.05) - - despine(ax=ax, bottom=True, left=True) - - ax.set(xticks=self.xticks, yticks=self.yticks, - xlabel=self.xlabel, ylabel=self.ylabel) - xtl = ax.set_xticklabels(self.xticklabels) - ytl = ax.set_yticklabels(self.yticklabels, rotation='vertical') - - # Force a draw of the plot to avoid matplotlib window error - _draw_figure(ax.figure) - - if len(ytl) > 0 and axis_ticklabels_overlap(ytl): - plt.setp(ytl, rotation="horizontal") - if len(xtl) > 0 and axis_ticklabels_overlap(xtl): - plt.setp(xtl, rotation="vertical") - return self - - -def dendrogram( - data, *, - linkage=None, axis=1, label=True, metric='euclidean', - method='average', rotate=False, tree_kws=None, ax=None -): - """Draw a tree diagram of relationships within a matrix - - Parameters - ---------- - data : pandas.DataFrame - Rectangular data - linkage : numpy.array, optional - Linkage matrix - axis : int, optional - Which axis to use to calculate linkage. 0 is rows, 1 is columns. - label : bool, optional - If True, label the dendrogram at leaves with column or row names - metric : str, optional - Distance metric. Anything valid for scipy.spatial.distance.pdist - method : str, optional - Linkage method to use. Anything valid for - scipy.cluster.hierarchy.linkage - rotate : bool, optional - When plotting the matrix, whether to rotate it 90 degrees - counter-clockwise, so the leaves face right - tree_kws : dict, optional - Keyword arguments for the ``matplotlib.collections.LineCollection`` - that is used for plotting the lines of the dendrogram tree. - ax : matplotlib axis, optional - Axis to plot on, otherwise uses current axis - - Returns - ------- - dendrogramplotter : _DendrogramPlotter - A Dendrogram plotter object. 
- - Notes - ----- - Access the reordered dendrogram indices with - dendrogramplotter.reordered_ind - - """ - if _no_scipy: - raise RuntimeError("dendrogram requires scipy to be installed") - - plotter = _DendrogramPlotter(data, linkage=linkage, axis=axis, - metric=metric, method=method, - label=label, rotate=rotate) - if ax is None: - ax = plt.gca() - - return plotter.plot(ax=ax, tree_kws=tree_kws) - - -class ClusterGrid(Grid): - - def __init__(self, data, pivot_kws=None, z_score=None, standard_scale=None, - figsize=None, row_colors=None, col_colors=None, mask=None, - dendrogram_ratio=None, colors_ratio=None, cbar_pos=None): - """Grid object for organizing clustered heatmap input on to axes""" - if _no_scipy: - raise RuntimeError("ClusterGrid requires scipy to be available") - - if isinstance(data, pd.DataFrame): - self.data = data - else: - self.data = pd.DataFrame(data) - - self.data2d = self.format_data(self.data, pivot_kws, z_score, - standard_scale) - - self.mask = _matrix_mask(self.data2d, mask) - - self._figure = plt.figure(figsize=figsize) - - self.row_colors, self.row_color_labels = \ - self._preprocess_colors(data, row_colors, axis=0) - self.col_colors, self.col_color_labels = \ - self._preprocess_colors(data, col_colors, axis=1) - - try: - row_dendrogram_ratio, col_dendrogram_ratio = dendrogram_ratio - except TypeError: - row_dendrogram_ratio = col_dendrogram_ratio = dendrogram_ratio - - try: - row_colors_ratio, col_colors_ratio = colors_ratio - except TypeError: - row_colors_ratio = col_colors_ratio = colors_ratio - - width_ratios = self.dim_ratios(self.row_colors, - row_dendrogram_ratio, - row_colors_ratio) - height_ratios = self.dim_ratios(self.col_colors, - col_dendrogram_ratio, - col_colors_ratio) - - nrows = 2 if self.col_colors is None else 3 - ncols = 2 if self.row_colors is None else 3 - - self.gs = gridspec.GridSpec(nrows, ncols, - width_ratios=width_ratios, - height_ratios=height_ratios) - - self.ax_row_dendrogram = self._figure.add_subplot(self.gs[-1, 0]) - self.ax_col_dendrogram = self._figure.add_subplot(self.gs[0, -1]) - self.ax_row_dendrogram.set_axis_off() - self.ax_col_dendrogram.set_axis_off() - - self.ax_row_colors = None - self.ax_col_colors = None - - if self.row_colors is not None: - self.ax_row_colors = self._figure.add_subplot( - self.gs[-1, 1]) - if self.col_colors is not None: - self.ax_col_colors = self._figure.add_subplot( - self.gs[1, -1]) - - self.ax_heatmap = self._figure.add_subplot(self.gs[-1, -1]) - if cbar_pos is None: - self.ax_cbar = self.cax = None - else: - # Initialize the colorbar axes in the gridspec so that tight_layout - # works. We will move it where it belongs later. This is a hack. - self.ax_cbar = self._figure.add_subplot(self.gs[0, 0]) - self.cax = self.ax_cbar # Backwards compatibility - self.cbar_pos = cbar_pos - - self.dendrogram_row = None - self.dendrogram_col = None - - def _preprocess_colors(self, data, colors, axis): - """Preprocess {row/col}_colors to extract labels and convert colors.""" - labels = None - - if colors is not None: - if isinstance(colors, (pd.DataFrame, pd.Series)): - - # If data is unindexed, raise - if (not hasattr(data, "index") and axis == 0) or ( - not hasattr(data, "columns") and axis == 1 - ): - axis_name = "col" if axis else "row" - msg = (f"{axis_name}_colors indices can't be matched with data " - f"indices. Provide {axis_name}_colors as a non-indexed " - "datatype, e.g. 
by using `.to_numpy()`")
-                raise TypeError(msg)
-
-            # Ensure colors match data indices
-            if axis == 0:
-                colors = colors.reindex(data.index)
-            else:
-                colors = colors.reindex(data.columns)
-
-            # Replace na's with white color
-            # TODO We should set these to transparent instead
-            colors = colors.astype(object).fillna('white')
-
-            # Extract color values and labels from frame/series
-            if isinstance(colors, pd.DataFrame):
-                labels = list(colors.columns)
-                colors = colors.T.values
-            else:
-                if colors.name is None:
-                    labels = [""]
-                else:
-                    labels = [colors.name]
-                colors = colors.values
-
-            colors = _convert_colors(colors)
-
-        return colors, labels
-
-    def format_data(self, data, pivot_kws, z_score=None,
-                    standard_scale=None):
-        """Extract variables from data or use directly."""
-
-        # Either the data is already in 2d matrix format, or need to do a pivot
-        if pivot_kws is not None:
-            data2d = data.pivot(**pivot_kws)
-        else:
-            data2d = data
-
-        if z_score is not None and standard_scale is not None:
-            raise ValueError(
-                'Cannot perform both z-scoring and standard-scaling on data')
-
-        if z_score is not None:
-            data2d = self.z_score(data2d, z_score)
-        if standard_scale is not None:
-            data2d = self.standard_scale(data2d, standard_scale)
-        return data2d
-
-    @staticmethod
-    def z_score(data2d, axis=1):
-        """Standardize the mean and variance of the data axis
-
-        Parameters
-        ----------
-        data2d : pandas.DataFrame
-            Data to normalize
-        axis : int
-            Which axis to normalize across. If 0, normalize across rows, if 1,
-            normalize across columns.
-
-        Returns
-        -------
-        normalized : pandas.DataFrame
-            Normalized data with a mean of 0 and variance of 1 across the
-            specified axis.
-        """
-        if axis == 1:
-            z_scored = data2d
-        else:
-            z_scored = data2d.T
-
-        z_scored = (z_scored - z_scored.mean()) / z_scored.std()
-
-        if axis == 1:
-            return z_scored
-        else:
-            return z_scored.T
-
-    @staticmethod
-    def standard_scale(data2d, axis=1):
-        """Divide the data by the difference between the max and min
-
-        Parameters
-        ----------
-        data2d : pandas.DataFrame
-            Data to normalize
-        axis : int
-            Which axis to normalize across. If 0, normalize across rows, if 1,
-            normalize across columns.
-
-        Returns
-        -------
-        standardized : pandas.DataFrame
-            Normalized data with values scaled to the 0 to 1 range across the
-            specified axis.
-
-        """
-        # Normalize these values to range from 0 to 1
-        if axis == 1:
-            standardized = data2d
-        else:
-            standardized = data2d.T
-
-        subtract = standardized.min()
-        standardized = (standardized - subtract) / (
-            standardized.max() - standardized.min())
-
-        if axis == 1:
-            return standardized
-        else:
-            return standardized.T
-
-    def dim_ratios(self, colors, dendrogram_ratio, colors_ratio):
-        """Get the proportions of the figure taken up by each axes."""
-        ratios = [dendrogram_ratio]
-
-        if colors is not None:
-            # Colors are encoded as rgb, so there is an extra dimension
-            if np.ndim(colors) > 2:
-                n_colors = len(colors)
-            else:
-                n_colors = 1
-
-            ratios += [n_colors * colors_ratio]
-
-        # Add the ratio for the heatmap itself
-        ratios.append(1 - sum(ratios))
-
-        return ratios
-
-    @staticmethod
-    def color_list_to_matrix_and_cmap(colors, ind, axis=0):
-        """Turns a list of colors into a numpy matrix and matplotlib colormap
-
-        These arguments can now be plotted using heatmap(matrix, cmap)
-        and the provided colors will be plotted.
-
-        Parameters
-        ----------
-        colors : list of matplotlib colors
-            Colors to label the rows or columns of a dataframe.
-        ind : list of ints
-            Ordering of the rows or columns, to reorder the original colors
-            by the clustered dendrogram order
-        axis : int
-            Which axis this is labeling
-
-        Returns
-        -------
-        matrix : numpy.array
-            A numpy array of integer values, where each indexes into the cmap
-        cmap : matplotlib.colors.ListedColormap
-
-        """
-        try:
-            mpl.colors.to_rgb(colors[0])
-        except ValueError:
-            # We have a 2D color structure
-            m, n = len(colors), len(colors[0])
-            if not all(len(c) == n for c in colors[1:]):
-                raise ValueError("Multiple side color vectors must have same size")
-        else:
-            # We have one vector of colors
-            m, n = 1, len(colors)
-            colors = [colors]
-
-        # Map from unique colors to colormap index value
-        unique_colors = {}
-        matrix = np.zeros((m, n), int)
-        for i, inner in enumerate(colors):
-            for j, color in enumerate(inner):
-                idx = unique_colors.setdefault(color, len(unique_colors))
-                matrix[i, j] = idx
-
-        # Reorder for clustering and transpose for axis
-        matrix = matrix[:, ind]
-        if axis == 0:
-            matrix = matrix.T
-
-        cmap = mpl.colors.ListedColormap(list(unique_colors))
-        return matrix, cmap
-
-    def plot_dendrograms(self, row_cluster, col_cluster, metric, method,
-                         row_linkage, col_linkage, tree_kws):
-        # Plot the row dendrogram
-        if row_cluster:
-            self.dendrogram_row = dendrogram(
-                self.data2d, metric=metric, method=method, label=False, axis=0,
-                ax=self.ax_row_dendrogram, rotate=True, linkage=row_linkage,
-                tree_kws=tree_kws
-            )
-        else:
-            self.ax_row_dendrogram.set_xticks([])
-            self.ax_row_dendrogram.set_yticks([])
-        # Plot the column dendrogram
-        if col_cluster:
-            self.dendrogram_col = dendrogram(
-                self.data2d, metric=metric, method=method, label=False,
-                axis=1, ax=self.ax_col_dendrogram, linkage=col_linkage,
-                tree_kws=tree_kws
-            )
-        else:
-            self.ax_col_dendrogram.set_xticks([])
-            self.ax_col_dendrogram.set_yticks([])
-        despine(ax=self.ax_row_dendrogram, bottom=True, left=True)
-        despine(ax=self.ax_col_dendrogram, bottom=True, left=True)
-
-    def plot_colors(self, xind, yind, **kws):
-        """Plots color labels between the dendrogram and the heatmap
-
-        Parameters
-        ----------
-        kws : dict
-            Keyword arguments passed through to :func:`heatmap`
-
-        """
-        # Remove any custom colormap and centering
-        # TODO this code has consistently caused problems when we have
-        # missed kwargs that need to be excluded; it might be better to
-        # rewrite it *in*clusively.
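        # A possible inclusive rewrite, sketched as a hypothetical (the
        # allow-list name and contents below are illustrative, not an
        # actual seaborn constant):
        #
        #     _SIDE_COLOR_SAFE_KWS = {"linewidths", "linecolor", "rasterized"}
        #     kws = {k: v for k, v in kws.items() if k in _SIDE_COLOR_SAFE_KWS}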
- kws = kws.copy() - kws.pop('cmap', None) - kws.pop('norm', None) - kws.pop('center', None) - kws.pop('annot', None) - kws.pop('vmin', None) - kws.pop('vmax', None) - kws.pop('robust', None) - kws.pop('xticklabels', None) - kws.pop('yticklabels', None) - - # Plot the row colors - if self.row_colors is not None: - matrix, cmap = self.color_list_to_matrix_and_cmap( - self.row_colors, yind, axis=0) - - # Get row_color labels - if self.row_color_labels is not None: - row_color_labels = self.row_color_labels - else: - row_color_labels = False - - heatmap(matrix, cmap=cmap, cbar=False, ax=self.ax_row_colors, - xticklabels=row_color_labels, yticklabels=False, **kws) - - # Adjust rotation of labels - if row_color_labels is not False: - plt.setp(self.ax_row_colors.get_xticklabels(), rotation=90) - else: - despine(self.ax_row_colors, left=True, bottom=True) - - # Plot the column colors - if self.col_colors is not None: - matrix, cmap = self.color_list_to_matrix_and_cmap( - self.col_colors, xind, axis=1) - - # Get col_color labels - if self.col_color_labels is not None: - col_color_labels = self.col_color_labels - else: - col_color_labels = False - - heatmap(matrix, cmap=cmap, cbar=False, ax=self.ax_col_colors, - xticklabels=False, yticklabels=col_color_labels, **kws) - - # Adjust rotation of labels, place on right side - if col_color_labels is not False: - self.ax_col_colors.yaxis.tick_right() - plt.setp(self.ax_col_colors.get_yticklabels(), rotation=0) - else: - despine(self.ax_col_colors, left=True, bottom=True) - - def plot_matrix(self, colorbar_kws, xind, yind, **kws): - self.data2d = self.data2d.iloc[yind, xind] - self.mask = self.mask.iloc[yind, xind] - - # Try to reorganize specified tick labels, if provided - xtl = kws.pop("xticklabels", "auto") - try: - xtl = np.asarray(xtl)[xind] - except (TypeError, IndexError): - pass - ytl = kws.pop("yticklabels", "auto") - try: - ytl = np.asarray(ytl)[yind] - except (TypeError, IndexError): - pass - - # Reorganize the annotations to match the heatmap - annot = kws.pop("annot", None) - if annot is None or annot is False: - pass - else: - if isinstance(annot, bool): - annot_data = self.data2d - else: - annot_data = np.asarray(annot) - if annot_data.shape != self.data2d.shape: - err = "`data` and `annot` must have same shape." - raise ValueError(err) - annot_data = annot_data[yind][:, xind] - annot = annot_data - - # Setting ax_cbar=None in clustermap call implies no colorbar - kws.setdefault("cbar", self.ax_cbar is not None) - heatmap(self.data2d, ax=self.ax_heatmap, cbar_ax=self.ax_cbar, - cbar_kws=colorbar_kws, mask=self.mask, - xticklabels=xtl, yticklabels=ytl, annot=annot, **kws) - - ytl = self.ax_heatmap.get_yticklabels() - ytl_rot = None if not ytl else ytl[0].get_rotation() - self.ax_heatmap.yaxis.set_ticks_position('right') - self.ax_heatmap.yaxis.set_label_position('right') - if ytl_rot is not None: - ytl = self.ax_heatmap.get_yticklabels() - plt.setp(ytl, rotation=ytl_rot) - - tight_params = dict(h_pad=.02, w_pad=.02) - if self.ax_cbar is None: - self._figure.tight_layout(**tight_params) - else: - # Turn the colorbar axes off for tight layout so that its - # ticks don't interfere with the rest of the plot layout. - # Then move it. 
-            self.ax_cbar.set_axis_off()
-            self._figure.tight_layout(**tight_params)
-            self.ax_cbar.set_axis_on()
-            self.ax_cbar.set_position(self.cbar_pos)
-
-    def plot(self, metric, method, colorbar_kws, row_cluster, col_cluster,
-             row_linkage, col_linkage, tree_kws, **kws):
-
-        # heatmap square=True sets the aspect ratio on the axes, but that is
-        # not compatible with the multi-axes layout of clustergrid
-        if kws.get("square", False):
-            msg = "``square=True`` ignored in clustermap"
-            warnings.warn(msg)
-            kws.pop("square")
-
-        colorbar_kws = {} if colorbar_kws is None else colorbar_kws
-
-        self.plot_dendrograms(row_cluster, col_cluster, metric, method,
-                              row_linkage=row_linkage, col_linkage=col_linkage,
-                              tree_kws=tree_kws)
-        try:
-            xind = self.dendrogram_col.reordered_ind
-        except AttributeError:
-            xind = np.arange(self.data2d.shape[1])
-        try:
-            yind = self.dendrogram_row.reordered_ind
-        except AttributeError:
-            yind = np.arange(self.data2d.shape[0])
-
-        self.plot_colors(xind, yind, **kws)
-        self.plot_matrix(colorbar_kws, xind, yind, **kws)
-        return self
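The `try`/`except AttributeError` blocks above fall back to the identity ordering when a dendrogram was not drawn. When clustering is on, the same permutation is exposed on the returned grid; a short sketch (random data, assuming scipy is installed):

```python
import numpy as np
import seaborn as sns

data = np.random.default_rng(0).normal(size=(12, 6))
g = sns.clustermap(data)

# Recover the permutation the dendrograms applied to rows and columns
row_order = g.dendrogram_row.reordered_ind
col_order = g.dendrogram_col.reordered_ind
reordered = data[np.ix_(row_order, col_order)]
```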
-
-
-def clustermap(
-    data, *,
-    pivot_kws=None, method='average', metric='euclidean',
-    z_score=None, standard_scale=None, figsize=(10, 10),
-    cbar_kws=None, row_cluster=True, col_cluster=True,
-    row_linkage=None, col_linkage=None,
-    row_colors=None, col_colors=None, mask=None,
-    dendrogram_ratio=.2, colors_ratio=0.03,
-    cbar_pos=(.02, .8, .05, .18), tree_kws=None,
-    **kwargs
-):
-    """
-    Plot a matrix dataset as a hierarchically-clustered heatmap.
-
-    This function requires scipy to be available.
-
-    Parameters
-    ----------
-    data : 2D array-like
-        Rectangular data for clustering. Cannot contain NAs.
-    pivot_kws : dict, optional
-        If `data` is a tidy dataframe, can provide keyword arguments for
-        pivot to create a rectangular dataframe.
-    method : str, optional
-        Linkage method to use for calculating clusters. See
-        :func:`scipy.cluster.hierarchy.linkage` documentation for more
-        information.
-    metric : str, optional
-        Distance metric to use for the data. See
-        :func:`scipy.spatial.distance.pdist` documentation for more options.
-        To use different metrics (or methods) for rows and columns, you may
-        construct each linkage matrix yourself and provide them as
-        `{row,col}_linkage`.
-    z_score : int or None, optional
-        Either 0 (rows) or 1 (columns). Whether or not to calculate z-scores
-        for the rows or the columns. Z scores are: z = (x - mean)/std, so
-        values in each row (column) will get the mean of the row (column)
-        subtracted, then divided by the standard deviation of the row (column).
-        This ensures that each row (column) has mean of 0 and variance of 1.
-    standard_scale : int or None, optional
-        Either 0 (rows) or 1 (columns). Whether or not to standardize that
-        dimension, meaning for each row or column, subtract the minimum and
-        divide each by its maximum.
-    figsize : tuple of (width, height), optional
-        Overall size of the figure.
-    cbar_kws : dict, optional
-        Keyword arguments to pass to `cbar_kws` in :func:`heatmap`, e.g. to
-        add a label to the colorbar.
-    {row,col}_cluster : bool, optional
-        If ``True``, cluster the {rows, columns}.
-    {row,col}_linkage : :class:`numpy.ndarray`, optional
-        Precomputed linkage matrix for the rows or columns. See
-        :func:`scipy.cluster.hierarchy.linkage` for specific formats.
-    {row,col}_colors : list-like or pandas DataFrame/Series, optional
-        List of colors to label for either the rows or columns. Useful to evaluate
-        whether samples within a group are clustered together. Can use nested lists or
-        DataFrame for multiple color levels of labeling. If given as a
-        :class:`pandas.DataFrame` or :class:`pandas.Series`, labels for the colors are
-        extracted from the DataFrame's column names or from the name of the Series.
-        DataFrame/Series colors are also matched to the data by their index, ensuring
-        colors are drawn in the correct order.
-    mask : bool array or DataFrame, optional
-        If passed, data will not be shown in cells where `mask` is True.
-        Cells with missing values are automatically masked. Only used for
-        visualizing, not for calculating.
-    {dendrogram,colors}_ratio : float, or pair of floats, optional
-        Proportion of the figure size devoted to the two marginal elements. If
-        a pair is given, they correspond to (row, col) ratios.
-    cbar_pos : tuple of (left, bottom, width, height), optional
-        Position of the colorbar axes in the figure. Setting to ``None`` will
-        disable the colorbar.
-    tree_kws : dict, optional
-        Parameters for the :class:`matplotlib.collections.LineCollection`
-        that is used to plot the lines of the dendrogram tree.
-    kwargs : other keyword arguments
-        All other keyword arguments are passed to :func:`heatmap`.
-
-    Returns
-    -------
-    :class:`ClusterGrid`
-        A :class:`ClusterGrid` instance.
-
-    See Also
-    --------
-    heatmap : Plot rectangular data as a color-encoded matrix.
-
-    Notes
-    -----
-    The returned object has a ``savefig`` method that should be used if you
-    want to save the figure object without clipping the dendrograms.
-
-    To access the reordered row indices, use:
-    ``clustergrid.dendrogram_row.reordered_ind``
-
-    To access the reordered column indices, use:
-    ``clustergrid.dendrogram_col.reordered_ind``
-
-    Examples
-    --------
-
-    .. include:: ../docstrings/clustermap.rst
-
-    """
-    if _no_scipy:
-        raise RuntimeError("clustermap requires scipy to be available")
-
-    plotter = ClusterGrid(data, pivot_kws=pivot_kws, figsize=figsize,
-                          row_colors=row_colors, col_colors=col_colors,
-                          z_score=z_score, standard_scale=standard_scale,
-                          mask=mask, dendrogram_ratio=dendrogram_ratio,
-                          colors_ratio=colors_ratio, cbar_pos=cbar_pos)
-
-    return plotter.plot(metric=metric, method=method,
-                        colorbar_kws=cbar_kws,
-                        row_cluster=row_cluster, col_cluster=col_cluster,
-                        row_linkage=row_linkage, col_linkage=col_linkage,
-                        tree_kws=tree_kws, **kwargs)
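As the docstring notes, mixing metrics or methods between rows and columns is done by precomputing the linkages yourself; a sketch, assuming scipy is installed:

```python
import numpy as np
import seaborn as sns
from scipy.cluster.hierarchy import linkage

data = np.random.default_rng(1).normal(size=(20, 8))

# Different settings per axis via precomputed linkage matrices
row_link = linkage(data, method="average", metric="correlation")
col_link = linkage(data.T, method="ward", metric="euclidean")

g = sns.clustermap(data, row_linkage=row_link, col_linkage=col_link)
g.savefig("clustered.png")  # grid-level savefig avoids clipping the dendrograms
```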
diff --git a/seaborn/miscplot.py b/seaborn/miscplot.py
deleted file mode 100644
index 3fb290c812f8de293c9731ecd6bc83ee88fd2239..0000000000000000000000000000000000000000
--- a/seaborn/miscplot.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import numpy as np
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-
-__all__ = ["palplot", "dogplot"]
-
-
-def palplot(pal, size=1):
-    """Plot the values in a color palette as a horizontal array.
-
-    Parameters
-    ----------
-    pal : sequence of matplotlib colors
-        colors, i.e. as returned by seaborn.color_palette()
-    size :
-        scaling factor for size of plot
-
-    """
-    n = len(pal)
-    _, ax = plt.subplots(1, 1, figsize=(n * size, size))
-    ax.imshow(np.arange(n).reshape(1, n),
-              cmap=mpl.colors.ListedColormap(list(pal)),
-              interpolation="nearest", aspect="auto")
-    ax.set_xticks(np.arange(n) - .5)
-    ax.set_yticks([-.5, .5])
-    # Ensure nice border between colors
-    ax.set_xticklabels(["" for _ in range(n)])
-    # The proper way to set no ticks
-    ax.yaxis.set_major_locator(ticker.NullLocator())
-
-
-def dogplot(*_, **__):
-    """Who's a good boy?"""
-    from urllib.request import urlopen
-    from io import BytesIO
-
-    url = "https://github.com/mwaskom/seaborn-data/raw/master/png/img{}.png"
-    pic = np.random.randint(2, 7)
-    data = BytesIO(urlopen(url.format(pic)).read())
-    img = plt.imread(data)
-    f, ax = plt.subplots(figsize=(5, 5), dpi=100)
-    f.subplots_adjust(0, 0, 1, 1)
-    ax.imshow(img)
-    ax.set_axis_off()
diff --git a/seaborn/objects.py b/seaborn/objects.py
deleted file mode 100644
index 123e57f0a936e8e73c684dd647b9813da86a3f60..0000000000000000000000000000000000000000
--- a/seaborn/objects.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-A declarative, object-oriented interface for creating statistical graphics.
-
-The seaborn.objects namespace contains a number of classes that can be composed
-together to build a customized visualization.
-
-The main object is :class:`Plot`, which is the starting point for all figures.
-Pass :class:`Plot` a dataset and specify assignments from its variables to
-roles in the plot. Build up the visualization by calling its methods.
-
-There are four other general types of objects in this interface:
-
-- :class:`Mark` subclasses, which create matplotlib artists for visualization
-- :class:`Stat` subclasses, which apply statistical transforms before plotting
-- :class:`Move` subclasses, which make further adjustments to reduce overplotting
-
-These classes are passed to :meth:`Plot.add` to define a layer in the plot.
-Each layer has a :class:`Mark` and optional :class:`Stat` and/or :class:`Move`.
-Plots can have multiple layers.
-
-The other general type of object is a :class:`Scale` subclass, which provides an
-interface for controlling the mappings between data values and visual properties.
-Pass :class:`Scale` objects to :meth:`Plot.scale`.
-
-See the documentation for other :class:`Plot` methods to learn about the many
-ways that a plot can be enhanced and customized.
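A minimal composition in the style this docstring describes, with a hypothetical DataFrame `df` (one `Mark` per layer, an optional `Stat`, and a `Scale` override):

```python
import pandas as pd
import seaborn.objects as so

df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 4, 9, 16, 25]})

(
    so.Plot(df, x="x", y="y")
    .add(so.Dot())                        # layer 1: a Mark alone
    .add(so.Line(), so.PolyFit(order=2))  # layer 2: Mark + Stat
    .scale(y=so.Continuous(trans="log"))  # Scale objects go to .scale()
    .show()
)
```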
- -""" -from seaborn._core.plot import Plot # noqa: F401 - -from seaborn._marks.base import Mark # noqa: F401 -from seaborn._marks.area import Area, Band # noqa: F401 -from seaborn._marks.bar import Bar, Bars # noqa: F401 -from seaborn._marks.dot import Dot, Dots # noqa: F401 -from seaborn._marks.line import Dash, Line, Lines, Path, Paths, Range # noqa: F401 -from seaborn._marks.text import Text # noqa: F401 - -from seaborn._stats.base import Stat # noqa: F401 -from seaborn._stats.aggregation import Agg, Est # noqa: F401 -from seaborn._stats.counting import Count, Hist # noqa: F401 -from seaborn._stats.density import KDE # noqa: F401 -from seaborn._stats.order import Perc # noqa: F401 -from seaborn._stats.regression import PolyFit # noqa: F401 - -from seaborn._core.moves import Dodge, Jitter, Norm, Shift, Stack, Move # noqa: F401 - -from seaborn._core.scales import ( # noqa: F401 - Boolean, Continuous, Nominal, Temporal, Scale -) diff --git a/seaborn/palettes.py b/seaborn/palettes.py deleted file mode 100644 index f7f4298436f6fd83d819f314c0ad2cbe0db4b257..0000000000000000000000000000000000000000 --- a/seaborn/palettes.py +++ /dev/null @@ -1,841 +0,0 @@ -import colorsys -from itertools import cycle - -import numpy as np -import matplotlib as mpl - -from .external import husl - -from .utils import desaturate, get_color_cycle -from .colors import xkcd_rgb, crayons -from ._compat import get_colormap - - -__all__ = ["color_palette", "hls_palette", "husl_palette", "mpl_palette", - "dark_palette", "light_palette", "diverging_palette", - "blend_palette", "xkcd_palette", "crayon_palette", - "cubehelix_palette", "set_color_codes"] - - -SEABORN_PALETTES = dict( - deep=["#4C72B0", "#DD8452", "#55A868", "#C44E52", "#8172B3", - "#937860", "#DA8BC3", "#8C8C8C", "#CCB974", "#64B5CD"], - deep6=["#4C72B0", "#55A868", "#C44E52", - "#8172B3", "#CCB974", "#64B5CD"], - muted=["#4878D0", "#EE854A", "#6ACC64", "#D65F5F", "#956CB4", - "#8C613C", "#DC7EC0", "#797979", "#D5BB67", "#82C6E2"], - muted6=["#4878D0", "#6ACC64", "#D65F5F", - "#956CB4", "#D5BB67", "#82C6E2"], - pastel=["#A1C9F4", "#FFB482", "#8DE5A1", "#FF9F9B", "#D0BBFF", - "#DEBB9B", "#FAB0E4", "#CFCFCF", "#FFFEA3", "#B9F2F0"], - pastel6=["#A1C9F4", "#8DE5A1", "#FF9F9B", - "#D0BBFF", "#FFFEA3", "#B9F2F0"], - bright=["#023EFF", "#FF7C00", "#1AC938", "#E8000B", "#8B2BE2", - "#9F4800", "#F14CC1", "#A3A3A3", "#FFC400", "#00D7FF"], - bright6=["#023EFF", "#1AC938", "#E8000B", - "#8B2BE2", "#FFC400", "#00D7FF"], - dark=["#001C7F", "#B1400D", "#12711C", "#8C0800", "#591E71", - "#592F0D", "#A23582", "#3C3C3C", "#B8850A", "#006374"], - dark6=["#001C7F", "#12711C", "#8C0800", - "#591E71", "#B8850A", "#006374"], - colorblind=["#0173B2", "#DE8F05", "#029E73", "#D55E00", "#CC78BC", - "#CA9161", "#FBAFE4", "#949494", "#ECE133", "#56B4E9"], - colorblind6=["#0173B2", "#029E73", "#D55E00", - "#CC78BC", "#ECE133", "#56B4E9"] -) - - -MPL_QUAL_PALS = { - "tab10": 10, "tab20": 20, "tab20b": 20, "tab20c": 20, - "Set1": 9, "Set2": 8, "Set3": 12, - "Accent": 8, "Paired": 12, - "Pastel1": 9, "Pastel2": 8, "Dark2": 8, -} - - -QUAL_PALETTE_SIZES = MPL_QUAL_PALS.copy() -QUAL_PALETTE_SIZES.update({k: len(v) for k, v in SEABORN_PALETTES.items()}) -QUAL_PALETTES = list(QUAL_PALETTE_SIZES.keys()) - - -class _ColorPalette(list): - """Set the color palette in a with statement, otherwise be a list.""" - def __enter__(self): - """Open the context.""" - from .rcmod import set_palette - self._orig_palette = color_palette() - set_palette(self) - return self - - def __exit__(self, *args): - 
"""Close the context.""" - from .rcmod import set_palette - set_palette(self._orig_palette) - - def as_hex(self): - """Return a color palette with hex codes instead of RGB values.""" - hex = [mpl.colors.rgb2hex(rgb) for rgb in self] - return _ColorPalette(hex) - - def _repr_html_(self): - """Rich display of the color palette in an HTML frontend.""" - s = 55 - n = len(self) - html = f'<svg width="{n * s}" height="{s}">' - for i, c in enumerate(self.as_hex()): - html += ( - f'<rect x="{i * s}" y="0" width="{s}" height="{s}" style="fill:{c};' - 'stroke-width:2;stroke:rgb(255,255,255)"/>' - ) - html += '</svg>' - return html - - -def _patch_colormap_display(): - """Simplify the rich display of matplotlib color maps in a notebook.""" - def _repr_png_(self): - """Generate a PNG representation of the Colormap.""" - import io - from PIL import Image - import numpy as np - IMAGE_SIZE = (400, 50) - X = np.tile(np.linspace(0, 1, IMAGE_SIZE[0]), (IMAGE_SIZE[1], 1)) - pixels = self(X, bytes=True) - png_bytes = io.BytesIO() - Image.fromarray(pixels).save(png_bytes, format='png') - return png_bytes.getvalue() - - def _repr_html_(self): - """Generate an HTML representation of the Colormap.""" - import base64 - png_bytes = self._repr_png_() - png_base64 = base64.b64encode(png_bytes).decode('ascii') - return ('<img ' - + 'alt="' + self.name + ' color map" ' - + 'title="' + self.name + '"' - + 'src="data:image/png;base64,' + png_base64 + '">') - - mpl.colors.Colormap._repr_png_ = _repr_png_ - mpl.colors.Colormap._repr_html_ = _repr_html_ - - -def color_palette(palette=None, n_colors=None, desat=None, as_cmap=False): - """Return a list of colors or continuous colormap defining a palette. - - Possible ``palette`` values include: - - Name of a seaborn palette (deep, muted, bright, pastel, dark, colorblind) - - Name of matplotlib colormap - - 'husl' or 'hls' - - 'ch:<cubehelix arguments>' - - 'light:<color>', 'dark:<color>', 'blend:<color>,<color>', - - A sequence of colors in any format matplotlib accepts - - Calling this function with ``palette=None`` will return the current - matplotlib color cycle. - - This function can also be used in a ``with`` statement to temporarily - set the color cycle for a plot or set of plots. - - See the :ref:`tutorial <palette_tutorial>` for more information. - - Parameters - ---------- - palette : None, string, or sequence, optional - Name of palette or None to return current palette. If a sequence, input - colors are used but possibly cycled and desaturated. - n_colors : int, optional - Number of colors in the palette. If ``None``, the default will depend - on how ``palette`` is specified. Named palettes default to 6 colors, - but grabbing the current palette or passing in a list of colors will - not change the number of colors unless this is specified. Asking for - more colors than exist in the palette will cause it to cycle. Ignored - when ``as_cmap`` is True. - desat : float, optional - Proportion to desaturate each color by. - as_cmap : bool - If True, return a :class:`matplotlib.colors.ListedColormap`. - - Returns - ------- - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - set_palette : Set the default color cycle for all plots. - set_color_codes : Reassign color codes like ``"b"``, ``"g"``, etc. to - colors from one of the seaborn palettes. - - Examples - -------- - - .. 
include:: ../docstrings/color_palette.rst - - """ - if palette is None: - palette = get_color_cycle() - if n_colors is None: - n_colors = len(palette) - - elif not isinstance(palette, str): - palette = palette - if n_colors is None: - n_colors = len(palette) - else: - - if n_colors is None: - # Use all colors in a qualitative palette or 6 of another kind - n_colors = QUAL_PALETTE_SIZES.get(palette, 6) - - if palette in SEABORN_PALETTES: - # Named "seaborn variant" of matplotlib default color cycle - palette = SEABORN_PALETTES[palette] - - elif palette == "hls": - # Evenly spaced colors in cylindrical RGB space - palette = hls_palette(n_colors, as_cmap=as_cmap) - - elif palette == "husl": - # Evenly spaced colors in cylindrical Lab space - palette = husl_palette(n_colors, as_cmap=as_cmap) - - elif palette.lower() == "jet": - # Paternalism - raise ValueError("No.") - - elif palette.startswith("ch:"): - # Cubehelix palette with params specified in string - args, kwargs = _parse_cubehelix_args(palette) - palette = cubehelix_palette(n_colors, *args, **kwargs, as_cmap=as_cmap) - - elif palette.startswith("light:"): - # light palette to color specified in string - _, color = palette.split(":") - reverse = color.endswith("_r") - if reverse: - color = color[:-2] - palette = light_palette(color, n_colors, reverse=reverse, as_cmap=as_cmap) - - elif palette.startswith("dark:"): - # light palette to color specified in string - _, color = palette.split(":") - reverse = color.endswith("_r") - if reverse: - color = color[:-2] - palette = dark_palette(color, n_colors, reverse=reverse, as_cmap=as_cmap) - - elif palette.startswith("blend:"): - # blend palette between colors specified in string - _, colors = palette.split(":") - colors = colors.split(",") - palette = blend_palette(colors, n_colors, as_cmap=as_cmap) - - else: - try: - # Perhaps a named matplotlib colormap? - palette = mpl_palette(palette, n_colors, as_cmap=as_cmap) - except (ValueError, KeyError): # Error class changed in mpl36 - raise ValueError(f"{palette!r} is not a valid palette name") - - if desat is not None: - palette = [desaturate(c, desat) for c in palette] - - if not as_cmap: - - # Always return as many colors as we asked for - pal_cycle = cycle(palette) - palette = [next(pal_cycle) for _ in range(n_colors)] - - # Always return in r, g, b tuple format - try: - palette = map(mpl.colors.colorConverter.to_rgb, palette) - palette = _ColorPalette(palette) - except ValueError: - raise ValueError(f"Could not generate a palette for {palette}") - - return palette - - -def hls_palette(n_colors=6, h=.01, l=.6, s=.65, as_cmap=False): # noqa - """ - Return hues with constant lightness and saturation in the HLS system. - - The hues are evenly sampled along a circular path. The resulting palette will be - appropriate for categorical or cyclical data. - - The `h`, `l`, and `s` values should be between 0 and 1. - - .. note:: - While the separation of the resulting colors will be mathematically - constant, the HLS system does not construct a perceptually-uniform space, - so their apparent intensity will vary. - - Parameters - ---------- - n_colors : int - Number of colors in the palette. - h : float - The value of the first hue. - l : float - The lightness value. - s : float - The saturation intensity. - as_cmap : bool - If True, return a matplotlib colormap object. 
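The string-prefix dispatch in `color_palette` above maps directly onto the constructors defined later in this module; for instance (all documented palette specs):

```python
import seaborn as sns

sns.color_palette("ch:s=.25,rot=-.25")            # cubehelix_palette
sns.color_palette("light:seagreen")               # light_palette
sns.color_palette("dark:#5A9_r")                  # dark_palette, reversed
sns.color_palette("blend:#7AB,#EDA", n_colors=8)  # blend_palette
sns.color_palette("viridis", as_cmap=True)        # a matplotlib colormap
```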
- - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - husl_palette : Make a palette using evenly spaced hues in the HUSL system. - - Examples - -------- - .. include:: ../docstrings/hls_palette.rst - - """ - if as_cmap: - n_colors = 256 - hues = np.linspace(0, 1, int(n_colors) + 1)[:-1] - hues += h - hues %= 1 - hues -= hues.astype(int) - palette = [colorsys.hls_to_rgb(h_i, l, s) for h_i in hues] - if as_cmap: - return mpl.colors.ListedColormap(palette, "hls") - else: - return _ColorPalette(palette) - - -def husl_palette(n_colors=6, h=.01, s=.9, l=.65, as_cmap=False): # noqa - """ - Return hues with constant lightness and saturation in the HUSL system. - - The hues are evenly sampled along a circular path. The resulting palette will be - appropriate for categorical or cyclical data. - - The `h`, `l`, and `s` values should be between 0 and 1. - - This function is similar to :func:`hls_palette`, but it uses a nonlinear color - space that is more perceptually uniform. - - Parameters - ---------- - n_colors : int - Number of colors in the palette. - h : float - The value of the first hue. - l : float - The lightness value. - s : float - The saturation intensity. - as_cmap : bool - If True, return a matplotlib colormap object. - - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - hls_palette : Make a palette using evenly spaced hues in the HSL system. - - Examples - -------- - .. include:: ../docstrings/husl_palette.rst - - """ - if as_cmap: - n_colors = 256 - hues = np.linspace(0, 1, int(n_colors) + 1)[:-1] - hues += h - hues %= 1 - hues *= 359 - s *= 99 - l *= 99 # noqa - palette = [_color_to_rgb((h_i, s, l), input="husl") for h_i in hues] - if as_cmap: - return mpl.colors.ListedColormap(palette, "hsl") - else: - return _ColorPalette(palette) - - -def mpl_palette(name, n_colors=6, as_cmap=False): - """ - Return a palette or colormap from the matplotlib registry. - - For continuous palettes, evenly-spaced discrete samples are chosen while - excluding the minimum and maximum value in the colormap to provide better - contrast at the extremes. - - For qualitative palettes (e.g. those from colorbrewer), exact values are - indexed (rather than interpolated), but fewer than `n_colors` can be returned - if the palette does not define that many. - - Parameters - ---------- - name : string - Name of the palette. This should be a named matplotlib colormap. - n_colors : int - Number of discrete colors in the palette. - - Returns - ------- - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - Examples - -------- - .. 
include:: ../docstrings/mpl_palette.rst - - """ - if name.endswith("_d"): - sub_name = name[:-2] - if sub_name.endswith("_r"): - reverse = True - sub_name = sub_name[:-2] - else: - reverse = False - pal = color_palette(sub_name, 2) + ["#333333"] - if reverse: - pal = pal[::-1] - cmap = blend_palette(pal, n_colors, as_cmap=True) - else: - cmap = get_colormap(name) - - if name in MPL_QUAL_PALS: - bins = np.linspace(0, 1, MPL_QUAL_PALS[name])[:n_colors] - else: - bins = np.linspace(0, 1, int(n_colors) + 2)[1:-1] - palette = list(map(tuple, cmap(bins)[:, :3])) - - if as_cmap: - return cmap - else: - return _ColorPalette(palette) - - -def _color_to_rgb(color, input): - """Add some more flexibility to color choices.""" - if input == "hls": - color = colorsys.hls_to_rgb(*color) - elif input == "husl": - color = husl.husl_to_rgb(*color) - color = tuple(np.clip(color, 0, 1)) - elif input == "xkcd": - color = xkcd_rgb[color] - - return mpl.colors.to_rgb(color) - - -def dark_palette(color, n_colors=6, reverse=False, as_cmap=False, input="rgb"): - """Make a sequential palette that blends from dark to ``color``. - - This kind of palette is good for data that range between relatively - uninteresting low values and interesting high values. - - The ``color`` parameter can be specified in a number of ways, including - all options for defining a color in matplotlib and several additional - color spaces that are handled by seaborn. You can also use the database - of named colors from the XKCD color survey. - - If you are using the IPython notebook, you can also choose this palette - interactively with the :func:`choose_dark_palette` function. - - Parameters - ---------- - color : base color for high values - hex, rgb-tuple, or html color name - n_colors : int, optional - number of colors in the palette - reverse : bool, optional - if True, reverse the direction of the blend - as_cmap : bool, optional - If True, return a :class:`matplotlib.colors.ListedColormap`. - input : {'rgb', 'hls', 'husl', xkcd'} - Color space to interpret the input color. The first three options - apply to tuple inputs and the latter applies to string inputs. - - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - light_palette : Create a sequential palette with bright low values. - diverging_palette : Create a diverging palette with two colors. - - Examples - -------- - .. include:: ../docstrings/dark_palette.rst - - """ - rgb = _color_to_rgb(color, input) - hue, sat, _ = husl.rgb_to_husl(*rgb) - gray_s, gray_l = .15 * sat, 15 - gray = _color_to_rgb((hue, gray_s, gray_l), input="husl") - colors = [rgb, gray] if reverse else [gray, rgb] - return blend_palette(colors, n_colors, as_cmap) - - -def light_palette(color, n_colors=6, reverse=False, as_cmap=False, input="rgb"): - """Make a sequential palette that blends from light to ``color``. - - The ``color`` parameter can be specified in a number of ways, including - all options for defining a color in matplotlib and several additional - color spaces that are handled by seaborn. You can also use the database - of named colors from the XKCD color survey. - - If you are using a Jupyter notebook, you can also choose this palette - interactively with the :func:`choose_light_palette` function. - - Parameters - ---------- - color : base color for high values - hex code, html color name, or tuple in `input` space. 
- n_colors : int, optional - number of colors in the palette - reverse : bool, optional - if True, reverse the direction of the blend - as_cmap : bool, optional - If True, return a :class:`matplotlib.colors.ListedColormap`. - input : {'rgb', 'hls', 'husl', xkcd'} - Color space to interpret the input color. The first three options - apply to tuple inputs and the latter applies to string inputs. - - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - dark_palette : Create a sequential palette with dark low values. - diverging_palette : Create a diverging palette with two colors. - - Examples - -------- - .. include:: ../docstrings/light_palette.rst - - """ - rgb = _color_to_rgb(color, input) - hue, sat, _ = husl.rgb_to_husl(*rgb) - gray_s, gray_l = .15 * sat, 95 - gray = _color_to_rgb((hue, gray_s, gray_l), input="husl") - colors = [rgb, gray] if reverse else [gray, rgb] - return blend_palette(colors, n_colors, as_cmap) - - -def diverging_palette(h_neg, h_pos, s=75, l=50, sep=1, n=6, # noqa - center="light", as_cmap=False): - """Make a diverging palette between two HUSL colors. - - If you are using the IPython notebook, you can also choose this palette - interactively with the :func:`choose_diverging_palette` function. - - Parameters - ---------- - h_neg, h_pos : float in [0, 359] - Anchor hues for negative and positive extents of the map. - s : float in [0, 100], optional - Anchor saturation for both extents of the map. - l : float in [0, 100], optional - Anchor lightness for both extents of the map. - sep : int, optional - Size of the intermediate region. - n : int, optional - Number of colors in the palette (if not returning a cmap) - center : {"light", "dark"}, optional - Whether the center of the palette is light or dark - as_cmap : bool, optional - If True, return a :class:`matplotlib.colors.ListedColormap`. - - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - dark_palette : Create a sequential palette with dark values. - light_palette : Create a sequential palette with light values. - - Examples - -------- - .. include: ../docstrings/diverging_palette.rst - - """ - palfunc = dict(dark=dark_palette, light=light_palette)[center] - n_half = int(128 - (sep // 2)) - neg = palfunc((h_neg, s, l), n_half, reverse=True, input="husl") - pos = palfunc((h_pos, s, l), n_half, input="husl") - midpoint = dict(light=[(.95, .95, .95)], dark=[(.133, .133, .133)])[center] - mid = midpoint * sep - pal = blend_palette(np.concatenate([neg, mid, pos]), n, as_cmap=as_cmap) - return pal - - -def blend_palette(colors, n_colors=6, as_cmap=False, input="rgb"): - """Make a palette that blends between a list of colors. - - Parameters - ---------- - colors : sequence of colors in various formats interpreted by `input` - hex code, html color name, or tuple in `input` space. - n_colors : int, optional - Number of colors in the palette. - as_cmap : bool, optional - If True, return a :class:`matplotlib.colors.ListedColormap`. - - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - Examples - -------- - .. 
include: ../docstrings/blend_palette.rst - - """ - colors = [_color_to_rgb(color, input) for color in colors] - name = "blend" - pal = mpl.colors.LinearSegmentedColormap.from_list(name, colors) - if not as_cmap: - rgb_array = pal(np.linspace(0, 1, int(n_colors)))[:, :3] # no alpha - pal = _ColorPalette(map(tuple, rgb_array)) - return pal - - -def xkcd_palette(colors): - """Make a palette with color names from the xkcd color survey. - - See xkcd for the full list of colors: https://xkcd.com/color/rgb/ - - This is just a simple wrapper around the `seaborn.xkcd_rgb` dictionary. - - Parameters - ---------- - colors : list of strings - List of keys in the `seaborn.xkcd_rgb` dictionary. - - Returns - ------- - palette - A list of colors as RGB tuples. - - See Also - -------- - crayon_palette : Make a palette with Crayola crayon colors. - - """ - palette = [xkcd_rgb[name] for name in colors] - return color_palette(palette, len(palette)) - - -def crayon_palette(colors): - """Make a palette with color names from Crayola crayons. - - Colors are taken from here: - https://en.wikipedia.org/wiki/List_of_Crayola_crayon_colors - - This is just a simple wrapper around the `seaborn.crayons` dictionary. - - Parameters - ---------- - colors : list of strings - List of keys in the `seaborn.crayons` dictionary. - - Returns - ------- - palette - A list of colors as RGB tuples. - - See Also - -------- - xkcd_palette : Make a palette with named colors from the XKCD color survey. - - """ - palette = [crayons[name] for name in colors] - return color_palette(palette, len(palette)) - - -def cubehelix_palette(n_colors=6, start=0, rot=.4, gamma=1.0, hue=0.8, - light=.85, dark=.15, reverse=False, as_cmap=False): - """Make a sequential palette from the cubehelix system. - - This produces a colormap with linearly-decreasing (or increasing) - brightness. That means that information will be preserved if printed to - black and white or viewed by someone who is colorblind. "cubehelix" is - also available as a matplotlib-based palette, but this function gives the - user more control over the look of the palette and has a different set of - defaults. - - In addition to using this function, it is also possible to generate a - cubehelix palette generally in seaborn using a string starting with - `ch:` and containing other parameters (e.g. `"ch:s=.25,r=-.5"`). - - Parameters - ---------- - n_colors : int - Number of colors in the palette. - start : float, 0 <= start <= 3 - The hue value at the start of the helix. - rot : float - Rotations around the hue wheel over the range of the palette. - gamma : float 0 <= gamma - Nonlinearity to emphasize dark (gamma < 1) or light (gamma > 1) colors. - hue : float, 0 <= hue <= 1 - Saturation of the colors. - dark : float 0 <= dark <= 1 - Intensity of the darkest color in the palette. - light : float 0 <= light <= 1 - Intensity of the lightest color in the palette. - reverse : bool - If True, the palette will go from dark to light. - as_cmap : bool - If True, return a :class:`matplotlib.colors.ListedColormap`. - - Returns - ------- - palette - list of RGB tuples or :class:`matplotlib.colors.ListedColormap` - - See Also - -------- - choose_cubehelix_palette : Launch an interactive widget to select cubehelix - palette parameters. - dark_palette : Create a sequential palette with dark low values. - light_palette : Create a sequential palette with bright low values. - - References - ---------- - Green, D. A. (2011). "A colour scheme for the display of astronomical - intensity images". 
Bulletin of the Astromical Society of India, Vol. 39, - p. 289-295. - - Examples - -------- - .. include:: ../docstrings/cubehelix_palette.rst - - """ - def get_color_function(p0, p1): - # Copied from matplotlib because it lives in private module - def color(x): - # Apply gamma factor to emphasise low or high intensity values - xg = x ** gamma - - # Calculate amplitude and angle of deviation from the black - # to white diagonal in the plane of constant - # perceived intensity. - a = hue * xg * (1 - xg) / 2 - - phi = 2 * np.pi * (start / 3 + rot * x) - - return xg + a * (p0 * np.cos(phi) + p1 * np.sin(phi)) - return color - - cdict = { - "red": get_color_function(-0.14861, 1.78277), - "green": get_color_function(-0.29227, -0.90649), - "blue": get_color_function(1.97294, 0.0), - } - - cmap = mpl.colors.LinearSegmentedColormap("cubehelix", cdict) - - x = np.linspace(light, dark, int(n_colors)) - pal = cmap(x)[:, :3].tolist() - if reverse: - pal = pal[::-1] - - if as_cmap: - x_256 = np.linspace(light, dark, 256) - if reverse: - x_256 = x_256[::-1] - pal_256 = cmap(x_256) - cmap = mpl.colors.ListedColormap(pal_256, "seaborn_cubehelix") - return cmap - else: - return _ColorPalette(pal) - - -def _parse_cubehelix_args(argstr): - """Turn stringified cubehelix params into args/kwargs.""" - - if argstr.startswith("ch:"): - argstr = argstr[3:] - - if argstr.endswith("_r"): - reverse = True - argstr = argstr[:-2] - else: - reverse = False - - if not argstr: - return [], {"reverse": reverse} - - all_args = argstr.split(",") - - args = [float(a.strip(" ")) for a in all_args if "=" not in a] - - kwargs = [a.split("=") for a in all_args if "=" in a] - kwargs = {k.strip(" "): float(v.strip(" ")) for k, v in kwargs} - - kwarg_map = dict( - s="start", r="rot", g="gamma", - h="hue", l="light", d="dark", # noqa: E741 - ) - - kwargs = {kwarg_map.get(k, k): v for k, v in kwargs.items()} - - if reverse: - kwargs["reverse"] = True - - return args, kwargs - - -def set_color_codes(palette="deep"): - """Change how matplotlib color shorthands are interpreted. - - Calling this will change how shorthand codes like "b" or "g" - are interpreted by matplotlib in subsequent plots. - - Parameters - ---------- - palette : {deep, muted, pastel, dark, bright, colorblind} - Named seaborn palette to use as the source of colors. - - See Also - -------- - set : Color codes can be set through the high-level seaborn style - manager. - set_palette : Color codes can also be set through the function that - sets the matplotlib color cycle. - - """ - if palette == "reset": - colors = [ - (0., 0., 1.), - (0., .5, 0.), - (1., 0., 0.), - (.75, 0., .75), - (.75, .75, 0.), - (0., .75, .75), - (0., 0., 0.) 
- ] - elif not isinstance(palette, str): - err = "set_color_codes requires a named seaborn palette" - raise TypeError(err) - elif palette in SEABORN_PALETTES: - if not palette.endswith("6"): - palette = palette + "6" - colors = SEABORN_PALETTES[palette] + [(.1, .1, .1)] - else: - err = f"Cannot set colors with palette '{palette}'" - raise ValueError(err) - - for code, color in zip("bgrmyck", colors): - rgb = mpl.colors.colorConverter.to_rgb(color) - mpl.colors.colorConverter.colors[code] = rgb diff --git a/seaborn/rcmod.py b/seaborn/rcmod.py deleted file mode 100644 index de238323147e393bfee469e1bc3fafec157cb28f..0000000000000000000000000000000000000000 --- a/seaborn/rcmod.py +++ /dev/null @@ -1,533 +0,0 @@ -"""Control plot style and scaling using the matplotlib rcParams interface.""" -import functools -import matplotlib as mpl -from cycler import cycler -from . import palettes - - -__all__ = ["set_theme", "set", "reset_defaults", "reset_orig", - "axes_style", "set_style", "plotting_context", "set_context", - "set_palette"] - - -_style_keys = [ - - "axes.facecolor", - "axes.edgecolor", - "axes.grid", - "axes.axisbelow", - "axes.labelcolor", - - "figure.facecolor", - - "grid.color", - "grid.linestyle", - - "text.color", - - "xtick.color", - "ytick.color", - "xtick.direction", - "ytick.direction", - "lines.solid_capstyle", - - "patch.edgecolor", - "patch.force_edgecolor", - - "image.cmap", - "font.family", - "font.sans-serif", - - "xtick.bottom", - "xtick.top", - "ytick.left", - "ytick.right", - - "axes.spines.left", - "axes.spines.bottom", - "axes.spines.right", - "axes.spines.top", - -] - -_context_keys = [ - - "font.size", - "axes.labelsize", - "axes.titlesize", - "xtick.labelsize", - "ytick.labelsize", - "legend.fontsize", - "legend.title_fontsize", - - "axes.linewidth", - "grid.linewidth", - "lines.linewidth", - "lines.markersize", - "patch.linewidth", - - "xtick.major.width", - "ytick.major.width", - "xtick.minor.width", - "ytick.minor.width", - - "xtick.major.size", - "ytick.major.size", - "xtick.minor.size", - "ytick.minor.size", - -] - - -def set_theme(context="notebook", style="darkgrid", palette="deep", - font="sans-serif", font_scale=1, color_codes=True, rc=None): - """ - Set aspects of the visual theme for all matplotlib and seaborn plots. - - This function changes the global defaults for all plots using the - matplotlib rcParams system. The themeing is decomposed into several distinct - sets of parameter values. - - The options are illustrated in the :doc:`aesthetics <../tutorial/aesthetics>` - and :doc:`color palette <../tutorial/color_palettes>` tutorials. - - Parameters - ---------- - context : string or dict - Scaling parameters, see :func:`plotting_context`. - style : string or dict - Axes style parameters, see :func:`axes_style`. - palette : string or sequence - Color palette, see :func:`color_palette`. - font : string - Font family, see matplotlib font manager. - font_scale : float, optional - Separate scaling factor to independently scale the size of the - font elements. - color_codes : bool - If ``True`` and ``palette`` is a seaborn palette, remap the shorthand - color codes (e.g. "b", "g", "r", etc.) to the colors from this palette. - rc : dict or None - Dictionary of rc parameter mappings to override the above. - - Examples - -------- - - .. 
include:: ../docstrings/set_theme.rst - - """ - set_context(context, font_scale) - set_style(style, rc={"font.family": font}) - set_palette(palette, color_codes=color_codes) - if rc is not None: - mpl.rcParams.update(rc) - - -def set(*args, **kwargs): - """ - Alias for :func:`set_theme`, which is the preferred interface. - - This function may be removed in the future. - """ - set_theme(*args, **kwargs) - - -def reset_defaults(): - """Restore all RC params to default settings.""" - mpl.rcParams.update(mpl.rcParamsDefault) - - -def reset_orig(): - """Restore all RC params to original settings (respects custom rc).""" - from . import _orig_rc_params - mpl.rcParams.update(_orig_rc_params) - - -def axes_style(style=None, rc=None): - """ - Get the parameters that control the general style of the plots. - - The style parameters control properties like the color of the background and - whether a grid is enabled by default. This is accomplished using the - matplotlib rcParams system. - - The options are illustrated in the - :doc:`aesthetics tutorial <../tutorial/aesthetics>`. - - This function can also be used as a context manager to temporarily - alter the global defaults. See :func:`set_theme` or :func:`set_style` - to modify the global defaults for all plots. - - Parameters - ---------- - style : None, dict, or one of {darkgrid, whitegrid, dark, white, ticks} - A dictionary of parameters or the name of a preconfigured style. - rc : dict, optional - Parameter mappings to override the values in the preset seaborn - style dictionaries. This only updates parameters that are - considered part of the style definition. - - Examples - -------- - - .. include:: ../docstrings/axes_style.rst - - """ - if style is None: - style_dict = {k: mpl.rcParams[k] for k in _style_keys} - - elif isinstance(style, dict): - style_dict = style - - else: - styles = ["white", "dark", "whitegrid", "darkgrid", "ticks"] - if style not in styles: - raise ValueError(f"style must be one of {', '.join(styles)}") - - # Define colors here - dark_gray = ".15" - light_gray = ".8" - - # Common parameters - style_dict = { - - "figure.facecolor": "white", - "axes.labelcolor": dark_gray, - - "xtick.direction": "out", - "ytick.direction": "out", - "xtick.color": dark_gray, - "ytick.color": dark_gray, - - "axes.axisbelow": True, - "grid.linestyle": "-", - - - "text.color": dark_gray, - "font.family": ["sans-serif"], - "font.sans-serif": ["Arial", "DejaVu Sans", "Liberation Sans", - "Bitstream Vera Sans", "sans-serif"], - - - "lines.solid_capstyle": "round", - "patch.edgecolor": "w", - "patch.force_edgecolor": True, - - "image.cmap": "rocket", - - "xtick.top": False, - "ytick.right": False, - - } - - # Set grid on or off - if "grid" in style: - style_dict.update({ - "axes.grid": True, - }) - else: - style_dict.update({ - "axes.grid": False, - }) - - # Set the color of the background, spines, and grids - if style.startswith("dark"): - style_dict.update({ - - "axes.facecolor": "#EAEAF2", - "axes.edgecolor": "white", - "grid.color": "white", - - "axes.spines.left": True, - "axes.spines.bottom": True, - "axes.spines.right": True, - "axes.spines.top": True, - - }) - - elif style == "whitegrid": - style_dict.update({ - - "axes.facecolor": "white", - "axes.edgecolor": light_gray, - "grid.color": light_gray, - - "axes.spines.left": True, - "axes.spines.bottom": True, - "axes.spines.right": True, - "axes.spines.top": True, - - }) - - elif style in ["white", "ticks"]: - style_dict.update({ - - "axes.facecolor": "white", - "axes.edgecolor": dark_gray, 
- "grid.color": light_gray, - - "axes.spines.left": True, - "axes.spines.bottom": True, - "axes.spines.right": True, - "axes.spines.top": True, - - }) - - # Show or hide the axes ticks - if style == "ticks": - style_dict.update({ - "xtick.bottom": True, - "ytick.left": True, - }) - else: - style_dict.update({ - "xtick.bottom": False, - "ytick.left": False, - }) - - # Remove entries that are not defined in the base list of valid keys - # This lets us handle matplotlib <=/> 2.0 - style_dict = {k: v for k, v in style_dict.items() if k in _style_keys} - - # Override these settings with the provided rc dictionary - if rc is not None: - rc = {k: v for k, v in rc.items() if k in _style_keys} - style_dict.update(rc) - - # Wrap in an _AxesStyle object so this can be used in a with statement - style_object = _AxesStyle(style_dict) - - return style_object - - -def set_style(style=None, rc=None): - """ - Set the parameters that control the general style of the plots. - - The style parameters control properties like the color of the background and - whether a grid is enabled by default. This is accomplished using the - matplotlib rcParams system. - - The options are illustrated in the - :doc:`aesthetics tutorial <../tutorial/aesthetics>`. - - See :func:`axes_style` to get the parameter values. - - Parameters - ---------- - style : dict, or one of {darkgrid, whitegrid, dark, white, ticks} - A dictionary of parameters or the name of a preconfigured style. - rc : dict, optional - Parameter mappings to override the values in the preset seaborn - style dictionaries. This only updates parameters that are - considered part of the style definition. - - Examples - -------- - - .. include:: ../docstrings/set_style.rst - - """ - style_object = axes_style(style, rc) - mpl.rcParams.update(style_object) - - -def plotting_context(context=None, font_scale=1, rc=None): - """ - Get the parameters that control the scaling of plot elements. - - These parameters correspond to label size, line thickness, etc. For more - information, see the :doc:`aesthetics tutorial <../tutorial/aesthetics>`. - - The base context is "notebook", and the other contexts are "paper", "talk", - and "poster", which are version of the notebook parameters scaled by different - values. Font elements can also be scaled independently of (but relative to) - the other values. - - This function can also be used as a context manager to temporarily - alter the global defaults. See :func:`set_theme` or :func:`set_context` - to modify the global defaults for all plots. - - Parameters - ---------- - context : None, dict, or one of {paper, notebook, talk, poster} - A dictionary of parameters or the name of a preconfigured set. - font_scale : float, optional - Separate scaling factor to independently scale the size of the - font elements. - rc : dict, optional - Parameter mappings to override the values in the preset seaborn - context dictionaries. This only updates parameters that are - considered part of the context definition. - - Examples - -------- - - .. 
include:: ../docstrings/plotting_context.rst - - """ - if context is None: - context_dict = {k: mpl.rcParams[k] for k in _context_keys} - - elif isinstance(context, dict): - context_dict = context - - else: - - contexts = ["paper", "notebook", "talk", "poster"] - if context not in contexts: - raise ValueError(f"context must be in {', '.join(contexts)}") - - # Set up dictionary of default parameters - texts_base_context = { - - "font.size": 12, - "axes.labelsize": 12, - "axes.titlesize": 12, - "xtick.labelsize": 11, - "ytick.labelsize": 11, - "legend.fontsize": 11, - "legend.title_fontsize": 12, - - } - - base_context = { - - "axes.linewidth": 1.25, - "grid.linewidth": 1, - "lines.linewidth": 1.5, - "lines.markersize": 6, - "patch.linewidth": 1, - - "xtick.major.width": 1.25, - "ytick.major.width": 1.25, - "xtick.minor.width": 1, - "ytick.minor.width": 1, - - "xtick.major.size": 6, - "ytick.major.size": 6, - "xtick.minor.size": 4, - "ytick.minor.size": 4, - - } - base_context.update(texts_base_context) - - # Scale all the parameters by the same factor depending on the context - scaling = dict(paper=.8, notebook=1, talk=1.5, poster=2)[context] - context_dict = {k: v * scaling for k, v in base_context.items()} - - # Now independently scale the fonts - font_keys = texts_base_context.keys() - font_dict = {k: context_dict[k] * font_scale for k in font_keys} - context_dict.update(font_dict) - - # Override these settings with the provided rc dictionary - if rc is not None: - rc = {k: v for k, v in rc.items() if k in _context_keys} - context_dict.update(rc) - - # Wrap in a _PlottingContext object so this can be used in a with statement - context_object = _PlottingContext(context_dict) - - return context_object - - -def set_context(context=None, font_scale=1, rc=None): - """ - Set the parameters that control the scaling of plot elements. - - These parameters correspond to label size, line thickness, etc. - Calling this function modifies the global matplotlib `rcParams`. For more - information, see the :doc:`aesthetics tutorial <../tutorial/aesthetics>`. - - The base context is "notebook", and the other contexts are "paper", "talk", - and "poster", which are version of the notebook parameters scaled by different - values. Font elements can also be scaled independently of (but relative to) - the other values. - - See :func:`plotting_context` to get the parameter values. - - Parameters - ---------- - context : dict, or one of {paper, notebook, talk, poster} - A dictionary of parameters or the name of a preconfigured set. - font_scale : float, optional - Separate scaling factor to independently scale the size of the - font elements. - rc : dict, optional - Parameter mappings to override the values in the preset seaborn - context dictionaries. This only updates parameters that are - considered part of the context definition. - - Examples - -------- - - .. 
include:: ../docstrings/set_context.rst - - """ - context_object = plotting_context(context, font_scale, rc) - mpl.rcParams.update(context_object) - - -class _RCAesthetics(dict): - def __enter__(self): - rc = mpl.rcParams - self._orig = {k: rc[k] for k in self._keys} - self._set(self) - - def __exit__(self, exc_type, exc_value, exc_tb): - self._set(self._orig) - - def __call__(self, func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - with self: - return func(*args, **kwargs) - return wrapper - - -class _AxesStyle(_RCAesthetics): - """Light wrapper on a dict to set style temporarily.""" - _keys = _style_keys - _set = staticmethod(set_style) - - -class _PlottingContext(_RCAesthetics): - """Light wrapper on a dict to set context temporarily.""" - _keys = _context_keys - _set = staticmethod(set_context) - - -def set_palette(palette, n_colors=None, desat=None, color_codes=False): - """Set the matplotlib color cycle using a seaborn palette. - - Parameters - ---------- - palette : seaborn color palette | matplotlib colormap | hls | husl - Palette definition. Should be something :func:`color_palette` can process. - n_colors : int - Number of colors in the cycle. The default number of colors will depend - on the format of ``palette``, see the :func:`color_palette` - documentation for more information. - desat : float - Proportion to desaturate each color by. - color_codes : bool - If ``True`` and ``palette`` is a seaborn palette, remap the shorthand - color codes (e.g. "b", "g", "r", etc.) to the colors from this palette. - - See Also - -------- - color_palette : build a color palette or set the color cycle temporarily - in a ``with`` statement. - set_context : set parameters to scale plot elements - set_style : set the default parameters for figure style - - """ - colors = palettes.color_palette(palette, n_colors, desat) - cyl = cycler('color', colors) - mpl.rcParams['axes.prop_cycle'] = cyl - if color_codes: - try: - palettes.set_color_codes(palette) - except (ValueError, TypeError): - pass diff --git a/seaborn/regression.py b/seaborn/regression.py deleted file mode 100644 index 5e5503a422820191a124fa9a08b6d1319faffd6f..0000000000000000000000000000000000000000 --- a/seaborn/regression.py +++ /dev/null @@ -1,940 +0,0 @@ -"""Plotting functions for linear models (broadly construed).""" -import copy -from textwrap import dedent -import warnings -import numpy as np -import pandas as pd -import matplotlib as mpl -import matplotlib.pyplot as plt - -try: - import statsmodels - assert statsmodels - _has_statsmodels = True -except ImportError: - _has_statsmodels = False - -from . import utils -from . import algorithms as algo -from .axisgrid import FacetGrid, _facet_docs - - -__all__ = ["lmplot", "regplot", "residplot"] - - -class _LinearPlotter: - """Base class for plotting relational data in tidy format. - - To get anything useful done you'll have to inherit from this, but setup - code that can be abstracted out should be put here. 
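The `_RCAesthetics` machinery above is what lets the style and context dictionaries double as context managers and decorators; a quick illustration:

```python
import matplotlib.pyplot as plt
import seaborn as sns

# Scoped via `with`: the previous rcParams are restored on exit
with sns.axes_style("whitegrid"), sns.plotting_context("talk"):
    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1])

# Scoped via decoration, through _RCAesthetics.__call__
@sns.plotting_context("poster")
def draw_poster_figure():
    fig, ax = plt.subplots()
    ax.plot([0, 1], [1, 0])
```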
- - """ - def establish_variables(self, data, **kws): - """Extract variables from data or use directly.""" - self.data = data - - # Validate the inputs - any_strings = any([isinstance(v, str) for v in kws.values()]) - if any_strings and data is None: - raise ValueError("Must pass `data` if using named variables.") - - # Set the variables - for var, val in kws.items(): - if isinstance(val, str): - vector = data[val] - elif isinstance(val, list): - vector = np.asarray(val) - else: - vector = val - if vector is not None and vector.shape != (1,): - vector = np.squeeze(vector) - if np.ndim(vector) > 1: - err = "regplot inputs must be 1d" - raise ValueError(err) - setattr(self, var, vector) - - def dropna(self, *vars): - """Remove observations with missing data.""" - vals = [getattr(self, var) for var in vars] - vals = [v for v in vals if v is not None] - not_na = np.all(np.column_stack([pd.notnull(v) for v in vals]), axis=1) - for var in vars: - val = getattr(self, var) - if val is not None: - setattr(self, var, val[not_na]) - - def plot(self, ax): - raise NotImplementedError - - -class _RegressionPlotter(_LinearPlotter): - """Plotter for numeric independent variables with regression model. - - This does the computations and drawing for the `regplot` function, and - is thus also used indirectly by `lmplot`. - """ - def __init__(self, x, y, data=None, x_estimator=None, x_bins=None, - x_ci="ci", scatter=True, fit_reg=True, ci=95, n_boot=1000, - units=None, seed=None, order=1, logistic=False, lowess=False, - robust=False, logx=False, x_partial=None, y_partial=None, - truncate=False, dropna=True, x_jitter=None, y_jitter=None, - color=None, label=None): - - # Set member attributes - self.x_estimator = x_estimator - self.ci = ci - self.x_ci = ci if x_ci == "ci" else x_ci - self.n_boot = n_boot - self.seed = seed - self.scatter = scatter - self.fit_reg = fit_reg - self.order = order - self.logistic = logistic - self.lowess = lowess - self.robust = robust - self.logx = logx - self.truncate = truncate - self.x_jitter = x_jitter - self.y_jitter = y_jitter - self.color = color - self.label = label - - # Validate the regression options: - if sum((order > 1, logistic, robust, lowess, logx)) > 1: - raise ValueError("Mutually exclusive regression options.") - - # Extract the data vals from the arguments or passed dataframe - self.establish_variables(data, x=x, y=y, units=units, - x_partial=x_partial, y_partial=y_partial) - - # Drop null observations - if dropna: - self.dropna("x", "y", "units", "x_partial", "y_partial") - - # Regress nuisance variables out of the data - if self.x_partial is not None: - self.x = self.regress_out(self.x, self.x_partial) - if self.y_partial is not None: - self.y = self.regress_out(self.y, self.y_partial) - - # Possibly bin the predictor variable, which implies a point estimate - if x_bins is not None: - self.x_estimator = np.mean if x_estimator is None else x_estimator - x_discrete, x_bins = self.bin_predictor(x_bins) - self.x_discrete = x_discrete - else: - self.x_discrete = self.x - - # Disable regression in case of singleton inputs - if len(self.x) <= 1: - self.fit_reg = False - - # Save the range of the x variable for the grid later - if self.fit_reg: - self.x_range = self.x.min(), self.x.max() - - @property - def scatter_data(self): - """Data where each observation is a point.""" - x_j = self.x_jitter - if x_j is None: - x = self.x - else: - x = self.x + np.random.uniform(-x_j, x_j, len(self.x)) - - y_j = self.y_jitter - if y_j is None: - y = self.y - else: - y = self.y + 
np.random.uniform(-y_j, y_j, len(self.y)) - - return x, y - - @property - def estimate_data(self): - """Data with a point estimate and CI for each discrete x value.""" - x, y = self.x_discrete, self.y - vals = sorted(np.unique(x)) - points, cis = [], [] - - for val in vals: - - # Get the point estimate of the y variable - _y = y[x == val] - est = self.x_estimator(_y) - points.append(est) - - # Compute the confidence interval for this estimate - if self.x_ci is None: - cis.append(None) - else: - units = None - if self.x_ci == "sd": - sd = np.std(_y) - _ci = est - sd, est + sd - else: - if self.units is not None: - units = self.units[x == val] - boots = algo.bootstrap(_y, - func=self.x_estimator, - n_boot=self.n_boot, - units=units, - seed=self.seed) - _ci = utils.ci(boots, self.x_ci) - cis.append(_ci) - - return vals, points, cis - - def _check_statsmodels(self): - """Check whether statsmodels is installed if any boolean options require it.""" - options = "logistic", "robust", "lowess" - err = "`{}=True` requires statsmodels, an optional dependency, to be installed." - for option in options: - if getattr(self, option) and not _has_statsmodels: - raise RuntimeError(err.format(option)) - - def fit_regression(self, ax=None, x_range=None, grid=None): - """Fit the regression model.""" - self._check_statsmodels() - - # Create the grid for the regression - if grid is None: - if self.truncate: - x_min, x_max = self.x_range - else: - if ax is None: - x_min, x_max = x_range - else: - x_min, x_max = ax.get_xlim() - grid = np.linspace(x_min, x_max, 100) - ci = self.ci - - # Fit the regression - if self.order > 1: - yhat, yhat_boots = self.fit_poly(grid, self.order) - elif self.logistic: - from statsmodels.genmod.generalized_linear_model import GLM - from statsmodels.genmod.families import Binomial - yhat, yhat_boots = self.fit_statsmodels(grid, GLM, - family=Binomial()) - elif self.lowess: - ci = None - grid, yhat = self.fit_lowess() - elif self.robust: - from statsmodels.robust.robust_linear_model import RLM - yhat, yhat_boots = self.fit_statsmodels(grid, RLM) - elif self.logx: - yhat, yhat_boots = self.fit_logx(grid) - else: - yhat, yhat_boots = self.fit_fast(grid) - - # Compute the confidence interval at each grid point - if ci is None: - err_bands = None - else: - err_bands = utils.ci(yhat_boots, ci, axis=0) - - return grid, yhat, err_bands - - def fit_fast(self, grid): - """Low-level regression and prediction using linear algebra.""" - def reg_func(_x, _y): - return np.linalg.pinv(_x).dot(_y) - - X, y = np.c_[np.ones(len(self.x)), self.x], self.y - grid = np.c_[np.ones(len(grid)), grid] - yhat = grid.dot(reg_func(X, y)) - if self.ci is None: - return yhat, None - - beta_boots = algo.bootstrap(X, y, - func=reg_func, - n_boot=self.n_boot, - units=self.units, - seed=self.seed).T - yhat_boots = grid.dot(beta_boots).T - return yhat, yhat_boots - - def fit_poly(self, grid, order): - """Regression using numpy polyfit for higher-order trends.""" - def reg_func(_x, _y): - return np.polyval(np.polyfit(_x, _y, order), grid) - - x, y = self.x, self.y - yhat = reg_func(x, y) - if self.ci is None: - return yhat, None - - yhat_boots = algo.bootstrap(x, y, - func=reg_func, - n_boot=self.n_boot, - units=self.units, - seed=self.seed) - return yhat, yhat_boots - - def fit_statsmodels(self, grid, model, **kwargs): - """More general regression function using statsmodels objects.""" - import statsmodels.tools.sm_exceptions as sme - X, y = np.c_[np.ones(len(self.x)), self.x], self.y - grid = np.c_[np.ones(len(grid)), 
grid] - - def reg_func(_x, _y): - err_classes = (sme.PerfectSeparationError,) - try: - with warnings.catch_warnings(): - if hasattr(sme, "PerfectSeparationWarning"): - # statsmodels>=0.14.0 - warnings.simplefilter("error", sme.PerfectSeparationWarning) - err_classes = (*err_classes, sme.PerfectSeparationWarning) - yhat = model(_y, _x, **kwargs).fit().predict(grid) - except err_classes: - yhat = np.empty(len(grid)) - yhat.fill(np.nan) - return yhat - - yhat = reg_func(X, y) - if self.ci is None: - return yhat, None - - yhat_boots = algo.bootstrap(X, y, - func=reg_func, - n_boot=self.n_boot, - units=self.units, - seed=self.seed) - return yhat, yhat_boots - - def fit_lowess(self): - """Fit a locally-weighted regression, which returns its own grid.""" - from statsmodels.nonparametric.smoothers_lowess import lowess - grid, yhat = lowess(self.y, self.x).T - return grid, yhat - - def fit_logx(self, grid): - """Fit the model in log-space.""" - X, y = np.c_[np.ones(len(self.x)), self.x], self.y - grid = np.c_[np.ones(len(grid)), np.log(grid)] - - def reg_func(_x, _y): - _x = np.c_[_x[:, 0], np.log(_x[:, 1])] - return np.linalg.pinv(_x).dot(_y) - - yhat = grid.dot(reg_func(X, y)) - if self.ci is None: - return yhat, None - - beta_boots = algo.bootstrap(X, y, - func=reg_func, - n_boot=self.n_boot, - units=self.units, - seed=self.seed).T - yhat_boots = grid.dot(beta_boots).T - return yhat, yhat_boots - - def bin_predictor(self, bins): - """Discretize a predictor by assigning value to closest bin.""" - x = np.asarray(self.x) - if np.isscalar(bins): - percentiles = np.linspace(0, 100, bins + 2)[1:-1] - bins = np.percentile(x, percentiles) - else: - bins = np.ravel(bins) - - dist = np.abs(np.subtract.outer(x, bins)) - x_binned = bins[np.argmin(dist, axis=1)].ravel() - - return x_binned, bins - - def regress_out(self, a, b): - """Regress b from a keeping a's original mean.""" - a_mean = a.mean() - a = a - a_mean - b = b - b.mean() - b = np.c_[b] - a_prime = a - b.dot(np.linalg.pinv(b).dot(a)) - return np.asarray(a_prime + a_mean).reshape(a.shape) - - def plot(self, ax, scatter_kws, line_kws): - """Draw the full plot.""" - # Insert the plot label into the correct set of keyword arguments - if self.scatter: - scatter_kws["label"] = self.label - else: - line_kws["label"] = self.label - - # Use the current color cycle state as a default - if self.color is None: - lines, = ax.plot([], []) - color = lines.get_color() - lines.remove() - else: - color = self.color - - # Ensure that color is hex to avoid matplotlib weirdness - color = mpl.colors.rgb2hex(mpl.colors.colorConverter.to_rgb(color)) - - # Let color in keyword arguments override overall plot color - scatter_kws.setdefault("color", color) - line_kws.setdefault("color", color) - - # Draw the constituent plots - if self.scatter: - self.scatterplot(ax, scatter_kws) - - if self.fit_reg: - self.lineplot(ax, line_kws) - - # Label the axes - if hasattr(self.x, "name"): - ax.set_xlabel(self.x.name) - if hasattr(self.y, "name"): - ax.set_ylabel(self.y.name) - - def scatterplot(self, ax, kws): - """Draw the data.""" - # Treat the line-based markers specially, explicitly setting larger - # linewidth than is provided by the seaborn style defaults. - # This would ideally be handled better in matplotlib (i.e., distinguish - # between edgewidth for solid glyphs and linewidth for line glyphs - # but this should do for now. 
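        # A hedged illustration of the distinction drawn above (toy values,
        # not from this module): for a filled glyph like "o", ``linewidths``
        # controls the edge drawn around the face, whereas for a line-art
        # glyph like "+" it is the stroke of the glyph itself, so the thicker
        # ``lines.linewidth`` default keeps it visible:
        #
        #     ax.scatter([1, 2], [1, 2], marker="o",
        #                linewidths=mpl.rcParams["lines.markeredgewidth"])
        #     ax.scatter([1, 2], [2, 1], marker="+",
        #                linewidths=mpl.rcParams["lines.linewidth"])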
-        line_markers = ["1", "2", "3", "4", "+", "x", "|", "_"]
-        if self.x_estimator is None:
-            if "marker" in kws and kws["marker"] in line_markers:
-                lw = mpl.rcParams["lines.linewidth"]
-            else:
-                lw = mpl.rcParams["lines.markeredgewidth"]
-            kws.setdefault("linewidths", lw)
-
-            if not hasattr(kws['color'], 'shape') or kws['color'].shape[1] < 4:
-                kws.setdefault("alpha", .8)
-
-            x, y = self.scatter_data
-            ax.scatter(x, y, **kws)
-        else:
-            # TODO abstraction
-            ci_kws = {"color": kws["color"]}
-            if "alpha" in kws:
-                ci_kws["alpha"] = kws["alpha"]
-            ci_kws["linewidth"] = mpl.rcParams["lines.linewidth"] * 1.75
-            kws.setdefault("s", 50)
-
-            xs, ys, cis = self.estimate_data
-            if [ci for ci in cis if ci is not None]:
-                for x, ci in zip(xs, cis):
-                    ax.plot([x, x], ci, **ci_kws)
-            ax.scatter(xs, ys, **kws)
-
-    def lineplot(self, ax, kws):
-        """Draw the model."""
-        # Fit the regression model
-        grid, yhat, err_bands = self.fit_regression(ax)
-        edges = grid[0], grid[-1]
-
-        # Get and set the default aesthetics
-        fill_color = kws["color"]
-        lw = kws.pop("lw", mpl.rcParams["lines.linewidth"] * 1.5)
-        kws.setdefault("linewidth", lw)
-
-        # Draw the regression line and confidence interval
-        line, = ax.plot(grid, yhat, **kws)
-        if not self.truncate:
-            line.sticky_edges.x[:] = edges  # Prevent mpl from adding margin
-        if err_bands is not None:
-            ax.fill_between(grid, *err_bands, facecolor=fill_color, alpha=.15)
-
-
-_regression_docs = dict(
-
-    model_api=dedent("""\
-    There are a number of mutually exclusive options for estimating the
-    regression model. See the :ref:`tutorial <regression_tutorial>` for more
-    information.\
-    """),
-    regplot_vs_lmplot=dedent("""\
-    The :func:`regplot` and :func:`lmplot` functions are closely related, but
-    the former is an axes-level function while the latter is a figure-level
-    function that combines :func:`regplot` and :class:`FacetGrid`.\
-    """),
-    x_estimator=dedent("""\
-    x_estimator : callable that maps vector -> scalar, optional
-        Apply this function to each unique value of ``x`` and plot the
-        resulting estimate. This is useful when ``x`` is a discrete variable.
-        If ``x_ci`` is given, this estimate will be bootstrapped and a
-        confidence interval will be drawn.\
-    """),
-    x_bins=dedent("""\
-    x_bins : int or vector, optional
-        Bin the ``x`` variable into discrete bins and then estimate the central
-        tendency and a confidence interval. This binning only influences how
-        the scatterplot is drawn; the regression is still fit to the original
-        data. This parameter is interpreted either as the number of
-        evenly-sized (not necessarily spaced) bins or the positions of the bin
-        centers. When this parameter is used, it implies that the default of
-        ``x_estimator`` is ``numpy.mean``.\
-    """),
-    x_ci=dedent("""\
-    x_ci : "ci", "sd", int in [0, 100] or None, optional
-        Size of the confidence interval used when plotting a central tendency
-        for discrete values of ``x``. If ``"ci"``, defer to the value of the
-        ``ci`` parameter. If ``"sd"``, skip bootstrapping and show the
-        standard deviation of the observations in each bin.\
-    """),
-    scatter=dedent("""\
-    scatter : bool, optional
-        If ``True``, draw a scatterplot with the underlying observations (or
-        the ``x_estimator`` values).\
-    """),
-    fit_reg=dedent("""\
-    fit_reg : bool, optional
-        If ``True``, estimate and plot a regression model relating the ``x``
-        and ``y`` variables.\
-    """),
-    ci=dedent("""\
-    ci : int in [0, 100] or None, optional
-        Size of the confidence interval for the regression estimate.
This will - be drawn using translucent bands around the regression line. The - confidence interval is estimated using a bootstrap; for large - datasets, it may be advisable to avoid that computation by setting - this parameter to None.\ - """), - n_boot=dedent("""\ - n_boot : int, optional - Number of bootstrap resamples used to estimate the ``ci``. The default - value attempts to balance time and stability; you may want to increase - this value for "final" versions of plots.\ - """), - units=dedent("""\ - units : variable name in ``data``, optional - If the ``x`` and ``y`` observations are nested within sampling units, - those can be specified here. This will be taken into account when - computing the confidence intervals by performing a multilevel bootstrap - that resamples both units and observations (within unit). This does not - otherwise influence how the regression is estimated or drawn.\ - """), - seed=dedent("""\ - seed : int, numpy.random.Generator, or numpy.random.RandomState, optional - Seed or random number generator for reproducible bootstrapping.\ - """), - order=dedent("""\ - order : int, optional - If ``order`` is greater than 1, use ``numpy.polyfit`` to estimate a - polynomial regression.\ - """), - logistic=dedent("""\ - logistic : bool, optional - If ``True``, assume that ``y`` is a binary variable and use - ``statsmodels`` to estimate a logistic regression model. Note that this - is substantially more computationally intensive than linear regression, - so you may wish to decrease the number of bootstrap resamples - (``n_boot``) or set ``ci`` to None.\ - """), - lowess=dedent("""\ - lowess : bool, optional - If ``True``, use ``statsmodels`` to estimate a nonparametric lowess - model (locally weighted linear regression). Note that confidence - intervals cannot currently be drawn for this kind of model.\ - """), - robust=dedent("""\ - robust : bool, optional - If ``True``, use ``statsmodels`` to estimate a robust regression. This - will de-weight outliers. Note that this is substantially more - computationally intensive than standard linear regression, so you may - wish to decrease the number of bootstrap resamples (``n_boot``) or set - ``ci`` to None.\ - """), - logx=dedent("""\ - logx : bool, optional - If ``True``, estimate a linear regression of the form y ~ log(x), but - plot the scatterplot and regression model in the input space. Note that - ``x`` must be positive for this to work.\ - """), - xy_partial=dedent("""\ - {x,y}_partial : strings in ``data`` or matrices - Confounding variables to regress out of the ``x`` or ``y`` variables - before plotting.\ - """), - truncate=dedent("""\ - truncate : bool, optional - If ``True``, the regression line is bounded by the data limits. If - ``False``, it extends to the ``x`` axis limits. - """), - xy_jitter=dedent("""\ - {x,y}_jitter : floats, optional - Add uniform random noise of this size to either the ``x`` or ``y`` - variables. The noise is added to a copy of the data after fitting the - regression, and only influences the look of the scatterplot. 
This can - be helpful when plotting variables that take discrete values.\ - """), - scatter_line_kws=dedent("""\ - {scatter,line}_kws : dictionaries - Additional keyword arguments to pass to ``plt.scatter`` and - ``plt.plot``.\ - """), -) -_regression_docs.update(_facet_docs) - - -def lmplot( - data, *, - x=None, y=None, hue=None, col=None, row=None, - palette=None, col_wrap=None, height=5, aspect=1, markers="o", - sharex=None, sharey=None, hue_order=None, col_order=None, row_order=None, - legend=True, legend_out=None, x_estimator=None, x_bins=None, - x_ci="ci", scatter=True, fit_reg=True, ci=95, n_boot=1000, - units=None, seed=None, order=1, logistic=False, lowess=False, - robust=False, logx=False, x_partial=None, y_partial=None, - truncate=True, x_jitter=None, y_jitter=None, scatter_kws=None, - line_kws=None, facet_kws=None, -): - - if facet_kws is None: - facet_kws = {} - - def facet_kw_deprecation(key, val): - msg = ( - f"{key} is deprecated from the `lmplot` function signature. " - "Please update your code to pass it using `facet_kws`." - ) - if val is not None: - warnings.warn(msg, UserWarning) - facet_kws[key] = val - - facet_kw_deprecation("sharex", sharex) - facet_kw_deprecation("sharey", sharey) - facet_kw_deprecation("legend_out", legend_out) - - if data is None: - raise TypeError("Missing required keyword argument `data`.") - - # Reduce the dataframe to only needed columns - need_cols = [x, y, hue, col, row, units, x_partial, y_partial] - cols = np.unique([a for a in need_cols if a is not None]).tolist() - data = data[cols] - - # Initialize the grid - facets = FacetGrid( - data, row=row, col=col, hue=hue, - palette=palette, - row_order=row_order, col_order=col_order, hue_order=hue_order, - height=height, aspect=aspect, col_wrap=col_wrap, - **facet_kws, - ) - - # Add the markers here as FacetGrid has figured out how many levels of the - # hue variable are needed and we don't want to duplicate that process - if facets.hue_names is None: - n_markers = 1 - else: - n_markers = len(facets.hue_names) - if not isinstance(markers, list): - markers = [markers] * n_markers - if len(markers) != n_markers: - raise ValueError("markers must be a singleton or a list of markers " - "for each level of the hue variable") - facets.hue_kws = {"marker": markers} - - def update_datalim(data, x, y, ax, **kws): - xys = data[[x, y]].to_numpy().astype(float) - ax.update_datalim(xys, updatey=False) - ax.autoscale_view(scaley=False) - - facets.map_dataframe(update_datalim, x=x, y=y) - - # Draw the regression plot on each facet - regplot_kws = dict( - x_estimator=x_estimator, x_bins=x_bins, x_ci=x_ci, - scatter=scatter, fit_reg=fit_reg, ci=ci, n_boot=n_boot, units=units, - seed=seed, order=order, logistic=logistic, lowess=lowess, - robust=robust, logx=logx, x_partial=x_partial, y_partial=y_partial, - truncate=truncate, x_jitter=x_jitter, y_jitter=y_jitter, - scatter_kws=scatter_kws, line_kws=line_kws, - ) - facets.map_dataframe(regplot, x=x, y=y, **regplot_kws) - facets.set_axis_labels(x, y) - - # Add a legend - if legend and (hue is not None) and (hue not in [col, row]): - facets.add_legend() - return facets - - -lmplot.__doc__ = dedent("""\ - Plot data and regression model fits across a FacetGrid. - - This function combines :func:`regplot` and :class:`FacetGrid`. It is - intended as a convenient interface to fit regression models across - conditional subsets of a dataset. 
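    For example, a minimal call (``tips`` is one of seaborn's bundled example
    datasets; the column names below come from it)::

        import seaborn as sns
        tips = sns.load_dataset("tips")
        sns.lmplot(data=tips, x="total_bill", y="tip", hue="smoker", col="time")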
- - When thinking about how to assign variables to different facets, a general - rule is that it makes sense to use ``hue`` for the most important - comparison, followed by ``col`` and ``row``. However, always think about - your particular dataset and the goals of the visualization you are - creating. - - {model_api} - - The parameters to this function span most of the options in - :class:`FacetGrid`, although there may be occasional cases where you will - want to use that class and :func:`regplot` directly. - - Parameters - ---------- - {data} - x, y : strings, optional - Input variables; these should be column names in ``data``. - hue, col, row : strings - Variables that define subsets of the data, which will be drawn on - separate facets in the grid. See the ``*_order`` parameters to control - the order of levels of this variable. - {palette} - {col_wrap} - {height} - {aspect} - markers : matplotlib marker code or list of marker codes, optional - Markers for the scatterplot. If a list, each marker in the list will be - used for each level of the ``hue`` variable. - {share_xy} - - .. deprecated:: 0.12.0 - Pass using the `facet_kws` dictionary. - - {{hue,col,row}}_order : lists, optional - Order for the levels of the faceting variables. By default, this will - be the order that the levels appear in ``data`` or, if the variables - are pandas categoricals, the category order. - legend : bool, optional - If ``True`` and there is a ``hue`` variable, add a legend. - {legend_out} - - .. deprecated:: 0.12.0 - Pass using the `facet_kws` dictionary. - - {x_estimator} - {x_bins} - {x_ci} - {scatter} - {fit_reg} - {ci} - {n_boot} - {units} - {seed} - {order} - {logistic} - {lowess} - {robust} - {logx} - {xy_partial} - {truncate} - {xy_jitter} - {scatter_line_kws} - facet_kws : dict - Dictionary of keyword arguments for :class:`FacetGrid`. - - See Also - -------- - regplot : Plot data and a conditional model fit. - FacetGrid : Subplot grid for plotting conditional relationships. - pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with - ``kind="reg"``). - - Notes - ----- - - {regplot_vs_lmplot} - - Examples - -------- - - .. include:: ../docstrings/lmplot.rst - - """).format(**_regression_docs) - - -def regplot( - data=None, *, x=None, y=None, - x_estimator=None, x_bins=None, x_ci="ci", - scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, - seed=None, order=1, logistic=False, lowess=False, robust=False, - logx=False, x_partial=None, y_partial=None, - truncate=True, dropna=True, x_jitter=None, y_jitter=None, - label=None, color=None, marker="o", - scatter_kws=None, line_kws=None, ax=None -): - - plotter = _RegressionPlotter(x, y, data, x_estimator, x_bins, x_ci, - scatter, fit_reg, ci, n_boot, units, seed, - order, logistic, lowess, robust, logx, - x_partial, y_partial, truncate, dropna, - x_jitter, y_jitter, color, label) - - if ax is None: - ax = plt.gca() - - scatter_kws = {} if scatter_kws is None else copy.copy(scatter_kws) - scatter_kws["marker"] = marker - line_kws = {} if line_kws is None else copy.copy(line_kws) - plotter.plot(ax, scatter_kws, line_kws) - return ax - - -regplot.__doc__ = dedent("""\ - Plot data and a linear regression model fit. - - {model_api} - - Parameters - ---------- - x, y: string, series, or vector array - Input variables. If strings, these should correspond with column names - in ``data``. When pandas objects are used, axes will be labeled with - the series name. 
- {data} - {x_estimator} - {x_bins} - {x_ci} - {scatter} - {fit_reg} - {ci} - {n_boot} - {units} - {seed} - {order} - {logistic} - {lowess} - {robust} - {logx} - {xy_partial} - {truncate} - {xy_jitter} - label : string - Label to apply to either the scatterplot or regression line (if - ``scatter`` is ``False``) for use in a legend. - color : matplotlib color - Color to apply to all plot elements; will be superseded by colors - passed in ``scatter_kws`` or ``line_kws``. - marker : matplotlib marker code - Marker to use for the scatterplot glyphs. - {scatter_line_kws} - ax : matplotlib Axes, optional - Axes object to draw the plot onto, otherwise uses the current Axes. - - Returns - ------- - ax : matplotlib Axes - The Axes object containing the plot. - - See Also - -------- - lmplot : Combine :func:`regplot` and :class:`FacetGrid` to plot multiple - linear relationships in a dataset. - jointplot : Combine :func:`regplot` and :class:`JointGrid` (when used with - ``kind="reg"``). - pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with - ``kind="reg"``). - residplot : Plot the residuals of a linear regression model. - - Notes - ----- - - {regplot_vs_lmplot} - - - It's also easy to combine :func:`regplot` and :class:`JointGrid` or - :class:`PairGrid` through the :func:`jointplot` and :func:`pairplot` - functions, although these do not directly accept all of :func:`regplot`'s - parameters. - - Examples - -------- - - .. include:: ../docstrings/regplot.rst - - """).format(**_regression_docs) - - -def residplot( - data=None, *, x=None, y=None, - x_partial=None, y_partial=None, lowess=False, - order=1, robust=False, dropna=True, label=None, color=None, - scatter_kws=None, line_kws=None, ax=None -): - """Plot the residuals of a linear regression. - - This function will regress y on x (possibly as a robust or polynomial - regression) and then draw a scatterplot of the residuals. You can - optionally fit a lowess smoother to the residual plot, which can - help in determining if there is structure to the residuals. - - Parameters - ---------- - data : DataFrame, optional - DataFrame to use if `x` and `y` are column names. - x : vector or string - Data or column name in `data` for the predictor variable. - y : vector or string - Data or column name in `data` for the response variable. - {x, y}_partial : vectors or string(s) , optional - These variables are treated as confounding and are removed from - the `x` or `y` variables before plotting. - lowess : boolean, optional - Fit a lowess smoother to the residual scatterplot. - order : int, optional - Order of the polynomial to fit when calculating the residuals. - robust : boolean, optional - Fit a robust linear regression when calculating the residuals. - dropna : boolean, optional - If True, ignore observations with missing data when fitting and - plotting. - label : string, optional - Label that will be used in any plot legends. - color : matplotlib color, optional - Color to use for all elements of the plot. - {scatter, line}_kws : dictionaries, optional - Additional keyword arguments passed to scatter() and plot() for drawing - the components of the plot. - ax : matplotlib axis, optional - Plot into this axis, otherwise grab the current axis or make a new - one if not existing. - - Returns - ------- - ax: matplotlib axes - Axes with the regression plot. - - See Also - -------- - regplot : Plot a simple linear regression model. 
- jointplot : Draw a :func:`residplot` with univariate marginal distributions - (when used with ``kind="resid"``). - - Examples - -------- - - .. include:: ../docstrings/residplot.rst - - """ - plotter = _RegressionPlotter(x, y, data, ci=None, - order=order, robust=robust, - x_partial=x_partial, y_partial=y_partial, - dropna=dropna, color=color, label=label) - - if ax is None: - ax = plt.gca() - - # Calculate the residual from a linear regression - _, yhat, _ = plotter.fit_regression(grid=plotter.x) - plotter.y = plotter.y - yhat - - # Set the regression option on the plotter - if lowess: - plotter.lowess = True - else: - plotter.fit_reg = False - - # Plot a horizontal line at 0 - ax.axhline(0, ls=":", c=".2") - - # Draw the scatterplot - scatter_kws = {} if scatter_kws is None else scatter_kws.copy() - line_kws = {} if line_kws is None else line_kws.copy() - plotter.plot(ax, scatter_kws, line_kws) - return ax diff --git a/seaborn/relational.py b/seaborn/relational.py deleted file mode 100644 index ff0701c7938d29396f1e055fdfec998e66acdd29..0000000000000000000000000000000000000000 --- a/seaborn/relational.py +++ /dev/null @@ -1,982 +0,0 @@ -from functools import partial -import warnings - -import numpy as np -import pandas as pd -import matplotlib as mpl -import matplotlib.pyplot as plt -from matplotlib.cbook import normalize_kwargs - -from ._base import ( - VectorPlotter, -) -from .utils import ( - adjust_legend_subtitles, - _default_color, - _deprecate_ci, - _get_transform_functions, - _scatter_legend_artist, -) -from ._compat import groupby_apply_include_groups -from ._statistics import EstimateAggregator, WeightedAggregator -from .axisgrid import FacetGrid, _facet_docs -from ._docstrings import DocstringComponents, _core_docs - - -__all__ = ["relplot", "scatterplot", "lineplot"] - - -_relational_narrative = DocstringComponents(dict( - - # --- Introductory prose - main_api=""" -The relationship between `x` and `y` can be shown for different subsets -of the data using the `hue`, `size`, and `style` parameters. These -parameters control what visual semantics are used to identify the different -subsets. It is possible to show up to three dimensions independently by -using all three semantic types, but this style of plot can be hard to -interpret and is often ineffective. Using redundant semantics (i.e. both -`hue` and `style` for the same variable) can be helpful for making -graphics more accessible. - -See the :ref:`tutorial <relational_tutorial>` for more information. - """, - - relational_semantic=""" -The default treatment of the `hue` (and to a lesser extent, `size`) -semantic, if present, depends on whether the variable is inferred to -represent "numeric" or "categorical" data. In particular, numeric variables -are represented with a sequential colormap by default, and the legend -entries show regular "ticks" with values that may or may not exist in the -data. This behavior can be controlled through various parameters, as -described and illustrated below. - """, -)) - -_relational_docs = dict( - - # --- Shared function parameters - data_vars=""" -x, y : names of variables in `data` or vector data - Input data variables; must be numeric. Can pass data directly or - reference columns in `data`. - """, - data=""" -data : DataFrame, array, or list of arrays - Input data structure. If `x` and `y` are specified as names, this - should be a "long-form" DataFrame containing those columns. Otherwise - it is treated as "wide-form" data and grouping variables are ignored. 
- See the examples for the various ways this parameter can be specified - and the different effects of each. - """, - palette=""" -palette : string, list, dict, or matplotlib colormap - An object that determines how colors are chosen when `hue` is used. - It can be the name of a seaborn palette or matplotlib colormap, a list - of colors (anything matplotlib understands), a dict mapping levels - of the `hue` variable to colors, or a matplotlib colormap object. - """, - hue_order=""" -hue_order : list - Specified order for the appearance of the `hue` variable levels, - otherwise they are determined from the data. Not relevant when the - `hue` variable is numeric. - """, - hue_norm=""" -hue_norm : tuple or :class:`matplotlib.colors.Normalize` object - Normalization in data units for colormap applied to the `hue` - variable when it is numeric. Not relevant if `hue` is categorical. - """, - sizes=""" -sizes : list, dict, or tuple - An object that determines how sizes are chosen when `size` is used. - List or dict arguments should provide a size for each unique data value, - which forces a categorical interpretation. The argument may also be a - min, max tuple. - """, - size_order=""" -size_order : list - Specified order for appearance of the `size` variable levels, - otherwise they are determined from the data. Not relevant when the - `size` variable is numeric. - """, - size_norm=""" -size_norm : tuple or Normalize object - Normalization in data units for scaling plot objects when the - `size` variable is numeric. - """, - dashes=""" -dashes : boolean, list, or dictionary - Object determining how to draw the lines for different levels of the - `style` variable. Setting to `True` will use default dash codes, or - you can pass a list of dash codes or a dictionary mapping levels of the - `style` variable to dash codes. Setting to `False` will use solid - lines for all subsets. Dashes are specified as in matplotlib: a tuple - of `(segment, gap)` lengths, or an empty string to draw a solid line. - """, - markers=""" -markers : boolean, list, or dictionary - Object determining how to draw the markers for different levels of the - `style` variable. Setting to `True` will use default markers, or - you can pass a list of markers or a dictionary mapping levels of the - `style` variable to markers. Setting to `False` will draw - marker-less lines. Markers are specified as in matplotlib. - """, - style_order=""" -style_order : list - Specified order for appearance of the `style` variable levels - otherwise they are determined from the data. Not relevant when the - `style` variable is numeric. - """, - units=""" -units : vector or key in `data` - Grouping variable identifying sampling units. When used, a separate - line will be drawn for each unit with appropriate semantics, but no - legend entry will be added. Useful for showing distribution of - experimental replicates when exact identities are not needed. - """, - estimator=""" -estimator : name of pandas method or callable or None - Method for aggregating across multiple observations of the `y` - variable at the same `x` level. If `None`, all observations will - be drawn. - """, - ci=""" -ci : int or "sd" or None - Size of the confidence interval to draw when aggregating. - - .. deprecated:: 0.12.0 - Use the new `errorbar` parameter for more flexibility. - - """, - n_boot=""" -n_boot : int - Number of bootstraps to use for computing the confidence interval. 
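    For example, ``lineplot(data=fmri, x="timepoint", y="signal", n_boot=10000)``
    trades extra computation for a more stable interval (values illustrative;
    ``fmri`` is one of seaborn's bundled example datasets).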
- """, - seed=""" -seed : int, numpy.random.Generator, or numpy.random.RandomState - Seed or random number generator for reproducible bootstrapping. - """, - legend=""" -legend : "auto", "brief", "full", or False - How to draw the legend. If "brief", numeric `hue` and `size` - variables will be represented with a sample of evenly spaced values. - If "full", every group will get an entry in the legend. If "auto", - choose between brief or full representation based on number of levels. - If `False`, no legend data is added and no legend is drawn. - """, - ax_in=""" -ax : matplotlib Axes - Axes object to draw the plot onto, otherwise uses the current Axes. - """, - ax_out=""" -ax : matplotlib Axes - Returns the Axes object with the plot drawn onto it. - """, - -) - - -_param_docs = DocstringComponents.from_nested_components( - core=_core_docs["params"], - facets=DocstringComponents(_facet_docs), - rel=DocstringComponents(_relational_docs), - stat=DocstringComponents.from_function_params(EstimateAggregator.__init__), -) - - -class _RelationalPlotter(VectorPlotter): - - wide_structure = { - "x": "@index", "y": "@values", "hue": "@columns", "style": "@columns", - } - - # TODO where best to define default parameters? - sort = True - - -class _LinePlotter(_RelationalPlotter): - - _legend_attributes = ["color", "linewidth", "marker", "dashes"] - - def __init__( - self, *, - data=None, variables={}, - estimator=None, n_boot=None, seed=None, errorbar=None, - sort=True, orient="x", err_style=None, err_kws=None, legend=None - ): - - # TODO this is messy, we want the mapping to be agnostic about - # the kind of plot to draw, but for the time being we need to set - # this information so the SizeMapping can use it - self._default_size_range = ( - np.r_[.5, 2] * mpl.rcParams["lines.linewidth"] - ) - - super().__init__(data=data, variables=variables) - - self.estimator = estimator - self.errorbar = errorbar - self.n_boot = n_boot - self.seed = seed - self.sort = sort - self.orient = orient - self.err_style = err_style - self.err_kws = {} if err_kws is None else err_kws - - self.legend = legend - - def plot(self, ax, kws): - """Draw the plot onto an axes, passing matplotlib kwargs.""" - - # Draw a test plot, using the passed in kwargs. The goal here is to - # honor both (a) the current state of the plot cycler and (b) the - # specified kwargs on all the lines we will draw, overriding when - # relevant with the data semantics. Note that we won't cycle - # internally; in other words, if `hue` is not used, all elements will - # have the same color, but they will have the color that you would have - # gotten from the corresponding matplotlib function, and calling the - # function will advance the axes property cycle. - - kws = normalize_kwargs(kws, mpl.lines.Line2D) - kws.setdefault("markeredgewidth", 0.75) - kws.setdefault("markeredgecolor", "w") - - # Set default error kwargs - err_kws = self.err_kws.copy() - if self.err_style == "band": - err_kws.setdefault("alpha", .2) - elif self.err_style == "bars": - pass - elif self.err_style is not None: - err = "`err_style` must be 'band' or 'bars', not {}" - raise ValueError(err.format(self.err_style)) - - # Initialize the aggregation object - weighted = "weight" in self.plot_data - agg = (WeightedAggregator if weighted else EstimateAggregator)( - self.estimator, self.errorbar, n_boot=self.n_boot, seed=self.seed, - ) - - # TODO abstract variable to aggregate over here-ish. Better name? 
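        # Sketch of the orient semantics handled below (the "fmri" dataset is
        # one of seaborn's bundled examples, used purely for illustration):
        # with orient="x", observations are grouped on the x variable and the
        # y values at each x level are aggregated, e.g.
        #
        #     sns.lineplot(data=fmri, x="timepoint", y="signal", orient="x")
        #
        # draws the mean signal per timepoint; orient="y" swaps the roles.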
- orient = self.orient - if orient not in {"x", "y"}: - err = f"`orient` must be either 'x' or 'y', not {orient!r}." - raise ValueError(err) - other = {"x": "y", "y": "x"}[orient] - - # TODO How to handle NA? We don't want NA to propagate through to the - # estimate/CI when some values are present, but we would also like - # matplotlib to show "gaps" in the line when all values are missing. - # This is straightforward absent aggregation, but complicated with it. - # If we want to use nas, we need to conditionalize dropna in iter_data. - - # Loop over the semantic subsets and add to the plot - grouping_vars = "hue", "size", "style" - for sub_vars, sub_data in self.iter_data(grouping_vars, from_comp_data=True): - - if self.sort: - sort_vars = ["units", orient, other] - sort_cols = [var for var in sort_vars if var in self.variables] - sub_data = sub_data.sort_values(sort_cols) - - if ( - self.estimator is not None - and sub_data[orient].value_counts().max() > 1 - ): - if "units" in self.variables: - # TODO eventually relax this constraint - err = "estimator must be None when specifying units" - raise ValueError(err) - grouped = sub_data.groupby(orient, sort=self.sort) - # Could pass as_index=False instead of reset_index, - # but that fails on a corner case with older pandas. - sub_data = ( - grouped - .apply(agg, other, **groupby_apply_include_groups(False)) - .reset_index() - ) - else: - sub_data[f"{other}min"] = np.nan - sub_data[f"{other}max"] = np.nan - - # Apply inverse axis scaling - for var in "xy": - _, inv = _get_transform_functions(ax, var) - for col in sub_data.filter(regex=f"^{var}"): - sub_data[col] = inv(sub_data[col]) - - # --- Draw the main line(s) - - if "units" in self.variables: # XXX why not add to grouping variables? - lines = [] - for _, unit_data in sub_data.groupby("units"): - lines.extend(ax.plot(unit_data["x"], unit_data["y"], **kws)) - else: - lines = ax.plot(sub_data["x"], sub_data["y"], **kws) - - for line in lines: - - if "hue" in sub_vars: - line.set_color(self._hue_map(sub_vars["hue"])) - - if "size" in sub_vars: - line.set_linewidth(self._size_map(sub_vars["size"])) - - if "style" in sub_vars: - attributes = self._style_map(sub_vars["style"]) - if "dashes" in attributes: - line.set_dashes(attributes["dashes"]) - if "marker" in attributes: - line.set_marker(attributes["marker"]) - - line_color = line.get_color() - line_alpha = line.get_alpha() - line_capstyle = line.get_solid_capstyle() - - # --- Draw the confidence intervals - - if self.estimator is not None and self.errorbar is not None: - - # TODO handling of orientation will need to happen here - - if self.err_style == "band": - - func = {"x": ax.fill_between, "y": ax.fill_betweenx}[orient] - func( - sub_data[orient], - sub_data[f"{other}min"], sub_data[f"{other}max"], - color=line_color, **err_kws - ) - - elif self.err_style == "bars": - - error_param = { - f"{other}err": ( - sub_data[other] - sub_data[f"{other}min"], - sub_data[f"{other}max"] - sub_data[other], - ) - } - ebars = ax.errorbar( - sub_data["x"], sub_data["y"], **error_param, - linestyle="", color=line_color, alpha=line_alpha, - **err_kws - ) - - # Set the capstyle properly on the error bars - for obj in ebars.get_children(): - if isinstance(obj, mpl.collections.LineCollection): - obj.set_capstyle(line_capstyle) - - # Finalize the axes details - self._add_axis_labels(ax) - if self.legend: - legend_artist = partial(mpl.lines.Line2D, xdata=[], ydata=[]) - attrs = {"hue": "color", "size": "linewidth", "style": None} - self.add_legend_data(ax, 
legend_artist, kws, attrs)
-            handles, _ = ax.get_legend_handles_labels()
-            if handles:
-                legend = ax.legend(title=self.legend_title)
-                adjust_legend_subtitles(legend)
-
-
-class _ScatterPlotter(_RelationalPlotter):
-
-    _legend_attributes = ["color", "s", "marker"]
-
-    def __init__(self, *, data=None, variables={}, legend=None):
-
-        # TODO this is messy, we want the mapping to be agnostic about
-        # the kind of plot to draw, but for the time being we need to set
-        # this information so the SizeMapping can use it
-        self._default_size_range = (
-            np.r_[.5, 2] * np.square(mpl.rcParams["lines.markersize"])
-        )
-
-        super().__init__(data=data, variables=variables)
-
-        self.legend = legend
-
-    def plot(self, ax, kws):
-
-        # --- Determine the visual attributes of the plot
-
-        data = self.comp_data.dropna()
-        if data.empty:
-            return
-
-        kws = normalize_kwargs(kws, mpl.collections.PathCollection)
-
-        # Define the vectors of x and y positions
-        empty = np.full(len(data), np.nan)
-        x = data.get("x", empty)
-        y = data.get("y", empty)
-
-        # Apply inverse scaling to the coordinate variables
-        _, inv_x = _get_transform_functions(ax, "x")
-        _, inv_y = _get_transform_functions(ax, "y")
-        x, y = inv_x(x), inv_y(y)
-
-        if "style" in self.variables:
-            # Use a representative marker so scatter sets the edgecolor
-            # properly for line art markers. We currently enforce either
-            # all or none line art so this works.
-            example_level = self._style_map.levels[0]
-            example_marker = self._style_map(example_level, "marker")
-            kws.setdefault("marker", example_marker)
-
-        # Conditionally set the marker edgecolor based on whether the marker is "filled"
-        # See https://github.com/matplotlib/matplotlib/issues/17849 for context
-        m = kws.get("marker", mpl.rcParams.get("scatter.marker", "o"))
-        if not isinstance(m, mpl.markers.MarkerStyle):
-            # TODO in more recent matplotlib (which?)
can pass a MarkerStyle here - m = mpl.markers.MarkerStyle(m) - if m.is_filled(): - kws.setdefault("edgecolor", "w") - - # Draw the scatter plot - points = ax.scatter(x=x, y=y, **kws) - - # Apply the mapping from semantic variables to artist attributes - - if "hue" in self.variables: - points.set_facecolors(self._hue_map(data["hue"])) - - if "size" in self.variables: - points.set_sizes(self._size_map(data["size"])) - - if "style" in self.variables: - p = [self._style_map(val, "path") for val in data["style"]] - points.set_paths(p) - - # Apply dependent default attributes - - if "linewidth" not in kws: - sizes = points.get_sizes() - linewidth = .08 * np.sqrt(np.percentile(sizes, 10)) - points.set_linewidths(linewidth) - kws["linewidth"] = linewidth - - # Finalize the axes details - self._add_axis_labels(ax) - if self.legend: - attrs = {"hue": "color", "size": "s", "style": None} - self.add_legend_data(ax, _scatter_legend_artist, kws, attrs) - handles, _ = ax.get_legend_handles_labels() - if handles: - legend = ax.legend(title=self.legend_title) - adjust_legend_subtitles(legend) - - -def lineplot( - data=None, *, - x=None, y=None, hue=None, size=None, style=None, units=None, weights=None, - palette=None, hue_order=None, hue_norm=None, - sizes=None, size_order=None, size_norm=None, - dashes=True, markers=None, style_order=None, - estimator="mean", errorbar=("ci", 95), n_boot=1000, seed=None, - orient="x", sort=True, err_style="band", err_kws=None, - legend="auto", ci="deprecated", ax=None, **kwargs -): - - # Handle deprecation of ci parameter - errorbar = _deprecate_ci(errorbar, ci) - - p = _LinePlotter( - data=data, - variables=dict( - x=x, y=y, hue=hue, size=size, style=style, units=units, weight=weights - ), - estimator=estimator, n_boot=n_boot, seed=seed, errorbar=errorbar, - sort=sort, orient=orient, err_style=err_style, err_kws=err_kws, - legend=legend, - ) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - p.map_size(sizes=sizes, order=size_order, norm=size_norm) - p.map_style(markers=markers, dashes=dashes, order=style_order) - - if ax is None: - ax = plt.gca() - - if "style" not in p.variables and not {"ls", "linestyle"} & set(kwargs): # XXX - kwargs["dashes"] = "" if dashes is None or isinstance(dashes, bool) else dashes - - if not p.has_xy_data: - return ax - - p._attach(ax) - - # Other functions have color as an explicit param, - # and we should probably do that here too - color = kwargs.pop("color", kwargs.pop("c", None)) - kwargs["color"] = _default_color(ax.plot, hue, color, kwargs) - - p.plot(ax, kwargs) - return ax - - -lineplot.__doc__ = """\ -Draw a line plot with possibility of several semantic groupings. - -{narrative.main_api} - -{narrative.relational_semantic} - -By default, the plot aggregates over multiple `y` values at each value of -`x` and shows an estimate of the central tendency and a confidence -interval for that estimate. - -Parameters ----------- -{params.core.data} -{params.core.xy} -hue : vector or key in `data` - Grouping variable that will produce lines with different colors. - Can be either categorical or numeric, although color mapping will - behave differently in latter case. -size : vector or key in `data` - Grouping variable that will produce lines with different widths. - Can be either categorical or numeric, although size mapping will - behave differently in latter case. -style : vector or key in `data` - Grouping variable that will produce lines with different dashes - and/or markers. 
Can have a numeric dtype but will always be treated - as categorical. -{params.rel.units} -weights : vector or key in `data` - Data values or column used to compute weighted estimation. - Note that use of weights currently limits the choice of statistics - to a 'mean' estimator and 'ci' errorbar. -{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.rel.sizes} -{params.rel.size_order} -{params.rel.size_norm} -{params.rel.dashes} -{params.rel.markers} -{params.rel.style_order} -{params.rel.estimator} -{params.stat.errorbar} -{params.rel.n_boot} -{params.rel.seed} -orient : "x" or "y" - Dimension along which the data are sorted / aggregated. Equivalently, - the "independent variable" of the resulting function. -sort : boolean - If True, the data will be sorted by the x and y variables, otherwise - lines will connect points in the order they appear in the dataset. -err_style : "band" or "bars" - Whether to draw the confidence intervals with translucent error bands - or discrete error bars. -err_kws : dict of keyword arguments - Additional parameters to control the aesthetics of the error bars. The - kwargs are passed either to :meth:`matplotlib.axes.Axes.fill_between` - or :meth:`matplotlib.axes.Axes.errorbar`, depending on `err_style`. -{params.rel.legend} -{params.rel.ci} -{params.core.ax} -kwargs : key, value mappings - Other keyword arguments are passed down to - :meth:`matplotlib.axes.Axes.plot`. - -Returns -------- -{returns.ax} - -See Also --------- -{seealso.scatterplot} -{seealso.pointplot} - -Examples --------- - -.. include:: ../docstrings/lineplot.rst - -""".format( - narrative=_relational_narrative, - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) - - -def scatterplot( - data=None, *, - x=None, y=None, hue=None, size=None, style=None, - palette=None, hue_order=None, hue_norm=None, - sizes=None, size_order=None, size_norm=None, - markers=True, style_order=None, legend="auto", ax=None, - **kwargs -): - - p = _ScatterPlotter( - data=data, - variables=dict(x=x, y=y, hue=hue, size=size, style=style), - legend=legend - ) - - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - p.map_size(sizes=sizes, order=size_order, norm=size_norm) - p.map_style(markers=markers, order=style_order) - - if ax is None: - ax = plt.gca() - - if not p.has_xy_data: - return ax - - p._attach(ax) - - color = kwargs.pop("color", None) - kwargs["color"] = _default_color(ax.scatter, hue, color, kwargs) - - p.plot(ax, kwargs) - - return ax - - -scatterplot.__doc__ = """\ -Draw a scatter plot with possibility of several semantic groupings. - -{narrative.main_api} - -{narrative.relational_semantic} - -Parameters ----------- -{params.core.data} -{params.core.xy} -hue : vector or key in `data` - Grouping variable that will produce points with different colors. - Can be either categorical or numeric, although color mapping will - behave differently in latter case. -size : vector or key in `data` - Grouping variable that will produce points with different sizes. - Can be either categorical or numeric, although size mapping will - behave differently in latter case. -style : vector or key in `data` - Grouping variable that will produce points with different markers. - Can have a numeric dtype but will always be treated as categorical. 
-{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.rel.sizes} -{params.rel.size_order} -{params.rel.size_norm} -{params.rel.markers} -{params.rel.style_order} -{params.rel.legend} -{params.core.ax} -kwargs : key, value mappings - Other keyword arguments are passed down to - :meth:`matplotlib.axes.Axes.scatter`. - -Returns -------- -{returns.ax} - -See Also --------- -{seealso.lineplot} -{seealso.stripplot} -{seealso.swarmplot} - -Examples --------- - -.. include:: ../docstrings/scatterplot.rst - -""".format( - narrative=_relational_narrative, - params=_param_docs, - returns=_core_docs["returns"], - seealso=_core_docs["seealso"], -) - - -def relplot( - data=None, *, - x=None, y=None, hue=None, size=None, style=None, units=None, weights=None, - row=None, col=None, col_wrap=None, row_order=None, col_order=None, - palette=None, hue_order=None, hue_norm=None, - sizes=None, size_order=None, size_norm=None, - markers=None, dashes=None, style_order=None, - legend="auto", kind="scatter", height=5, aspect=1, facet_kws=None, - **kwargs -): - - if kind == "scatter": - - Plotter = _ScatterPlotter - func = scatterplot - markers = True if markers is None else markers - - elif kind == "line": - - Plotter = _LinePlotter - func = lineplot - dashes = True if dashes is None else dashes - - else: - err = f"Plot kind {kind} not recognized" - raise ValueError(err) - - # Check for attempt to plot onto specific axes and warn - if "ax" in kwargs: - msg = ( - "relplot is a figure-level function and does not accept " - "the `ax` parameter. You may wish to try {}".format(kind + "plot") - ) - warnings.warn(msg, UserWarning) - kwargs.pop("ax") - - # Use the full dataset to map the semantics - variables = dict(x=x, y=y, hue=hue, size=size, style=style) - if kind == "line": - variables["units"] = units - variables["weight"] = weights - else: - if units is not None: - msg = "The `units` parameter has no effect with kind='scatter'." - warnings.warn(msg, stacklevel=2) - if weights is not None: - msg = "The `weights` parameter has no effect with kind='scatter'." 
- warnings.warn(msg, stacklevel=2) - p = Plotter( - data=data, - variables=variables, - legend=legend, - ) - p.map_hue(palette=palette, order=hue_order, norm=hue_norm) - p.map_size(sizes=sizes, order=size_order, norm=size_norm) - p.map_style(markers=markers, dashes=dashes, order=style_order) - - # Extract the semantic mappings - if "hue" in p.variables: - palette = p._hue_map.lookup_table - hue_order = p._hue_map.levels - hue_norm = p._hue_map.norm - else: - palette = hue_order = hue_norm = None - - if "size" in p.variables: - sizes = p._size_map.lookup_table - size_order = p._size_map.levels - size_norm = p._size_map.norm - - if "style" in p.variables: - style_order = p._style_map.levels - if markers: - markers = {k: p._style_map(k, "marker") for k in style_order} - else: - markers = None - if dashes: - dashes = {k: p._style_map(k, "dashes") for k in style_order} - else: - dashes = None - else: - markers = dashes = style_order = None - - # Now extract the data that would be used to draw a single plot - variables = p.variables - plot_data = p.plot_data - - # Define the common plotting parameters - plot_kws = dict( - palette=palette, hue_order=hue_order, hue_norm=hue_norm, - sizes=sizes, size_order=size_order, size_norm=size_norm, - markers=markers, dashes=dashes, style_order=style_order, - legend=False, - ) - plot_kws.update(kwargs) - if kind == "scatter": - plot_kws.pop("dashes") - - # Add the grid semantics onto the plotter - grid_variables = dict( - x=x, y=y, row=row, col=col, hue=hue, size=size, style=style, - ) - if kind == "line": - grid_variables.update(units=units, weights=weights) - p.assign_variables(data, grid_variables) - - # Define the named variables for plotting on each facet - # Rename the variables with a leading underscore to avoid - # collisions with faceting variable names - plot_variables = {v: f"_{v}" for v in variables} - if "weight" in plot_variables: - plot_variables["weights"] = plot_variables.pop("weight") - plot_kws.update(plot_variables) - - # Pass the row/col variables to FacetGrid with their original - # names so that the axes titles render correctly - for var in ["row", "col"]: - # Handle faceting variables that lack name information - if var in p.variables and p.variables[var] is None: - p.variables[var] = f"_{var}_" - grid_kws = {v: p.variables.get(v) for v in ["row", "col"]} - - # Rename the columns of the plot_data structure appropriately - new_cols = plot_variables.copy() - new_cols.update(grid_kws) - full_data = p.plot_data.rename(columns=new_cols) - - # Set up the FacetGrid object - facet_kws = {} if facet_kws is None else facet_kws.copy() - g = FacetGrid( - data=full_data.dropna(axis=1, how="all"), - **grid_kws, - col_wrap=col_wrap, row_order=row_order, col_order=col_order, - height=height, aspect=aspect, dropna=False, - **facet_kws - ) - - # Draw the plot - g.map_dataframe(func, **plot_kws) - - # Label the axes, using the original variables - # Pass "" when the variable name is None to overwrite internal variables - g.set_axis_labels(variables.get("x") or "", variables.get("y") or "") - - if legend: - # Replace the original plot data so the legend uses numeric data with - # the correct type, since we force a categorical mapping above. - p.plot_data = plot_data - - # Handle the additional non-semantic keyword arguments out here. - # We're selective because some kwargs may be seaborn function specific - # and not relevant to the matplotlib artists going into the legend. 
- # Ideally, we will have a better solution where we don't need to re-make - # the legend out here and will have parity with the axes-level functions. - keys = ["c", "color", "alpha", "m", "marker"] - if kind == "scatter": - legend_artist = _scatter_legend_artist - keys += ["s", "facecolor", "fc", "edgecolor", "ec", "linewidth", "lw"] - else: - legend_artist = partial(mpl.lines.Line2D, xdata=[], ydata=[]) - keys += [ - "markersize", "ms", - "markeredgewidth", "mew", - "markeredgecolor", "mec", - "linestyle", "ls", - "linewidth", "lw", - ] - - common_kws = {k: v for k, v in kwargs.items() if k in keys} - attrs = {"hue": "color", "style": None} - if kind == "scatter": - attrs["size"] = "s" - elif kind == "line": - attrs["size"] = "linewidth" - p.add_legend_data(g.axes.flat[0], legend_artist, common_kws, attrs) - if p.legend_data: - g.add_legend(legend_data=p.legend_data, - label_order=p.legend_order, - title=p.legend_title, - adjust_subtitles=True) - - # Rename the columns of the FacetGrid's `data` attribute - # to match the original column names - orig_cols = { - f"_{k}": f"_{k}_" if v is None else v for k, v in variables.items() - } - grid_data = g.data.rename(columns=orig_cols) - if data is not None and (x is not None or y is not None): - if not isinstance(data, pd.DataFrame): - data = pd.DataFrame(data) - g.data = pd.merge( - data, - grid_data[grid_data.columns.difference(data.columns)], - left_index=True, - right_index=True, - ) - else: - g.data = grid_data - - return g - - -relplot.__doc__ = """\ -Figure-level interface for drawing relational plots onto a FacetGrid. - -This function provides access to several different axes-level functions -that show the relationship between two variables with semantic mappings -of subsets. The `kind` parameter selects the underlying axes-level -function to use: - -- :func:`scatterplot` (with `kind="scatter"`; the default) -- :func:`lineplot` (with `kind="line"`) - -Extra keyword arguments are passed to the underlying function, so you -should refer to the documentation for each to see kind-specific options. - -{narrative.main_api} - -{narrative.relational_semantic} - -After plotting, the :class:`FacetGrid` with the plot is returned and can -be used directly to tweak supporting plot details or add other layers. - -Parameters ----------- -{params.core.data} -{params.core.xy} -hue : vector or key in `data` - Grouping variable that will produce elements with different colors. - Can be either categorical or numeric, although color mapping will - behave differently in latter case. -size : vector or key in `data` - Grouping variable that will produce elements with different sizes. - Can be either categorical or numeric, although size mapping will - behave differently in latter case. -style : vector or key in `data` - Grouping variable that will produce elements with different styles. - Can have a numeric dtype but will always be treated as categorical. -{params.rel.units} -weights : vector or key in `data` - Data values or column used to compute weighted estimation. - Note that use of weights currently limits the choice of statistics - to a 'mean' estimator and 'ci' errorbar. -{params.facets.rowcol} -{params.facets.col_wrap} -row_order, col_order : lists of strings - Order to organize the rows and/or columns of the grid in, otherwise the - orders are inferred from the data objects. 
-{params.core.palette} -{params.core.hue_order} -{params.core.hue_norm} -{params.rel.sizes} -{params.rel.size_order} -{params.rel.size_norm} -{params.rel.style_order} -{params.rel.dashes} -{params.rel.markers} -{params.rel.legend} -kind : string - Kind of plot to draw, corresponding to a seaborn relational plot. - Options are `"scatter"` or `"line"`. -{params.facets.height} -{params.facets.aspect} -facet_kws : dict - Dictionary of other keyword arguments to pass to :class:`FacetGrid`. -kwargs : key, value pairings - Other keyword arguments are passed through to the underlying plotting - function. - -Returns -------- -{returns.facetgrid} - -Examples --------- - -.. include:: ../docstrings/relplot.rst - -""".format( - narrative=_relational_narrative, - params=_param_docs, - returns=_core_docs["returns"], -) diff --git a/seaborn/utils.py b/seaborn/utils.py deleted file mode 100644 index 98720ba36d7f76a7bcd8afc0c93de3cd9c489d12..0000000000000000000000000000000000000000 --- a/seaborn/utils.py +++ /dev/null @@ -1,897 +0,0 @@ -"""Utility functions, mostly for internal use.""" -import os -import inspect -import warnings -import colorsys -from contextlib import contextmanager -from urllib.request import urlopen, urlretrieve -from types import ModuleType - -import numpy as np -import pandas as pd -import matplotlib as mpl -from matplotlib.colors import to_rgb -import matplotlib.pyplot as plt -from matplotlib.cbook import normalize_kwargs - -from seaborn._core.typing import deprecated -from seaborn.external.version import Version -from seaborn.external.appdirs import user_cache_dir - -__all__ = ["desaturate", "saturate", "set_hls_values", "move_legend", - "despine", "get_dataset_names", "get_data_home", "load_dataset"] - -DATASET_SOURCE = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master" -DATASET_NAMES_URL = f"{DATASET_SOURCE}/dataset_names.txt" - - -def ci_to_errsize(cis, heights): - """Convert intervals to error arguments relative to plot heights. - - Parameters - ---------- - cis : 2 x n sequence - sequence of confidence interval limits - heights : n sequence - sequence of plot heights - - Returns - ------- - errsize : 2 x n array - sequence of error size relative to height values in correct - format as argument for plt.bar - - """ - cis = np.atleast_2d(cis).reshape(2, -1) - heights = np.atleast_1d(heights) - errsize = [] - for i, (low, high) in enumerate(np.transpose(cis)): - h = heights[i] - elow = h - low - ehigh = high - h - errsize.append([elow, ehigh]) - - errsize = np.asarray(errsize).T - return errsize - - -def _draw_figure(fig): - """Force draw of a matplotlib figure, accounting for back-compat.""" - # See https://github.com/matplotlib/matplotlib/issues/19197 for context - fig.canvas.draw() - if fig.stale: - try: - fig.draw(fig.canvas.get_renderer()) - except AttributeError: - pass - - -def _default_color(method, hue, color, kws, saturation=1): - """If needed, get a default color by using the matplotlib property cycle.""" - - if hue is not None: - # This warning is probably user-friendly, but it's currently triggered - # in a FacetGrid context and I don't want to mess with that logic right now - # if color is not None: - # msg = "`color` is ignored when `hue` is assigned." 
- # warnings.warn(msg) - return None - - kws = kws.copy() - kws.pop("label", None) - - if color is not None: - if saturation < 1: - color = desaturate(color, saturation) - return color - - elif method.__name__ == "plot": - - color = normalize_kwargs(kws, mpl.lines.Line2D).get("color") - scout, = method([], [], scalex=False, scaley=False, color=color) - color = scout.get_color() - scout.remove() - - elif method.__name__ == "scatter": - - # Matplotlib will raise if the size of x/y don't match s/c, - # and the latter might be in the kws dict - scout_size = max( - np.atleast_1d(kws.get(key, [])).shape[0] - for key in ["s", "c", "fc", "facecolor", "facecolors"] - ) - scout_x = scout_y = np.full(scout_size, np.nan) - - scout = method(scout_x, scout_y, **kws) - facecolors = scout.get_facecolors() - - if not len(facecolors): - # Handle bug in matplotlib <= 3.2 (I think) - # This will limit the ability to use non color= kwargs to specify - # a color in versions of matplotlib with the bug, but trying to - # work out what the user wanted by re-implementing the broken logic - # of inspecting the kwargs is probably too brittle. - single_color = False - else: - single_color = np.unique(facecolors, axis=0).shape[0] == 1 - - # Allow the user to specify an array of colors through various kwargs - if "c" not in kws and single_color: - color = to_rgb(facecolors[0]) - - scout.remove() - - elif method.__name__ == "bar": - - # bar() needs masked, not empty data, to generate a patch - scout, = method([np.nan], [np.nan], **kws) - color = to_rgb(scout.get_facecolor()) - scout.remove() - # Axes.bar adds both a patch and a container - method.__self__.containers.pop(-1) - - elif method.__name__ == "fill_between": - - kws = normalize_kwargs(kws, mpl.collections.PolyCollection) - scout = method([], [], **kws) - facecolor = scout.get_facecolor() - color = to_rgb(facecolor[0]) - scout.remove() - - if saturation < 1: - color = desaturate(color, saturation) - - return color - - -def desaturate(color, prop): - """Decrease the saturation channel of a color by some percent. - - Parameters - ---------- - color : matplotlib color - hex, rgb-tuple, or html color name - prop : float - saturation channel of color will be multiplied by this value - - Returns - ------- - new_color : rgb tuple - desaturated color code in RGB tuple representation - - """ - # Check inputs - if not 0 <= prop <= 1: - raise ValueError("prop must be between 0 and 1") - - # Get rgb tuple rep - rgb = to_rgb(color) - - # Short circuit to avoid floating point issues - if prop == 1: - return rgb - - # Convert to hls - h, l, s = colorsys.rgb_to_hls(*rgb) - - # Desaturate the saturation channel - s *= prop - - # Convert back to rgb - new_color = colorsys.hls_to_rgb(h, l, s) - - return new_color - - -def saturate(color): - """Return a fully saturated color with the same hue. - - Parameters - ---------- - color : matplotlib color - hex, rgb-tuple, or html color name - - Returns - ------- - new_color : rgb tuple - saturated color code in RGB tuple representation - - """ - return set_hls_values(color, s=1) - - -def set_hls_values(color, h=None, l=None, s=None): # noqa - """Independently manipulate the h, l, or s channels of a color. 
-
-    Parameters
-    ----------
-    color : matplotlib color
-        hex, rgb-tuple, or html color name
-    h, l, s : floats between 0 and 1, or None
-        new values for each channel in hls space
-
-    Returns
-    -------
-    new_color : rgb tuple
-        new color code in RGB tuple representation
-
-    """
-    # Get an RGB tuple representation
-    rgb = to_rgb(color)
-    vals = list(colorsys.rgb_to_hls(*rgb))
-    for i, val in enumerate([h, l, s]):
-        if val is not None:
-            vals[i] = val
-
-    rgb = colorsys.hls_to_rgb(*vals)
-    return rgb
-
-
-def axlabel(xlabel, ylabel, **kwargs):
-    """Grab current axis and label it.
-
-    DEPRECATED: will be removed in a future version.
-
-    """
-    msg = "This function is deprecated and will be removed in a future version"
-    warnings.warn(msg, FutureWarning)
-    ax = plt.gca()
-    ax.set_xlabel(xlabel, **kwargs)
-    ax.set_ylabel(ylabel, **kwargs)
-
-
-def remove_na(vector):
-    """Helper method for removing null values from data vectors.
-
-    Parameters
-    ----------
-    vector : vector object
-        Must implement boolean masking with [] subscript syntax.
-
-    Returns
-    -------
-    clean : same type as ``vector``
-        Vector of data with null values removed. May be a copy or a view.
-
-    """
-    return vector[pd.notnull(vector)]
-
-
-def get_color_cycle():
-    """Return the list of colors in the current matplotlib color cycle.
-
-    Parameters
-    ----------
-    None
-
-    Returns
-    -------
-    colors : list
-        List of matplotlib colors in the current cycle, or dark gray if
-        the current color cycle is empty.
-    """
-    cycler = mpl.rcParams['axes.prop_cycle']
-    return cycler.by_key()['color'] if 'color' in cycler.keys else [".15"]
-
-
-def despine(fig=None, ax=None, top=True, right=True, left=False,
-            bottom=False, offset=None, trim=False):
-    """Remove the top and right spines from plot(s).
-
-    Parameters
-    ----------
-    fig : matplotlib figure, optional
-        Figure to despine all axes of, defaults to the current figure.
-    ax : matplotlib axes, optional
-        Specific axes object to despine. Ignored if fig is provided.
-    top, right, left, bottom : boolean, optional
-        If True, remove that spine.
-    offset : int or dict, optional
-        Absolute distance, in points, spines should be moved away
-        from the axes (negative values move spines inward). A single value
-        applies to all spines; a dict can be used to set offset values per
-        side.
-    trim : bool, optional
-        If True, limit spines to the smallest and largest major tick
-        on each non-despined axis.
- - Returns - ------- - None - - """ - # Get references to the axes we want - if fig is None and ax is None: - axes = plt.gcf().axes - elif fig is not None: - axes = fig.axes - elif ax is not None: - axes = [ax] - - for ax_i in axes: - for side in ["top", "right", "left", "bottom"]: - # Toggle the spine objects - is_visible = not locals()[side] - ax_i.spines[side].set_visible(is_visible) - if offset is not None and is_visible: - try: - val = offset.get(side, 0) - except AttributeError: - val = offset - ax_i.spines[side].set_position(('outward', val)) - - # Potentially move the ticks - if left and not right: - maj_on = any( - t.tick1line.get_visible() - for t in ax_i.yaxis.majorTicks - ) - min_on = any( - t.tick1line.get_visible() - for t in ax_i.yaxis.minorTicks - ) - ax_i.yaxis.set_ticks_position("right") - for t in ax_i.yaxis.majorTicks: - t.tick2line.set_visible(maj_on) - for t in ax_i.yaxis.minorTicks: - t.tick2line.set_visible(min_on) - - if bottom and not top: - maj_on = any( - t.tick1line.get_visible() - for t in ax_i.xaxis.majorTicks - ) - min_on = any( - t.tick1line.get_visible() - for t in ax_i.xaxis.minorTicks - ) - ax_i.xaxis.set_ticks_position("top") - for t in ax_i.xaxis.majorTicks: - t.tick2line.set_visible(maj_on) - for t in ax_i.xaxis.minorTicks: - t.tick2line.set_visible(min_on) - - if trim: - # clip off the parts of the spines that extend past major ticks - xticks = np.asarray(ax_i.get_xticks()) - if xticks.size: - firsttick = np.compress(xticks >= min(ax_i.get_xlim()), - xticks)[0] - lasttick = np.compress(xticks <= max(ax_i.get_xlim()), - xticks)[-1] - ax_i.spines['bottom'].set_bounds(firsttick, lasttick) - ax_i.spines['top'].set_bounds(firsttick, lasttick) - newticks = xticks.compress(xticks <= lasttick) - newticks = newticks.compress(newticks >= firsttick) - ax_i.set_xticks(newticks) - - yticks = np.asarray(ax_i.get_yticks()) - if yticks.size: - firsttick = np.compress(yticks >= min(ax_i.get_ylim()), - yticks)[0] - lasttick = np.compress(yticks <= max(ax_i.get_ylim()), - yticks)[-1] - ax_i.spines['left'].set_bounds(firsttick, lasttick) - ax_i.spines['right'].set_bounds(firsttick, lasttick) - newticks = yticks.compress(yticks <= lasttick) - newticks = newticks.compress(newticks >= firsttick) - ax_i.set_yticks(newticks) - - -def move_legend(obj, loc, **kwargs): - """ - Recreate a plot's legend at a new location. - - The name is a slight misnomer. Matplotlib legends do not expose public - control over their position parameters. So this function creates a new legend, - copying over the data from the original object, which is then removed. - - Parameters - ---------- - obj : the object with the plot - This argument can be either a seaborn or matplotlib object: - - - :class:`seaborn.FacetGrid` or :class:`seaborn.PairGrid` - - :class:`matplotlib.axes.Axes` or :class:`matplotlib.figure.Figure` - - loc : str or int - Location argument, as in :meth:`matplotlib.axes.Axes.legend`. - - kwargs - Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.legend`. - - Examples - -------- - - .. include:: ../docstrings/move_legend.rst - - """ - # This is a somewhat hackish solution that will hopefully be obviated by - # upstream improvements to matplotlib legends that make them easier to - # modify after creation. 
- - from seaborn.axisgrid import Grid # Avoid circular import - - # Locate the legend object and a method to recreate the legend - if isinstance(obj, Grid): - old_legend = obj.legend - legend_func = obj.figure.legend - elif isinstance(obj, mpl.axes.Axes): - old_legend = obj.legend_ - legend_func = obj.legend - elif isinstance(obj, mpl.figure.Figure): - if obj.legends: - old_legend = obj.legends[-1] - else: - old_legend = None - legend_func = obj.legend - else: - err = "`obj` must be a seaborn Grid or matplotlib Axes or Figure instance." - raise TypeError(err) - - if old_legend is None: - err = f"{obj} has no legend attached." - raise ValueError(err) - - # Extract the components of the legend we need to reuse - # Import here to avoid a circular import - from seaborn._compat import get_legend_handles - handles = get_legend_handles(old_legend) - labels = [t.get_text() for t in old_legend.get_texts()] - - # Handle the case where the user is trying to override the labels - if (new_labels := kwargs.pop("labels", None)) is not None: - if len(new_labels) != len(labels): - err = "Length of new labels does not match existing legend." - raise ValueError(err) - labels = new_labels - - # Extract legend properties that can be passed to the recreation method - # (Vexingly, these don't all round-trip) - legend_kws = inspect.signature(mpl.legend.Legend).parameters - props = {k: v for k, v in old_legend.properties().items() if k in legend_kws} - - # Delegate default bbox_to_anchor rules to matplotlib - props.pop("bbox_to_anchor") - - # Try to propagate the existing title and font properties; respect new ones too - title = props.pop("title") - if "title" in kwargs: - title.set_text(kwargs.pop("title")) - title_kwargs = {k: v for k, v in kwargs.items() if k.startswith("title_")} - for key, val in title_kwargs.items(): - title.set(**{key[6:]: val}) - kwargs.pop(key) - - # Try to respect the frame visibility - kwargs.setdefault("frameon", old_legend.legendPatch.get_visible()) - - # Remove the old legend and create the new one - props.update(kwargs) - old_legend.remove() - new_legend = legend_func(handles, labels, loc=loc, **props) - new_legend.set_title(title.get_text(), title.get_fontproperties()) - - # Let the Grid object continue to track the correct legend object - if isinstance(obj, Grid): - obj._legend = new_legend - - -def _kde_support(data, bw, gridsize, cut, clip): - """Establish support for a kernel density estimate.""" - support_min = max(data.min() - bw * cut, clip[0]) - support_max = min(data.max() + bw * cut, clip[1]) - support = np.linspace(support_min, support_max, gridsize) - - return support - - -def ci(a, which=95, axis=None): - """Return a percentile range from an array of values.""" - p = 50 - which / 2, 50 + which / 2 - return np.nanpercentile(a, p, axis) - - -def get_dataset_names(): - """Report available example datasets, useful for reporting issues. - - Requires an internet connection. - - """ - with urlopen(DATASET_NAMES_URL) as resp: - txt = resp.read() - - dataset_names = [name.strip() for name in txt.decode().split("\n")] - return list(filter(None, dataset_names)) - - -def get_data_home(data_home=None): - """Return a path to the cache directory for example datasets. - - This directory is used by :func:`load_dataset`. - - If the ``data_home`` argument is not provided, it will use a directory - specified by the `SEABORN_DATA` environment variable (if it exists) - or otherwise default to an OS-appropriate user cache location. 
-
-    """
-    if data_home is None:
-        data_home = os.environ.get("SEABORN_DATA", user_cache_dir("seaborn"))
-    data_home = os.path.expanduser(data_home)
-    if not os.path.exists(data_home):
-        os.makedirs(data_home)
-    return data_home
-
-
-def load_dataset(name, cache=True, data_home=None, **kws):
-    """Load an example dataset from the online repository (requires internet).
-
-    This function provides quick access to a small number of example datasets
-    that are useful for documenting seaborn or generating reproducible examples
-    for bug reports. It is not necessary for normal usage.
-
-    Note that some of the datasets have a small amount of preprocessing applied
-    to define a proper ordering for categorical variables.
-
-    Use :func:`get_dataset_names` to see a list of available datasets.
-
-    Parameters
-    ----------
-    name : str
-        Name of the dataset (``{name}.csv`` on
-        https://github.com/mwaskom/seaborn-data).
-    cache : boolean, optional
-        If True, try to load from the local cache first, and save to the cache
-        if a download is required.
-    data_home : string, optional
-        The directory in which to cache data; see :func:`get_data_home`.
-    kws : keys and values, optional
-        Additional keyword arguments are passed through to
-        :func:`pandas.read_csv`.
-
-    Returns
-    -------
-    df : :class:`pandas.DataFrame`
-        Tabular data, possibly with some preprocessing applied.
-
-    """
-    # A common beginner mistake is to assume that one's personal data needs
-    # to be passed through this function to be usable with seaborn.
-    # Let's provide a more helpful error than you would otherwise get.
-    if isinstance(name, pd.DataFrame):
-        err = (
-            "This function accepts only strings (the name of an example dataset). "
-            "You passed a pandas DataFrame. If you have your own dataset, "
-            "it is not necessary to use this function before plotting."
- ) - raise TypeError(err) - - url = f"{DATASET_SOURCE}/{name}.csv" - - if cache: - cache_path = os.path.join(get_data_home(data_home), os.path.basename(url)) - if not os.path.exists(cache_path): - if name not in get_dataset_names(): - raise ValueError(f"'{name}' is not one of the example datasets.") - urlretrieve(url, cache_path) - full_path = cache_path - else: - full_path = url - - df = pd.read_csv(full_path, **kws) - - if df.iloc[-1].isnull().all(): - df = df.iloc[:-1] - - # Set some columns as a categorical type with ordered levels - - if name == "tips": - df["day"] = pd.Categorical(df["day"], ["Thur", "Fri", "Sat", "Sun"]) - df["sex"] = pd.Categorical(df["sex"], ["Male", "Female"]) - df["time"] = pd.Categorical(df["time"], ["Lunch", "Dinner"]) - df["smoker"] = pd.Categorical(df["smoker"], ["Yes", "No"]) - - elif name == "flights": - months = df["month"].str[:3] - df["month"] = pd.Categorical(months, months.unique()) - - elif name == "exercise": - df["time"] = pd.Categorical(df["time"], ["1 min", "15 min", "30 min"]) - df["kind"] = pd.Categorical(df["kind"], ["rest", "walking", "running"]) - df["diet"] = pd.Categorical(df["diet"], ["no fat", "low fat"]) - - elif name == "titanic": - df["class"] = pd.Categorical(df["class"], ["First", "Second", "Third"]) - df["deck"] = pd.Categorical(df["deck"], list("ABCDEFG")) - - elif name == "penguins": - df["sex"] = df["sex"].str.title() - - elif name == "diamonds": - df["color"] = pd.Categorical( - df["color"], ["D", "E", "F", "G", "H", "I", "J"], - ) - df["clarity"] = pd.Categorical( - df["clarity"], ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"], - ) - df["cut"] = pd.Categorical( - df["cut"], ["Ideal", "Premium", "Very Good", "Good", "Fair"], - ) - - elif name == "taxis": - df["pickup"] = pd.to_datetime(df["pickup"]) - df["dropoff"] = pd.to_datetime(df["dropoff"]) - - elif name == "seaice": - df["Date"] = pd.to_datetime(df["Date"]) - - elif name == "dowjones": - df["Date"] = pd.to_datetime(df["Date"]) - - return df - - -def axis_ticklabels_overlap(labels): - """Return a boolean for whether the list of ticklabels have overlaps. - - Parameters - ---------- - labels : list of matplotlib ticklabels - - Returns - ------- - overlap : boolean - True if any of the labels overlap. - - """ - if not labels: - return False - try: - bboxes = [l.get_window_extent() for l in labels] - overlaps = [b.count_overlaps(bboxes) for b in bboxes] - return max(overlaps) > 1 - except RuntimeError: - # Issue on macos backend raises an error in the above code - return False - - -def axes_ticklabels_overlap(ax): - """Return booleans for whether the x and y ticklabels on an Axes overlap. - - Parameters - ---------- - ax : matplotlib Axes - - Returns - ------- - x_overlap, y_overlap : booleans - True when the labels on that axis overlap. 
- - """ - return (axis_ticklabels_overlap(ax.get_xticklabels()), - axis_ticklabels_overlap(ax.get_yticklabels())) - - -def locator_to_legend_entries(locator, limits, dtype): - """Return levels and formatted levels for brief numeric legends.""" - raw_levels = locator.tick_values(*limits).astype(dtype) - - # The locator can return ticks outside the limits, clip them here - raw_levels = [l for l in raw_levels if l >= limits[0] and l <= limits[1]] - - class dummy_axis: - def get_view_interval(self): - return limits - - if isinstance(locator, mpl.ticker.LogLocator): - formatter = mpl.ticker.LogFormatter() - else: - formatter = mpl.ticker.ScalarFormatter() - # Avoid having an offset/scientific notation which we don't currently - # have any way of representing in the legend - formatter.set_useOffset(False) - formatter.set_scientific(False) - formatter.axis = dummy_axis() - - formatted_levels = formatter.format_ticks(raw_levels) - - return raw_levels, formatted_levels - - -def relative_luminance(color): - """Calculate the relative luminance of a color according to W3C standards - - Parameters - ---------- - color : matplotlib color or sequence of matplotlib colors - Hex code, rgb-tuple, or html color name. - - Returns - ------- - luminance : float(s) between 0 and 1 - - """ - rgb = mpl.colors.colorConverter.to_rgba_array(color)[:, :3] - rgb = np.where(rgb <= .03928, rgb / 12.92, ((rgb + .055) / 1.055) ** 2.4) - lum = rgb.dot([.2126, .7152, .0722]) - try: - return lum.item() - except ValueError: - return lum - - -def to_utf8(obj): - """Return a string representing a Python object. - - Strings (i.e. type ``str``) are returned unchanged. - - Byte strings (i.e. type ``bytes``) are returned as UTF-8-decoded strings. - - For other objects, the method ``__str__()`` is called, and the result is - returned as a string. - - Parameters - ---------- - obj : object - Any Python object - - Returns - ------- - s : str - UTF-8-decoded string representation of ``obj`` - - """ - if isinstance(obj, str): - return obj - try: - return obj.decode(encoding="utf-8") - except AttributeError: # obj is not bytes-like - return str(obj) - - -def _check_argument(param, options, value, prefix=False): - """Raise if value for param is not in options.""" - if prefix and value is not None: - failure = not any(value.startswith(p) for p in options if isinstance(p, str)) - else: - failure = value not in options - if failure: - raise ValueError( - f"The value for `{param}` must be one of {options}, " - f"but {repr(value)} was passed." - ) - return value - - -def _assign_default_kwargs(kws, call_func, source_func): - """Assign default kwargs for call_func using values from source_func.""" - # This exists so that axes-level functions and figure-level functions can - # both call a Plotter method while having the default kwargs be defined in - # the signature of the axes-level function. - # An alternative would be to have a decorator on the method that sets its - # defaults based on those defined in the axes-level function. - # Then the figure-level function would not need to worry about defaults. - # I am not sure which is better. - needed = inspect.signature(call_func).parameters - defaults = inspect.signature(source_func).parameters - - for param in needed: - if param in defaults and param not in kws: - kws[param] = defaults[param].default - - return kws - - -def adjust_legend_subtitles(legend): - """ - Make invisible-handle "subtitles" entries look more like titles. 
-
-    Note: This function is not part of the public API and may be changed or removed.
-
-    """
-    # Legend title not in rcParams until matplotlib 3.0
-    font_size = plt.rcParams.get("legend.title_fontsize", None)
-    hpackers = legend.findobj(mpl.offsetbox.VPacker)[0].get_children()
-    for hpack in hpackers:
-        draw_area, text_area = hpack.get_children()
-        handles = draw_area.get_children()
-        if not all(artist.get_visible() for artist in handles):
-            draw_area.set_width(0)
-            for text in text_area.get_children():
-                if font_size is not None:
-                    text.set_size(font_size)
-
-
-def _deprecate_ci(errorbar, ci):
-    """
-    Warn on usage of ci= and convert to appropriate errorbar= arg.
-
-    ci was deprecated when errorbar was added in 0.12. It should not be removed
-    completely for some time, but it can be moved out of function definitions
-    (and extracted from kwargs) after one cycle.
-
-    """
-    if ci is not deprecated and ci != "deprecated":
-        if ci is None:
-            errorbar = None
-        elif ci == "sd":
-            errorbar = "sd"
-        else:
-            errorbar = ("ci", ci)
-        msg = (
-            "\n\nThe `ci` parameter is deprecated. "
-            f"Use `errorbar={repr(errorbar)}` for the same effect.\n"
-        )
-        warnings.warn(msg, FutureWarning, stacklevel=3)
-
-    return errorbar
-
-
-def _get_transform_functions(ax, axis):
-    """Return the forward and inverse transforms for a given axis."""
-    axis_obj = getattr(ax, f"{axis}axis")
-    transform = axis_obj.get_transform()
-    return transform.transform, transform.inverted().transform
-
-
-@contextmanager
-def _disable_autolayout():
-    """Context manager for preventing rc-controlled auto-layout behavior."""
-    # This is a workaround for an issue in matplotlib, for details see
-    # https://github.com/mwaskom/seaborn/issues/2914
-    # The only effect of this rcParam is to set the default value for
-    # layout= in plt.figure, so we could just do that instead.
-    # But then we would need to own the complexity of the transition
-    # from tight_layout=True -> layout="tight". This seems easier,
-    # but can be removed when (if) that is simpler on the matplotlib side,
-    # or if the layout algorithms are improved to handle figure legends.
-    orig_val = mpl.rcParams["figure.autolayout"]
-    try:
-        mpl.rcParams["figure.autolayout"] = False
-        yield
-    finally:
-        mpl.rcParams["figure.autolayout"] = orig_val
-
-
-def _version_predates(lib: ModuleType, version: str) -> bool:
-    """Helper function for checking version compatibility."""
-    return Version(lib.__version__) < Version(version)
-
-
-def _scatter_legend_artist(**kws):
-
-    kws = normalize_kwargs(kws, mpl.collections.PathCollection)
-
-    edgecolor = kws.pop("edgecolor", None)
-    rc = mpl.rcParams
-    line_kws = {
-        "linestyle": "",
-        "marker": kws.pop("marker", "o"),
-        "markersize": np.sqrt(kws.pop("s", rc["lines.markersize"] ** 2)),
-        "markerfacecolor": kws.pop("facecolor", kws.get("color")),
-        "markeredgewidth": kws.pop("linewidth", 0),
-        **kws,
-    }
-
-    if edgecolor is not None:
-        if edgecolor == "face":
-            line_kws["markeredgecolor"] = line_kws["markerfacecolor"]
-        else:
-            line_kws["markeredgecolor"] = edgecolor
-
-    return mpl.lines.Line2D([], [], **line_kws)
-
-
-def _get_patch_legend_artist(fill):
-
-    def legend_artist(**kws):
-
-        color = kws.pop("color", None)
-        if color is not None:
-            if fill:
-                kws["facecolor"] = color
-            else:
-                kws["edgecolor"] = color
-                kws["facecolor"] = "none"
-
-        return mpl.patches.Rectangle((0, 0), 0, 0, **kws)
-
-    return legend_artist
diff --git a/seaborn/widgets.py b/seaborn/widgets.py
deleted file mode 100644
index 502812af57f5fa2c7e8163c33f472b594f506c79..0000000000000000000000000000000000000000
--- a/seaborn/widgets.py
+++ /dev/null
@@ -1,426 +0,0 @@
-import numpy as np
-import matplotlib.pyplot as plt
-from matplotlib.colors import LinearSegmentedColormap
-
-try:
-    from ipywidgets import interact, FloatSlider, IntSlider
-except ImportError:
-    def interact(f):
-        msg = "Interactive palettes require `ipywidgets`, which is not installed."
-        raise ImportError(msg)
-
-from .miscplot import palplot
-from .palettes import (color_palette, dark_palette, light_palette,
-                       diverging_palette, cubehelix_palette)
-
-
-__all__ = ["choose_colorbrewer_palette", "choose_cubehelix_palette",
-           "choose_dark_palette", "choose_light_palette",
-           "choose_diverging_palette"]
-
-
-def _init_mutable_colormap():
-    """Create a matplotlib colormap that will be updated by the widgets."""
-    greys = color_palette("Greys", 256)
-    cmap = LinearSegmentedColormap.from_list("interactive", greys)
-    cmap._init()
-    cmap._set_extremes()
-    return cmap
-
-
-def _update_lut(cmap, colors):
-    """Change the LUT values in a matplotlib colormap in-place."""
-    cmap._lut[:256] = colors
-    cmap._set_extremes()
-
-
-def _show_cmap(cmap):
-    """Show a continuous matplotlib colormap."""
-    from .rcmod import axes_style  # Avoid circular import
-    with axes_style("white"):
-        f, ax = plt.subplots(figsize=(8.25, .75))
-    ax.set(xticks=[], yticks=[])
-    x = np.linspace(0, 1, 256)[np.newaxis, :]
-    ax.pcolormesh(x, cmap=cmap)
-
-
-def choose_colorbrewer_palette(data_type, as_cmap=False):
-    """Select a palette from the ColorBrewer set.
-
-    These palettes are built into matplotlib and can be used by name in
-    many seaborn functions, or by passing the object returned by this function.
-
-    Parameters
-    ----------
-    data_type : {'sequential', 'diverging', 'qualitative'}
-        This describes the kind of data you want to visualize. See the seaborn
-        color palette docs for more information about how to choose this value.
-        Note that you can pass substrings (e.g. 'q' for 'qualitative').
-
-    as_cmap : bool
-        If True, the return value is a matplotlib colormap rather than a
-        list of discrete colors.
- - Returns - ------- - pal or cmap : list of colors or matplotlib colormap - Object that can be passed to plotting functions. - - See Also - -------- - dark_palette : Create a sequential palette with dark low values. - light_palette : Create a sequential palette with bright low values. - diverging_palette : Create a diverging palette from selected colors. - cubehelix_palette : Create a sequential palette or colormap using the - cubehelix system. - - - """ - if data_type.startswith("q") and as_cmap: - raise ValueError("Qualitative palettes cannot be colormaps.") - - pal = [] - if as_cmap: - cmap = _init_mutable_colormap() - - if data_type.startswith("s"): - opts = ["Greys", "Reds", "Greens", "Blues", "Oranges", "Purples", - "BuGn", "BuPu", "GnBu", "OrRd", "PuBu", "PuRd", "RdPu", "YlGn", - "PuBuGn", "YlGnBu", "YlOrBr", "YlOrRd"] - variants = ["regular", "reverse", "dark"] - - @interact - def choose_sequential(name=opts, n=(2, 18), - desat=FloatSlider(min=0, max=1, value=1), - variant=variants): - if variant == "reverse": - name += "_r" - elif variant == "dark": - name += "_d" - - if as_cmap: - colors = color_palette(name, 256, desat) - _update_lut(cmap, np.c_[colors, np.ones(256)]) - _show_cmap(cmap) - else: - pal[:] = color_palette(name, n, desat) - palplot(pal) - - elif data_type.startswith("d"): - opts = ["RdBu", "RdGy", "PRGn", "PiYG", "BrBG", - "RdYlBu", "RdYlGn", "Spectral"] - variants = ["regular", "reverse"] - - @interact - def choose_diverging(name=opts, n=(2, 16), - desat=FloatSlider(min=0, max=1, value=1), - variant=variants): - if variant == "reverse": - name += "_r" - if as_cmap: - colors = color_palette(name, 256, desat) - _update_lut(cmap, np.c_[colors, np.ones(256)]) - _show_cmap(cmap) - else: - pal[:] = color_palette(name, n, desat) - palplot(pal) - - elif data_type.startswith("q"): - opts = ["Set1", "Set2", "Set3", "Paired", "Accent", - "Pastel1", "Pastel2", "Dark2"] - - @interact - def choose_qualitative(name=opts, n=(2, 16), - desat=FloatSlider(min=0, max=1, value=1)): - pal[:] = color_palette(name, n, desat) - palplot(pal) - - if as_cmap: - return cmap - return pal - - -def choose_dark_palette(input="husl", as_cmap=False): - """Launch an interactive widget to create a dark sequential palette. - - This corresponds with the :func:`dark_palette` function. This kind - of palette is good for data that range between relatively uninteresting - low values and interesting high values. - - Requires IPython 2+ and must be used in the notebook. - - Parameters - ---------- - input : {'husl', 'hls', 'rgb'} - Color space for defining the seed value. Note that the default is - different than the default input for :func:`dark_palette`. - as_cmap : bool - If True, the return value is a matplotlib colormap rather than a - list of discrete colors. - - Returns - ------- - pal or cmap : list of colors or matplotlib colormap - Object that can be passed to plotting functions. - - See Also - -------- - dark_palette : Create a sequential palette with dark low values. - light_palette : Create a sequential palette with bright low values. - cubehelix_palette : Create a sequential palette or colormap using the - cubehelix system. 
- - """ - pal = [] - if as_cmap: - cmap = _init_mutable_colormap() - - if input == "rgb": - @interact - def choose_dark_palette_rgb(r=(0., 1.), - g=(0., 1.), - b=(0., 1.), - n=(3, 17)): - color = r, g, b - if as_cmap: - colors = dark_palette(color, 256, input="rgb") - _update_lut(cmap, colors) - _show_cmap(cmap) - else: - pal[:] = dark_palette(color, n, input="rgb") - palplot(pal) - - elif input == "hls": - @interact - def choose_dark_palette_hls(h=(0., 1.), - l=(0., 1.), # noqa: E741 - s=(0., 1.), - n=(3, 17)): - color = h, l, s - if as_cmap: - colors = dark_palette(color, 256, input="hls") - _update_lut(cmap, colors) - _show_cmap(cmap) - else: - pal[:] = dark_palette(color, n, input="hls") - palplot(pal) - - elif input == "husl": - @interact - def choose_dark_palette_husl(h=(0, 359), - s=(0, 99), - l=(0, 99), # noqa: E741 - n=(3, 17)): - color = h, s, l - if as_cmap: - colors = dark_palette(color, 256, input="husl") - _update_lut(cmap, colors) - _show_cmap(cmap) - else: - pal[:] = dark_palette(color, n, input="husl") - palplot(pal) - - if as_cmap: - return cmap - return pal - - -def choose_light_palette(input="husl", as_cmap=False): - """Launch an interactive widget to create a light sequential palette. - - This corresponds with the :func:`light_palette` function. This kind - of palette is good for data that range between relatively uninteresting - low values and interesting high values. - - Requires IPython 2+ and must be used in the notebook. - - Parameters - ---------- - input : {'husl', 'hls', 'rgb'} - Color space for defining the seed value. Note that the default is - different than the default input for :func:`light_palette`. - as_cmap : bool - If True, the return value is a matplotlib colormap rather than a - list of discrete colors. - - Returns - ------- - pal or cmap : list of colors or matplotlib colormap - Object that can be passed to plotting functions. - - See Also - -------- - light_palette : Create a sequential palette with bright low values. - dark_palette : Create a sequential palette with dark low values. - cubehelix_palette : Create a sequential palette or colormap using the - cubehelix system. - - """ - pal = [] - if as_cmap: - cmap = _init_mutable_colormap() - - if input == "rgb": - @interact - def choose_light_palette_rgb(r=(0., 1.), - g=(0., 1.), - b=(0., 1.), - n=(3, 17)): - color = r, g, b - if as_cmap: - colors = light_palette(color, 256, input="rgb") - _update_lut(cmap, colors) - _show_cmap(cmap) - else: - pal[:] = light_palette(color, n, input="rgb") - palplot(pal) - - elif input == "hls": - @interact - def choose_light_palette_hls(h=(0., 1.), - l=(0., 1.), # noqa: E741 - s=(0., 1.), - n=(3, 17)): - color = h, l, s - if as_cmap: - colors = light_palette(color, 256, input="hls") - _update_lut(cmap, colors) - _show_cmap(cmap) - else: - pal[:] = light_palette(color, n, input="hls") - palplot(pal) - - elif input == "husl": - @interact - def choose_light_palette_husl(h=(0, 359), - s=(0, 99), - l=(0, 99), # noqa: E741 - n=(3, 17)): - color = h, s, l - if as_cmap: - colors = light_palette(color, 256, input="husl") - _update_lut(cmap, colors) - _show_cmap(cmap) - else: - pal[:] = light_palette(color, n, input="husl") - palplot(pal) - - if as_cmap: - return cmap - return pal - - -def choose_diverging_palette(as_cmap=False): - """Launch an interactive widget to choose a diverging color palette. - - This corresponds with the :func:`diverging_palette` function. 
-    This kind of palette is good for data that range between interesting
-    low values and interesting high values with a meaningful midpoint.
-    (For example, change scores relative to some baseline value).
-
-    Requires IPython 2+ and must be used in the notebook.
-
-    Parameters
-    ----------
-    as_cmap : bool
-        If True, the return value is a matplotlib colormap rather than a
-        list of discrete colors.
-
-    Returns
-    -------
-    pal or cmap : list of colors or matplotlib colormap
-        Object that can be passed to plotting functions.
-
-    See Also
-    --------
-    diverging_palette : Create a diverging color palette or colormap.
-    choose_colorbrewer_palette : Interactively choose palettes from the
-        colorbrewer set, including diverging palettes.
-
-    """
-    pal = []
-    if as_cmap:
-        cmap = _init_mutable_colormap()
-
-    @interact
-    def choose_diverging_palette(
-        h_neg=IntSlider(min=0,
-                        max=359,
-                        value=220),
-        h_pos=IntSlider(min=0,
-                        max=359,
-                        value=10),
-        s=IntSlider(min=0, max=99, value=74),
-        l=IntSlider(min=0, max=99, value=50),  # noqa: E741
-        sep=IntSlider(min=1, max=50, value=10),
-        n=(2, 16),
-        center=["light", "dark"]
-    ):
-        if as_cmap:
-            colors = diverging_palette(h_neg, h_pos, s, l, sep, 256, center)
-            _update_lut(cmap, colors)
-            _show_cmap(cmap)
-        else:
-            pal[:] = diverging_palette(h_neg, h_pos, s, l, sep, n, center)
-            palplot(pal)
-
-    if as_cmap:
-        return cmap
-    return pal
-
-
-def choose_cubehelix_palette(as_cmap=False):
-    """Launch an interactive widget to create a sequential cubehelix palette.
-
-    This corresponds with the :func:`cubehelix_palette` function. This kind
-    of palette is good for data that range between relatively uninteresting
-    low values and interesting high values. The cubehelix system allows the
-    palette to have more hue variance across the range, which can be helpful
-    for distinguishing a wider range of values.
-
-    Requires IPython 2+ and must be used in the notebook.
-
-    Parameters
-    ----------
-    as_cmap : bool
-        If True, the return value is a matplotlib colormap rather than a
-        list of discrete colors.
-
-    Returns
-    -------
-    pal or cmap : list of colors or matplotlib colormap
-        Object that can be passed to plotting functions.
-
-    See Also
-    --------
-    cubehelix_palette : Create a sequential palette or colormap using the
-        cubehelix system.
-
-    """
-    pal = []
-    if as_cmap:
-        cmap = _init_mutable_colormap()
-
-    @interact
-    def choose_cubehelix(n_colors=IntSlider(min=2, max=16, value=9),
-                         start=FloatSlider(min=0, max=3, value=0),
-                         rot=FloatSlider(min=-1, max=1, value=.4),
-                         gamma=FloatSlider(min=0, max=5, value=1),
-                         hue=FloatSlider(min=0, max=1, value=.8),
-                         light=FloatSlider(min=0, max=1, value=.85),
-                         dark=FloatSlider(min=0, max=1, value=.15),
-                         reverse=False):
-
-        if as_cmap:
-            colors = cubehelix_palette(256, start, rot, gamma,
-                                       hue, light, dark, reverse)
-            _update_lut(cmap, np.c_[colors, np.ones(256)])
-            _show_cmap(cmap)
-        else:
-            pal[:] = cubehelix_palette(n_colors, start, rot, gamma,
-                                       hue, light, dark, reverse)
-            palplot(pal)
-
-    if as_cmap:
-        return cmap
-    return pal
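
Editor's note: the hunks above delete seaborn/utils.py and seaborn/widgets.py wholesale, so a short usage sketch of the public helpers defined in the removed utils module may help reviewers check behavior parity wherever this functionality moves. This is a minimal editorial sketch, not part of the diff; it assumes a seaborn installation that still exposes these helpers, and `load_dataset` needs an internet connection on first use (results are cached under the directory reported by `get_data_home`).

    import matplotlib
    matplotlib.use("Agg")  # headless backend so the sketch runs without a display
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Pure color helpers: desaturate/saturate/set_hls_values take one color and
    # return an RGB tuple; prop=1 short-circuits to the input color unchanged.
    assert sns.desaturate("red", 1.0) == (1.0, 0.0, 0.0)

    # despine() removes the top/right spines; move_legend() recreates the legend
    # at a new location, copying handles, labels, and title from the old one.
    tips = sns.load_dataset("tips")  # example CSV, downloaded and cached
    ax = sns.scatterplot(data=tips, x="total_bill", y="tip", hue="day")
    sns.despine(ax=ax, trim=True)
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1), title="Day")
    plt.savefig("tips_scatter.png")

The palette choosers in the removed seaborn/widgets.py are notebook-only (they require ipywidgets) and all follow the same pattern: call, for example, `sns.choose_cubehelix_palette()` in a Jupyter cell and keep a reference to the returned list, which the slider callbacks update in place.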