Source code for deepmatcher.data.process

import copy
import io
import logging
import os
from timeit import default_timer as timer

import six

from torchtext.utils import unicode_csv_reader

from .dataset import MatchingDataset
from .field import MatchingField

logger = logging.getLogger(__name__)


def _check_header(header, id_attr, left_prefix, right_prefix, label_attr, ignore_columns):
    r"""Verify CSV file header.

    Checks that:
    * There is a label column
    * There is an ID column
    * All columns except the label and ID columns, and ignored columns start with either
        the left table attribute prefix or the right table attribute prefix.
    * The number of left and right table attributes are the same.
    """
    # assert id_attr in header
    if label_attr:
        assert label_attr in header

    for attr in header:
        if attr not in (id_attr, label_attr) and attr not in ignore_columns:
            if not attr.startswith(left_prefix) and not attr.startswith(right_prefix):
                raise ValueError('Attribute "' + attr + '" is not a left or right '
                                 'table column, is not the label or ID column, and is '
                                 'not in the ignored columns.')

    num_left = sum(attr.startswith(left_prefix) for attr in header)
    num_right = sum(attr.startswith(right_prefix) for attr in header)
    assert num_left == num_right
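
As an illustration (not part of the original module), a header that uses the default column naming passes the check, while a column carrying neither prefix raises a ``ValueError``:

# Hypothetical headers for a toy matching task with the default naming.
_check_header(
    header=['id', 'left_name', 'left_price', 'right_name', 'right_price', 'label'],
    id_attr='id', left_prefix='left_', right_prefix='right_',
    label_attr='label', ignore_columns=())   # passes: 2 left and 2 right attributes

_check_header(
    header=['id', 'left_name', 'price', 'right_name', 'label'],
    id_attr='id', left_prefix='left_', right_prefix='right_',
    label_attr='label', ignore_columns=())   # raises ValueError for 'price'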


def _make_fields(header, id_attr, label_attr, ignore_columns, lower, tokenize,
                 include_lengths):
    r"""Create field metadata, i.e., attribute processing specification for each
    attribute.

    This includes fields for label and ID columns.

    Returns:
        list(tuple(str, MatchingField)): A list of tuples containing column name
            (e.g. "left_address") and corresponding :class:`~data.MatchingField` pairs,
            in the same order that the columns occur in the CSV file.
    """
    assert isinstance(tokenize, six.string_types)

    text_field = MatchingField(
        lower=lower,
        tokenize=tokenize,
        init_token='<<<',
        eos_token='>>>',
        batch_first=True,
        include_lengths=include_lengths)
    numeric_field = MatchingField(
        sequential=False, preprocessing=lambda x: int(x), use_vocab=False)
    id_field = MatchingField(sequential=False, use_vocab=False, id=True)

    fields = []
    for attr in header:
        if attr == id_attr:
            fields.append((attr, id_field))
        elif attr == label_attr:
            fields.append((attr, numeric_field))
        elif attr in ignore_columns:
            fields.append((attr, None))
        else:
            fields.append((attr, text_field))
    return fields
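
For illustration (hypothetical header and options, not part of the original module), every attribute column shares a single tokenizing text field, the label column gets the numeric field, the ID column gets the id field, and ignored columns map to ``None``:

fields = _make_fields(
    header=['id', 'left_name', 'right_name', 'source', 'label'],
    id_attr='id', label_attr='label', ignore_columns=('source',),
    lower=True, tokenize='nltk', include_lengths=True)
# Returned in header order:
# [('id', <id MatchingField>), ('left_name', <shared text MatchingField>),
#  ('right_name', <shared text MatchingField>), ('source', None),
#  ('label', <numeric MatchingField>)]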


def _maybe_download_nltk_data():
    import nltk
    nltk.download('perluniprops', quiet=True)
    nltk.download('nonbreaking_prefixes', quiet=True)
    nltk.download('punkt', quiet=True)


def process(path,
            train=None,
            validation=None,
            test=None,
            unlabeled=None,
            cache='cacheddata.pth',
            check_cached_data=True,
            auto_rebuild_cache=True,
            tokenize='nltk',
            lowercase=True,
            embeddings='fasttext.en.bin',
            embeddings_cache_path='~/.vector_cache',
            ignore_columns=(),
            include_lengths=True,
            id_attr='id',
            label_attr='label',
            left_prefix='left_',
            right_prefix='right_',
            pca=True):
    """Creates dataset objects for multiple splits of a dataset.

    This involves the following steps (if data cannot be retrieved from the cache):

    #. Read the CSV header of a data file and verify that it is sane.
    #. Create fields, i.e., column processing specifications (e.g. tokenization, label
       conversion to integers etc.).
    #. Load each data file:

        #. Read each example (tuple pair) in the specified CSV file.
        #. Preprocess the example. Involves lowercasing and tokenization (unless
           disabled).
        #. Compute metadata if this is the training data file. See
           :meth:`MatchingDataset.compute_metadata` for details.

    #. Create a vocabulary consisting of all tokens in all attributes in all datasets.
    #. Download word embedding data if necessary.
    #. Create a mapping from each word in the vocabulary to its word embedding.
    #. Compute metadata.
    #. Write to cache.

    Arguments:
        path (str): Common prefix of the splits' file paths.
        train (str): Suffix to add to path for the train set.
        validation (str): Suffix to add to path for the validation set, or None for no
            validation set. Default is None.
        test (str): Suffix to add to path for the test set, or None for no test set.
            Default is None.
        cache (str): Suffix to add to path for the cache file. If `None`, disables
            caching.
        check_cached_data (bool): Verify that the data files haven't changed since the
            cache was constructed and that relevant field options haven't changed.
        auto_rebuild_cache (bool): Automatically rebuild the cache if the data files are
            modified or if the field options change. Defaults to True.
        lowercase (bool): Whether to lowercase all words in all attributes.
        embeddings (str or list): One or more of the following strings:

            * `fasttext.{lang}.bin`: Uses sub-word level word embeddings based on the
              binary models from "wiki word vectors" released by FastText. {lang} is
              'en' or any other 2 letter ISO 639-1 Language Code, or 3 letter ISO 639-2
              Code, if the language does not have a 2 letter code. 300d vectors.
              ``fasttext.en.bin`` is the default.
            * `fasttext.wiki.vec`: Uses wiki news word vectors released as part of
              "Advances in Pre-Training Distributed Word Representations" by Mikolov
              et al. (2018). 300d vectors.
            * `fasttext.crawl.vec`: Uses Common Crawl word vectors released as part of
              "Advances in Pre-Training Distributed Word Representations" by Mikolov
              et al. (2018). 300d vectors.
            * `glove.6B.{dims}`: Uses uncased GloVe trained on Wiki + Gigaword. {dims}
              is one of (50d, 100d, 200d, or 300d).
            * `glove.42B.300d`: Uses uncased GloVe trained on Common Crawl. 300d
              vectors.
            * `glove.840B.300d`: Uses cased GloVe trained on Common Crawl. 300d vectors.
            * `glove.twitter.27B.{dims}`: Uses uncased GloVe trained on Twitter. {dims}
              is one of (25d, 50d, 100d, or 200d).
        embeddings_cache_path (str): Directory to store downloaded word vector data.
        ignore_columns (list): A list of columns to ignore in the CSV files.
        include_lengths (bool): Whether to provide the model with the lengths of each
            attribute sequence in each batch. If True, length information can be used
            by the neural network, e.g. when picking the last RNN output of each
            attribute sequence.
        id_attr (str): The name of the tuple pair ID column in the CSV file.
        label_attr (str): The name of the tuple pair match label column in the CSV
            file.
        left_prefix (str): The prefix for attribute names belonging to the left table.
        right_prefix (str): The prefix for attribute names belonging to the right
            table.
        pca (bool): Whether to compute PCA for each attribute (needed for the SIF
            model). Defaults to True.

    Returns:
        Tuple[MatchingDataset]: Datasets for the (train, validation, and test) splits,
        in that order, for the splits that were provided.
    """
    if unlabeled is not None:
        raise ValueError('Parameter "unlabeled" has been deprecated, use '
                         '"deepmatcher.data.process_unlabeled" instead.')

    # TODO(Sid): check for all datasets to make sure the files exist and have the same
    # schema.
    a_dataset = train or validation or test
    with io.open(
            os.path.expanduser(os.path.join(path, a_dataset)), encoding="utf8") as f:
        header = next(unicode_csv_reader(f))

    _maybe_download_nltk_data()
    _check_header(header, id_attr, left_prefix, right_prefix, label_attr, ignore_columns)
    fields = _make_fields(header, id_attr, label_attr, ignore_columns, lowercase,
                          tokenize, include_lengths)

    column_naming = {
        'id': id_attr,
        'left': left_prefix,
        'right': right_prefix,
        'label': label_attr
    }

    datasets = MatchingDataset.splits(
        path,
        train,
        validation,
        test,
        fields,
        embeddings,
        embeddings_cache_path,
        column_naming,
        cache,
        check_cached_data,
        auto_rebuild_cache,
        train_pca=pca)

    # Save additional information to the train dataset.
    datasets[0].ignore_columns = ignore_columns
    datasets[0].tokenize = tokenize
    datasets[0].lowercase = lowercase
    datasets[0].include_lengths = include_lengths

    return datasets
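
A minimal usage sketch (not part of this module): assuming CSV files named train.csv, validation.csv, and test.csv live under a hypothetical sample_data/ directory and follow the default 'id' / 'label' / 'left_*' / 'right_*' column naming, the splits can be processed as follows.

import deepmatcher as dm

train, validation, test = dm.data.process(
    path='sample_data',          # hypothetical directory holding the three CSV files
    train='train.csv',
    validation='validation.csv',
    test='test.csv',
    embeddings='fasttext.en.bin',
    cache='cacheddata.pth')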


def process_unlabeled(path, trained_model, ignore_columns=None):
    """Creates a dataset object for an unlabeled dataset.

    Args:
        path (string): The full path to the unlabeled data file (not just the
            directory).
        trained_model (:class:`~deepmatcher.MatchingModel`): The trained model. The
            model is aware of the configuration of the training data on which it was
            trained, and so this method reuses the same configuration for the unlabeled
            data.
        ignore_columns (list): A list of columns to ignore in the unlabeled CSV file.
    """
    with io.open(path, encoding="utf8") as f:
        header = next(unicode_csv_reader(f))

    train_info = trained_model.meta
    if ignore_columns is None:
        ignore_columns = train_info.ignore_columns
    column_naming = dict(train_info.column_naming)
    column_naming['label'] = None

    fields = _make_fields(header, column_naming['id'], column_naming['label'],
                          ignore_columns, train_info.lowercase, train_info.tokenize,
                          train_info.include_lengths)

    begin = timer()
    dataset_args = {'fields': fields, 'column_naming': column_naming}
    dataset = MatchingDataset(path=path, **dataset_args)

    # Make sure we have the same attributes.
    assert set(dataset.all_text_fields) == set(train_info.all_text_fields)

    after_load = timer()
    logger.info('Data load time: {}s'.format(after_load - begin))

    reverse_fields_dict = dict((pair[1], pair[0]) for pair in fields)
    for field, name in reverse_fields_dict.items():
        if field is not None and field.use_vocab:
            # Copy over the vocab from the original train data.
            field.vocab = copy.deepcopy(train_info.vocabs[name])
            # Then extend the vocab.
            field.extend_vocab(
                dataset,
                vectors=train_info.embeddings,
                cache=train_info.embeddings_cache)

    dataset.vocabs = {
        name: dataset.fields[name].vocab
        for name in train_info.all_text_fields
    }

    after_vocab = timer()
    logger.info('Vocab update time: {}s'.format(after_vocab - after_load))

    return dataset
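
A sketch of how this function is typically paired with a trained model, assuming the usual deepmatcher workflow (dm.MatchingModel, load_state, and run_prediction are not defined in this module; file names are hypothetical):

import deepmatcher as dm

model = dm.MatchingModel()
model.load_state('matching_model.pth')       # hypothetical saved training state

unlabeled = dm.data.process_unlabeled(
    path='sample_data/unlabeled.csv',        # hypothetical unlabeled CSV file
    trained_model=model)
predictions = model.run_prediction(unlabeled)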