Source code for mxnet.contrib.text.utils

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# coding: utf-8

"""Provide utilities for text data processing."""
from __future__ import absolute_import
from __future__ import print_function

import collections
import re


[docs]def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
                          to_lower=False, counter_to_update=None):
    """Counts tokens in the specified string.

    For token_delim=\'\' and seq_delim=\'\', a specified string of two sequences of
    tokens may look like::

    token1token2token3token4token5

     and  are regular expressions. Make use of \\\\ to allow special characters as
    delimiters. The list of
    special characters can be found at https://docs.python.org/3/library/re.html.

    Parameters
    ----------
    source_str : str
        A source string of tokens.
    token_delim : str, default ' '
        A token delimiter.
    seq_delim : str, default '\\\\n'
        A sequence delimiter.
    to_lower : bool, default False
        Whether to convert the source source_str to the lower case.
    counter_to_update : collections.Counter or None, default None
        The collections.Counter instance to be updated with the token counts of `source_str`. If
        None, return a new collections.Counter instance counting tokens from `source_str`.


    Returns
    -------
    collections.Counter
        The `counter_to_update` collections.Counter instance after being updated with the token
        counts of `source_str`. If `counter_to_update` is None, return a new collections.Counter
        instance counting tokens from `source_str`.


    Examples
    --------
    >>> source_str = ' Life is great ! \\n life is good . \\n'
    >>> count_tokens_from_str(token_line, ' ', '\\n', True)
    Counter({'!': 1, '.': 1, 'good': 1, 'great': 1, 'is': 2, 'life': 2})


    >>> source_str = '*Life*is*great*!*\\n*life*is*good*.*\\n'
    >>> count_tokens_from_str(token_line, '\\*', '\\n', True)
    Counter({'is': 2, 'life': 2, '!': 1, 'great': 1, 'good': 1, '.': 1})
    """

    source_str = filter(None,
                        re.split(token_delim + '|' + seq_delim, source_str))
    if to_lower:
        source_str = [t.lower() for t in source_str]

    if counter_to_update is None:
        return collections.Counter(source_str)
    else:
        counter_to_update.update(source_str)
        return counter_to_update