# Source code for mxnet.contrib.text.utils
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# coding: utf-8
"""Provide utilities for text data processing."""
from __future__ import absolute_import
from __future__ import print_function
import collections
import re
def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
                          to_lower=False, counter_to_update=None):
    """Counts tokens in the specified string.

    For token_delim=\'<td>\' and seq_delim=\'<sd>\', a specified string of two sequences of
    tokens may look like::

        <td>token1<td>token2<td>token3<td><sd><td>token4<td>token5<td><sd>

    <td> and <sd> are regular expressions. Make use of \\\\ to allow special characters as
    delimiters. The list of special characters can be found at
    https://docs.python.org/3/library/re.html.


    Parameters
    ----------
    source_str : str
        A source string of tokens.
    token_delim : str, default ' '
        A token delimiter (regular expression).
    seq_delim : str, default '\\n'
        A sequence delimiter (regular expression).
    to_lower : bool, default False
        Whether to convert the tokens of `source_str` to lower case.
    counter_to_update : collections.Counter or None, default None
        The collections.Counter instance to be updated with the token counts of `source_str`. If
        None, return a new collections.Counter instance counting tokens from `source_str`.


    Returns
    -------
    collections.Counter
        The `counter_to_update` collections.Counter instance after being updated with the token
        counts of `source_str`. If `counter_to_update` is None, return a new collections.Counter
        instance counting tokens from `source_str`.


    Examples
    --------
    >>> source_str = ' Life is great ! \\n life is good . \\n'
    >>> count_tokens_from_str(source_str, ' ', '\\n', True)
    Counter({'life': 2, 'is': 2, 'great': 1, '!': 1, 'good': 1, '.': 1})
    >>> source_str = '*Life*is*great*!*\\n*life*is*good*.*\\n'
    >>> count_tokens_from_str(source_str, '\\\\*', '\\n', True)
    Counter({'life': 2, 'is': 2, 'great': 1, '!': 1, 'good': 1, '.': 1})
    """
    # Both delimiters are regular expressions; the alternation splits on
    # whichever of the two matches.
    pieces = re.split(token_delim + '|' + seq_delim, source_str)

    # Leading, trailing and adjacent delimiters yield empty strings (and
    # unmatched capture groups yield None); neither is a token, so drop them.
    # Lower-casing is applied per token, after splitting.
    tokens = [piece.lower() if to_lower else piece
              for piece in pieces if piece]

    if counter_to_update is None:
        return collections.Counter(tokens)

    # Update the caller's counter in place and hand the same object back.
    counter_to_update.update(tokens)
    return counter_to_update
| | | |