Source code for mmlspark.featurize.text.TextFeaturizer

# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.


import sys
if sys.version >= '3':
    basestring = str

from pyspark.ml.param.shared import *
from pyspark import keyword_only
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from mmlspark.core.serialize.java_params_patch import *
from pyspark.ml.wrapper import JavaTransformer, JavaEstimator, JavaModel
from pyspark.ml.common import inherit_doc
from mmlspark.core.schema.Utils import *

@inherit_doc
class TextFeaturizer(ComplexParamsMixin, JavaMLReadable, JavaMLWritable, JavaEstimator):
    """
    Args:
        binary (bool): If true, all nonzero word counts are set to 1 (default: false)
        caseSensitiveStopWords (bool): Whether to do a case sensitive comparison over the stop words (default: false)
        defaultStopWordLanguage (str): Which language to use for the stop word remover, set this to custom to use the stopWords input (default: english)
        inputCol (str): The name of the input column
        minDocFreq (int): The minimum number of documents in which a term should appear. (default: 1)
        minTokenLength (int): Minimum token length, >= 0. (default: 0)
        nGramLength (int): The size of the n-grams (default: 2)
        numFeatures (int): The number of features to hash each document to (default: 262144)
        outputCol (str): The name of the output column (default: [self.uid]_output)
        stopWords (str): The words to be filtered out.
        toLowercase (bool): Indicates whether to convert all characters to lowercase before tokenizing. (default: true)
        tokenizerGaps (bool): Indicates whether regex splits on gaps (true) or matches tokens (false). (default: true)
        tokenizerPattern (str): Regex pattern used to match delimiters if gaps is true or tokens if gaps is false. (default: \s+)
        useIDF (bool): Whether to scale the Term Frequencies by IDF (default: true)
        useNGram (bool): Whether to enumerate n-grams (default: false)
        useStopWordsRemover (bool): Whether to remove stop words from tokenized data (default: false)
        useTokenizer (bool): Whether to tokenize the input (default: true)
    """

    @keyword_only
    def __init__(self, binary=False, caseSensitiveStopWords=False, defaultStopWordLanguage="english",
                 inputCol=None, minDocFreq=1, minTokenLength=0, nGramLength=2, numFeatures=262144,
                 outputCol=None, stopWords=None, toLowercase=True, tokenizerGaps=True,
                 tokenizerPattern=r"\s+", useIDF=True, useNGram=False, useStopWordsRemover=False,
                 useTokenizer=True):
        super(TextFeaturizer, self).__init__()
        self._java_obj = self._new_java_obj("com.microsoft.ml.spark.featurize.text.TextFeaturizer")
        self.binary = Param(self, "binary", "binary: If true, all nonzero word counts are set to 1 (default: false)")
        self._setDefault(binary=False)
        self.caseSensitiveStopWords = Param(self, "caseSensitiveStopWords", "caseSensitiveStopWords: Whether to do a case sensitive comparison over the stop words (default: false)")
        self._setDefault(caseSensitiveStopWords=False)
        self.defaultStopWordLanguage = Param(self, "defaultStopWordLanguage", "defaultStopWordLanguage: Which language to use for the stop word remover, set this to custom to use the stopWords input (default: english)")
        self._setDefault(defaultStopWordLanguage="english")
        self.inputCol = Param(self, "inputCol", "inputCol: The name of the input column")
        self.minDocFreq = Param(self, "minDocFreq", "minDocFreq: The minimum number of documents in which a term should appear. (default: 1)")
        self._setDefault(minDocFreq=1)
        self.minTokenLength = Param(self, "minTokenLength", "minTokenLength: Minimum token length, >= 0. (default: 0)")
        self._setDefault(minTokenLength=0)
        self.nGramLength = Param(self, "nGramLength", "nGramLength: The size of the n-grams (default: 2)")
        self._setDefault(nGramLength=2)
        self.numFeatures = Param(self, "numFeatures", "numFeatures: The number of features to hash each document to (default: 262144)")
        self._setDefault(numFeatures=262144)
        self.outputCol = Param(self, "outputCol", "outputCol: The name of the output column (default: [self.uid]_output)")
        self._setDefault(outputCol=self.uid + "_output")
        self.stopWords = Param(self, "stopWords", "stopWords: The words to be filtered out.")
        self.toLowercase = Param(self, "toLowercase", "toLowercase: Indicates whether to convert all characters to lowercase before tokenizing. (default: true)")
        self._setDefault(toLowercase=True)
        self.tokenizerGaps = Param(self, "tokenizerGaps", "tokenizerGaps: Indicates whether regex splits on gaps (true) or matches tokens (false). (default: true)")
        self._setDefault(tokenizerGaps=True)
        self.tokenizerPattern = Param(self, "tokenizerPattern", "tokenizerPattern: Regex pattern used to match delimiters if gaps is true or tokens if gaps is false. (default: \s+)")
        self._setDefault(tokenizerPattern=r"\s+")
        self.useIDF = Param(self, "useIDF", "useIDF: Whether to scale the Term Frequencies by IDF (default: true)")
        self._setDefault(useIDF=True)
        self.useNGram = Param(self, "useNGram", "useNGram: Whether to enumerate n-grams (default: false)")
        self._setDefault(useNGram=False)
        self.useStopWordsRemover = Param(self, "useStopWordsRemover", "useStopWordsRemover: Whether to remove stop words from tokenized data (default: false)")
        self._setDefault(useStopWordsRemover=False)
        self.useTokenizer = Param(self, "useTokenizer", "useTokenizer: Whether to tokenize the input (default: true)")
        self._setDefault(useTokenizer=True)
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)
    @keyword_only
    def setParams(self, binary=False, caseSensitiveStopWords=False, defaultStopWordLanguage="english",
                  inputCol=None, minDocFreq=1, minTokenLength=0, nGramLength=2, numFeatures=262144,
                  outputCol=None, stopWords=None, toLowercase=True, tokenizerGaps=True,
                  tokenizerPattern=r"\s+", useIDF=True, useNGram=False, useStopWordsRemover=False,
                  useTokenizer=True):
        """
        Set the (keyword only) parameters

        Args:
            binary (bool): If true, all nonzero word counts are set to 1 (default: false)
            caseSensitiveStopWords (bool): Whether to do a case sensitive comparison over the stop words (default: false)
            defaultStopWordLanguage (str): Which language to use for the stop word remover, set this to custom to use the stopWords input (default: english)
            inputCol (str): The name of the input column
            minDocFreq (int): The minimum number of documents in which a term should appear. (default: 1)
            minTokenLength (int): Minimum token length, >= 0. (default: 0)
            nGramLength (int): The size of the n-grams (default: 2)
            numFeatures (int): The number of features to hash each document to (default: 262144)
            outputCol (str): The name of the output column (default: [self.uid]_output)
            stopWords (str): The words to be filtered out.
            toLowercase (bool): Indicates whether to convert all characters to lowercase before tokenizing. (default: true)
            tokenizerGaps (bool): Indicates whether regex splits on gaps (true) or matches tokens (false). (default: true)
            tokenizerPattern (str): Regex pattern used to match delimiters if gaps is true or tokens if gaps is false. (default: \s+)
            useIDF (bool): Whether to scale the Term Frequencies by IDF (default: true)
            useNGram (bool): Whether to enumerate n-grams (default: false)
            useStopWordsRemover (bool): Whether to remove stop words from tokenized data (default: false)
            useTokenizer (bool): Whether to tokenize the input (default: true)
        """
        if hasattr(self, "_input_kwargs"):
            kwargs = self._input_kwargs
        else:
            kwargs = self.__init__._input_kwargs
        return self._set(**kwargs)
    def setBinary(self, value):
        """
        Args:
            binary (bool): If true, all nonzero word counts are set to 1 (default: false)
        """
        self._set(binary=value)
        return self

    def getBinary(self):
        """
        Returns:
            bool: If true, all nonzero word counts are set to 1 (default: false)
        """
        return self.getOrDefault(self.binary)
    def setCaseSensitiveStopWords(self, value):
        """
        Args:
            caseSensitiveStopWords (bool): Whether to do a case sensitive comparison over the stop words (default: false)
        """
        self._set(caseSensitiveStopWords=value)
        return self

    def getCaseSensitiveStopWords(self):
        """
        Returns:
            bool: Whether to do a case sensitive comparison over the stop words (default: false)
        """
        return self.getOrDefault(self.caseSensitiveStopWords)

    def setDefaultStopWordLanguage(self, value):
        """
        Args:
            defaultStopWordLanguage (str): Which language to use for the stop word remover, set this to custom to use the stopWords input (default: english)
        """
        self._set(defaultStopWordLanguage=value)
        return self

    def getDefaultStopWordLanguage(self):
        """
        Returns:
            str: Which language to use for the stop word remover, set this to custom to use the stopWords input (default: english)
        """
        return self.getOrDefault(self.defaultStopWordLanguage)

    def setInputCol(self, value):
        """
        Args:
            inputCol (str): The name of the input column
        """
        self._set(inputCol=value)
        return self

    def getInputCol(self):
        """
        Returns:
            str: The name of the input column
        """
        return self.getOrDefault(self.inputCol)

    def setMinDocFreq(self, value):
        """
        Args:
            minDocFreq (int): The minimum number of documents in which a term should appear. (default: 1)
        """
        self._set(minDocFreq=value)
        return self

    def getMinDocFreq(self):
        """
        Returns:
            int: The minimum number of documents in which a term should appear. (default: 1)
        """
        return self.getOrDefault(self.minDocFreq)

    def setMinTokenLength(self, value):
        """
        Args:
            minTokenLength (int): Minimum token length, >= 0. (default: 0)
        """
        self._set(minTokenLength=value)
        return self

    def getMinTokenLength(self):
        """
        Returns:
            int: Minimum token length, >= 0. (default: 0)
        """
        return self.getOrDefault(self.minTokenLength)
    def setNGramLength(self, value):
        """
        Args:
            nGramLength (int): The size of the n-grams (default: 2)
        """
        self._set(nGramLength=value)
        return self

    def getNGramLength(self):
        """
        Returns:
            int: The size of the n-grams (default: 2)
        """
        return self.getOrDefault(self.nGramLength)

    def setNumFeatures(self, value):
        """
        Args:
            numFeatures (int): The number of features to hash each document to (default: 262144)
        """
        self._set(numFeatures=value)
        return self

    def getNumFeatures(self):
        """
        Returns:
            int: The number of features to hash each document to (default: 262144)
        """
        return self.getOrDefault(self.numFeatures)
    def setOutputCol(self, value):
        """
        Args:
            outputCol (str): The name of the output column (default: [self.uid]_output)
        """
        self._set(outputCol=value)
        return self

    def getOutputCol(self):
        """
        Returns:
            str: The name of the output column (default: [self.uid]_output)
        """
        return self.getOrDefault(self.outputCol)

    def setStopWords(self, value):
        """
        Args:
            stopWords (str): The words to be filtered out.
        """
        self._set(stopWords=value)
        return self

    def getStopWords(self):
        """
        Returns:
            str: The words to be filtered out.
        """
        return self.getOrDefault(self.stopWords)
    def setToLowercase(self, value):
        """
        Args:
            toLowercase (bool): Indicates whether to convert all characters to lowercase before tokenizing. (default: true)
        """
        self._set(toLowercase=value)
        return self

    def getToLowercase(self):
        """
        Returns:
            bool: Indicates whether to convert all characters to lowercase before tokenizing. (default: true)
        """
        return self.getOrDefault(self.toLowercase)

    def setTokenizerGaps(self, value):
        """
        Args:
            tokenizerGaps (bool): Indicates whether regex splits on gaps (true) or matches tokens (false). (default: true)
        """
        self._set(tokenizerGaps=value)
        return self

    def getTokenizerGaps(self):
        """
        Returns:
            bool: Indicates whether regex splits on gaps (true) or matches tokens (false). (default: true)
        """
        return self.getOrDefault(self.tokenizerGaps)

    def setTokenizerPattern(self, value):
        """
        Args:
            tokenizerPattern (str): Regex pattern used to match delimiters if gaps is true or tokens if gaps is false. (default: \s+)
        """
        self._set(tokenizerPattern=value)
        return self

    def getTokenizerPattern(self):
        """
        Returns:
            str: Regex pattern used to match delimiters if gaps is true or tokens if gaps is false. (default: \s+)
        """
        return self.getOrDefault(self.tokenizerPattern)
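    # Note: the gaps/pattern semantics appear to mirror those of
    # pyspark.ml.feature.RegexTokenizer. With tokenizerGaps=True the pattern
    # describes delimiters (the default r"\s+" splits on whitespace); with
    # tokenizerGaps=False it describes the tokens themselves, e.g. (an
    # illustrative pattern) setTokenizerPattern(r"\w+") to keep runs of word
    # characters.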
    def setUseIDF(self, value):
        """
        Args:
            useIDF (bool): Whether to scale the Term Frequencies by IDF (default: true)
        """
        self._set(useIDF=value)
        return self

    def getUseIDF(self):
        """
        Returns:
            bool: Whether to scale the Term Frequencies by IDF (default: true)
        """
        return self.getOrDefault(self.useIDF)
    def setUseNGram(self, value):
        """
        Args:
            useNGram (bool): Whether to enumerate n-grams (default: false)
        """
        self._set(useNGram=value)
        return self

    def getUseNGram(self):
        """
        Returns:
            bool: Whether to enumerate n-grams (default: false)
        """
        return self.getOrDefault(self.useNGram)
    def setUseStopWordsRemover(self, value):
        """
        Args:
            useStopWordsRemover (bool): Whether to remove stop words from tokenized data (default: false)
        """
        self._set(useStopWordsRemover=value)
        return self

    def getUseStopWordsRemover(self):
        """
        Returns:
            bool: Whether to remove stop words from tokenized data (default: false)
        """
        return self.getOrDefault(self.useStopWordsRemover)

    def setUseTokenizer(self, value):
        """
        Args:
            useTokenizer (bool): Whether to tokenize the input (default: true)
        """
        self._set(useTokenizer=value)
        return self

    def getUseTokenizer(self):
        """
        Returns:
            bool: Whether to tokenize the input (default: true)
        """
        return self.getOrDefault(self.useTokenizer)
    @classmethod
    def read(cls):
        """ Returns an MLReader instance for this class. """
        return JavaMMLReader(cls)

    @staticmethod
    def getJavaPackage():
        """ Returns package name String. """
        return "com.microsoft.ml.spark.featurize.text.TextFeaturizer"

    @staticmethod
    def _from_java(java_stage):
        module_name = TextFeaturizer.__module__
        module_name = module_name.rsplit(".", 1)[0] + ".TextFeaturizer"
        return from_java(java_stage, module_name)

    def _create_model(self, java_model):
        return TextFeaturizerModel(java_model)
class TextFeaturizerModel(ComplexParamsMixin, JavaModel, JavaMLWritable, JavaMLReadable):
    """
    Model fitted by :class:`TextFeaturizer`.

    This class is left empty on purpose.
    All necessary methods are exposed through inheritance.
    """
    @classmethod
    def read(cls):
        """ Returns an MLReader instance for this class. """
        return JavaMMLReader(cls)

    @staticmethod
    def getJavaPackage():
        """ Returns package name String. """
        return "com.microsoft.ml.spark.featurize.text.TextFeaturizerModel"

    @staticmethod
    def _from_java(java_stage):
        module_name = TextFeaturizerModel.__module__
        module_name = module_name.rsplit(".", 1)[0] + ".TextFeaturizerModel"
        return from_java(java_stage, module_name)
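# Usage sketch: a minimal, illustrative example of the standard Estimator
# fit/transform flow. It assumes an active SparkSession `spark` with the
# MMLSpark package on the classpath; the column name "text" and the sample
# rows are hypothetical.
#
#     df = spark.createDataFrame([("Hello world",), ("Spark is great",)], ["text"])
#     featurizer = TextFeaturizer(inputCol="text", outputCol="features",
#                                 useStopWordsRemover=True)
#     model = featurizer.fit(df)          # fit() returns a TextFeaturizerModel
#     model.transform(df).select("features").show(truncate=False)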