# Copyright (C) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE in project root for information.
import sys
if sys.version >= '3':
basestring = str
from pyspark import SparkContext, SQLContext
from pyspark.sql import DataFrame
from pyspark.ml.param.shared import *
from pyspark import keyword_only
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from synapse.ml.core.serialize.java_params_patch import *
from pyspark.ml.wrapper import JavaTransformer, JavaEstimator, JavaModel
from pyspark.ml.evaluation import JavaEvaluator
from pyspark.ml.common import inherit_doc
from synapse.ml.core.schema.Utils import *
from pyspark.ml.param import TypeConverters
from synapse.ml.core.schema.TypeConversionUtils import generateTypeConverter, complexTypeConverter
from synapse.ml.isolationforest.IsolationForestModel import IsolationForestModel
[docs]@inherit_doc
class IsolationForest(ComplexParamsMixin, JavaMLReadable, JavaMLWritable, JavaEstimator):
"""
Args:
bootstrap (bool): If true, draw sample for each tree with replacement. If false, do not sample with replacement.
contamination (float): The fraction of outliers in the training data set. If this is set to 0.0, it speeds up the training and all predicted labels will be false. The model and outlier scores are otherwise unaffected by this parameter.
contaminationError (float): The error allowed when calculating the threshold required to achieve the specified contamination fraction. The default is 0.0, which forces an exact calculation of the threshold. The exact calculation is slow and can fail for large datasets. If there are issues with the exact calculation, a good choice for this parameter is often 1% of the specified contamination value.
featuresCol (object): The feature vector.
maxFeatures (float): The number of features used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.
maxSamples (float): The number of samples used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.
numEstimators (int): The number of trees in the ensemble.
predictionCol (object): The predicted label.
randomSeed (long): The seed used for the random number generator.
scoreCol (object): The outlier score.
"""
bootstrap = Param(Params._dummy(), "bootstrap", "If true, draw sample for each tree with replacement. If false, do not sample with replacement.", typeConverter=TypeConverters.toBoolean)
contamination = Param(Params._dummy(), "contamination", "The fraction of outliers in the training data set. If this is set to 0.0, it speeds up the training and all predicted labels will be false. The model and outlier scores are otherwise unaffected by this parameter.", typeConverter=TypeConverters.toFloat)
contaminationError = Param(Params._dummy(), "contaminationError", "The error allowed when calculating the threshold required to achieve the specified contamination fraction. The default is 0.0, which forces an exact calculation of the threshold. The exact calculation is slow and can fail for large datasets. If there are issues with the exact calculation, a good choice for this parameter is often 1% of the specified contamination value.", typeConverter=TypeConverters.toFloat)
featuresCol = Param(Params._dummy(), "featuresCol", "The feature vector.")
maxFeatures = Param(Params._dummy(), "maxFeatures", "The number of features used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.", typeConverter=TypeConverters.toFloat)
maxSamples = Param(Params._dummy(), "maxSamples", "The number of samples used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.", typeConverter=TypeConverters.toFloat)
numEstimators = Param(Params._dummy(), "numEstimators", "The number of trees in the ensemble.", typeConverter=TypeConverters.toInt)
predictionCol = Param(Params._dummy(), "predictionCol", "The predicted label.")
randomSeed = Param(Params._dummy(), "randomSeed", "The seed used for the random number generator.")
scoreCol = Param(Params._dummy(), "scoreCol", "The outlier score.")
@keyword_only
def __init__(
self,
java_obj=None,
bootstrap=False,
contamination=0.0,
contaminationError=0.0,
featuresCol="features",
maxFeatures=1.0,
maxSamples=256.0,
numEstimators=100,
predictionCol="predictedLabel",
randomSeed=1,
scoreCol="outlierScore"
):
super(IsolationForest, self).__init__()
if java_obj is None:
self._java_obj = self._new_java_obj("com.microsoft.azure.synapse.ml.isolationforest.IsolationForest", self.uid)
else:
self._java_obj = java_obj
self._setDefault(bootstrap=False)
self._setDefault(contamination=0.0)
self._setDefault(contaminationError=0.0)
self._setDefault(featuresCol="features")
self._setDefault(maxFeatures=1.0)
self._setDefault(maxSamples=256.0)
self._setDefault(numEstimators=100)
self._setDefault(predictionCol="predictedLabel")
self._setDefault(randomSeed=1)
self._setDefault(scoreCol="outlierScore")
if hasattr(self, "_input_kwargs"):
kwargs = self._input_kwargs
else:
kwargs = self.__init__._input_kwargs
if java_obj is None:
for k,v in kwargs.items():
if v is not None:
getattr(self, "set" + k[0].upper() + k[1:])(v)
[docs] @keyword_only
def setParams(
self,
bootstrap=False,
contamination=0.0,
contaminationError=0.0,
featuresCol="features",
maxFeatures=1.0,
maxSamples=256.0,
numEstimators=100,
predictionCol="predictedLabel",
randomSeed=1,
scoreCol="outlierScore"
):
"""
Set the (keyword only) parameters
"""
if hasattr(self, "_input_kwargs"):
kwargs = self._input_kwargs
else:
kwargs = self.__init__._input_kwargs
return self._set(**kwargs)
[docs] @classmethod
def read(cls):
""" Returns an MLReader instance for this class. """
return JavaMMLReader(cls)
[docs] @staticmethod
def getJavaPackage():
""" Returns package name String. """
return "com.microsoft.azure.synapse.ml.isolationforest.IsolationForest"
@staticmethod
def _from_java(java_stage):
module_name=IsolationForest.__module__
module_name=module_name.rsplit(".", 1)[0] + ".IsolationForest"
return from_java(java_stage, module_name)
[docs] def setBootstrap(self, value):
"""
Args:
bootstrap: If true, draw sample for each tree with replacement. If false, do not sample with replacement.
"""
self._set(bootstrap=value)
return self
[docs] def setContamination(self, value):
"""
Args:
contamination: The fraction of outliers in the training data set. If this is set to 0.0, it speeds up the training and all predicted labels will be false. The model and outlier scores are otherwise unaffected by this parameter.
"""
self._set(contamination=value)
return self
[docs] def setContaminationError(self, value):
"""
Args:
contaminationError: The error allowed when calculating the threshold required to achieve the specified contamination fraction. The default is 0.0, which forces an exact calculation of the threshold. The exact calculation is slow and can fail for large datasets. If there are issues with the exact calculation, a good choice for this parameter is often 1% of the specified contamination value.
"""
self._set(contaminationError=value)
return self
[docs] def setFeaturesCol(self, value):
"""
Args:
featuresCol: The feature vector.
"""
self._set(featuresCol=value)
return self
[docs] def setMaxFeatures(self, value):
"""
Args:
maxFeatures: The number of features used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.
"""
self._set(maxFeatures=value)
return self
[docs] def setMaxSamples(self, value):
"""
Args:
maxSamples: The number of samples used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.
"""
self._set(maxSamples=value)
return self
[docs] def setNumEstimators(self, value):
"""
Args:
numEstimators: The number of trees in the ensemble.
"""
self._set(numEstimators=value)
return self
[docs] def setPredictionCol(self, value):
"""
Args:
predictionCol: The predicted label.
"""
self._set(predictionCol=value)
return self
[docs] def setRandomSeed(self, value):
"""
Args:
randomSeed: The seed used for the random number generator.
"""
self._set(randomSeed=value)
return self
[docs] def setScoreCol(self, value):
"""
Args:
scoreCol: The outlier score.
"""
self._set(scoreCol=value)
return self
[docs] def getBootstrap(self):
"""
Returns:
bootstrap: If true, draw sample for each tree with replacement. If false, do not sample with replacement.
"""
return self.getOrDefault(self.bootstrap)
[docs] def getContamination(self):
"""
Returns:
contamination: The fraction of outliers in the training data set. If this is set to 0.0, it speeds up the training and all predicted labels will be false. The model and outlier scores are otherwise unaffected by this parameter.
"""
return self.getOrDefault(self.contamination)
[docs] def getContaminationError(self):
"""
Returns:
contaminationError: The error allowed when calculating the threshold required to achieve the specified contamination fraction. The default is 0.0, which forces an exact calculation of the threshold. The exact calculation is slow and can fail for large datasets. If there are issues with the exact calculation, a good choice for this parameter is often 1% of the specified contamination value.
"""
return self.getOrDefault(self.contaminationError)
[docs] def getFeaturesCol(self):
"""
Returns:
featuresCol: The feature vector.
"""
return self.getOrDefault(self.featuresCol)
[docs] def getMaxFeatures(self):
"""
Returns:
maxFeatures: The number of features used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.
"""
return self.getOrDefault(self.maxFeatures)
[docs] def getMaxSamples(self):
"""
Returns:
maxSamples: The number of samples used to train each tree. If this value is between 0.0 and 1.0, then it is treated as a fraction. If it is >1.0, then it is treated as a count.
"""
return self.getOrDefault(self.maxSamples)
[docs] def getNumEstimators(self):
"""
Returns:
numEstimators: The number of trees in the ensemble.
"""
return self.getOrDefault(self.numEstimators)
[docs] def getPredictionCol(self):
"""
Returns:
predictionCol: The predicted label.
"""
return self.getOrDefault(self.predictionCol)
[docs] def getRandomSeed(self):
"""
Returns:
randomSeed: The seed used for the random number generator.
"""
return self.getOrDefault(self.randomSeed)
[docs] def getScoreCol(self):
"""
Returns:
scoreCol: The outlier score.
"""
return self.getOrDefault(self.scoreCol)
def _create_model(self, java_model):
try:
model = IsolationForestModel(java_obj=java_model)
model._transfer_params_from_java()
except TypeError:
model = IsolationForestModel._from_java(java_model)
return model
def _fit(self, dataset):
java_model = self._fit_java(dataset)
return self._create_model(java_model)