From 73a4a45b6034eca31a69f55b6a60050909d66d86 Mon Sep 17 00:00:00 2001
From: James Verbus
Date: Thu, 30 May 2024 10:06:27 -0700
Subject: [PATCH] Fixed build warnings due to upcoming Scala 3 changes and
 previously deprecated methods. (#52)

---
 .../isolationforest/IsolationForest.scala     |  8 ++---
 .../IsolationForestModelReadWrite.scala       |  4 +--
 .../isolationforest/IsolationTree.scala       |  4 +--
 .../isolationforest/BaggedPointTest.scala     | 16 +++++-----
 .../IsolationForestModelWriteReadTest.scala   | 32 +++++++++----------
 .../isolationforest/IsolationForestTest.scala | 12 +++----
 .../relevance/isolationforest/TestUtils.scala |  2 +-
 7 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
index 4894db0..e885339 100644
--- a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
+++ b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForest.scala
@@ -46,7 +46,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
 
     // Validate $(maxFeatures) and $(maxSamples) against input dataset and determine the values
     // actually used to train the model: numFeatures and numSamples
-    val totalNumFeatures = dataset.head.features.length
+    val totalNumFeatures = dataset.head().features.length
     val numFeatures = if ($(maxFeatures) > 1.0) {
       math.floor($(maxFeatures)).toInt
     } else {
@@ -94,7 +94,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
       .partitionBy(new HashPartitioner($(numEstimators)))
     val repartitionedFlattenedSampledDataset = repartitionedFlattenedSampledRdd
       .mapPartitions(x => x.map(y => y._2), preservesPartitioning = true)
-      .toDS
+      .toDS()
 
     logInfo(s"Training ${$(numEstimators)} isolation trees using" +
       s" ${repartitionedFlattenedSampledDataset.rdd.getNumPartitions} partitions.")
@@ -106,7 +106,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
       // Use a different seed for each partition to ensure each partition has an independent set of
       // random numbers. This ensures each tree is truly trained independently and doing so has a
       // measurable effect on the results.
-      val seed = $(randomSeed) + TaskContext.get.partitionId() + 2
+      val seed = $(randomSeed) + TaskContext.get().partitionId() + 2
      val rnd = new scala.util.Random(seed)
 
       val dataForTree = rnd.shuffle(x.toSeq).slice(0, numSamples).toArray
@@ -124,7 +124,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
       // random numbers. This ensures each tree is truly trained independently and doing so has a
       // measurable effect on the results.
       Iterator(IsolationTree
-        .fit(dataForTree, $(randomSeed) + $(numEstimators) + TaskContext.get.partitionId() + 2, featureIndices))
+        .fit(dataForTree, $(randomSeed) + $(numEstimators) + TaskContext.get().partitionId() + 2, featureIndices))
     }).collect()
 
     val isolationForestModel = copyValues(
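Most of the hunks above replace bare references to nullary methods (`dataset.head`, `.toDS`, `TaskContext.get`) with explicit empty argument lists. Scala 2.13 deprecates this "auto-application" of methods declared with `()`, and Scala 3 disallows it for Scala-defined methods. A minimal sketch of the rule; `Counter` and `current` are hypothetical names, not from this repository:

```scala
// Auto-application sketch; illustrative names only.
class Counter {
  def current(): Int = 42 // declared with an empty parameter list
}

object AutoApplicationDemo {
  def main(args: Array[String]): Unit = {
    val c = new Counter
    // val n = c.current // warns in Scala 2.13 ("auto-application ... is deprecated"); rejected by Scala 3
    val n = c.current()  // call sites supply the parentheses the declaration has
    println(n)
  }
}
```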
diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModelReadWrite.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModelReadWrite.scala
index 4b593c3..aedaa8f 100644
--- a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModelReadWrite.scala
+++ b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationForestModelReadWrite.scala
@@ -254,7 +254,7 @@ private[isolationforest] case object IsolationForestModelReadWrite extends Loggi
     saveMetadata(instance, path, spark, Some(extraMetadata))
 
     val dataPath = new Path(path, "data").toString
-    val nodeDataRDD = spark.sparkContext.parallelize(instance.isolationTrees.zipWithIndex)
+    val nodeDataRDD = spark.sparkContext.parallelize(instance.isolationTrees.zipWithIndex.toIndexedSeq)
       .flatMap { case (tree, treeID) => EnsembleNodeData.build(tree, treeID) }
     logInfo(s"Saving IsolationForestModel tree data to path ${dataPath}")
     spark.createDataFrame(nodeDataRDD)
@@ -299,7 +299,7 @@ private[isolationforest] case object IsolationForestModelReadWrite extends Loggi
 
     val uid = instance.uid
     val cls = instance.getClass.getName
-    val params = instance.extractParamMap.toSeq
+    val params = instance.extractParamMap().toSeq
     val jsonParams = render(params.map { case ParamPair(p, v) =>
       p.name -> parse(p.jsonEncode(v))
     }.toList)
diff --git a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationTree.scala b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationTree.scala
index 94075cb..e99f251 100644
--- a/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationTree.scala
+++ b/isolation-forest/src/main/scala/com/linkedin/relevance/isolationforest/IsolationTree.scala
@@ -47,7 +47,7 @@ private[isolationforest] case object IsolationTree extends Logging {
   def fit(data: Array[DataPoint], randomSeed: Long, featureIndices: Array[Int]): IsolationTree = {
 
     logInfo(s"Fitting isolation tree with random seed ${randomSeed} on" +
-      s" ${featureIndices.seq.toString} features (indices) using ${data.length} data points.")
+      s" ${featureIndices.toIndexedSeq.toString} features (indices) using ${data.length} data points.")
 
     def log2(x: Double): Double = math.log10(x) / math.log10(2.0)
     val heightLimit = math.ceil(log2(data.length.toDouble)).toInt
@@ -124,7 +124,7 @@ private[isolationforest] case object IsolationTree extends Logging {
         if (minFeatureValue != maxFeatureValue) {
           foundFeature = true
           featureIndex = featureIndexTrial
-          featureSplitValue = ((maxFeatureValue - minFeatureValue) * randomState.nextDouble
+          featureSplitValue = ((maxFeatureValue - minFeatureValue) * randomState.nextDouble()
             + minFeatureValue)
         }
       }
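The `parallelize(....toIndexedSeq)` change above addresses a different Scala 2.13 deprecation: `SparkContext.parallelize` takes a `Seq[T]`, and the implicit widening of an `Array` to an immutable `IndexedSeq` now warns because it silently copies the array, so the patch makes the copy explicit. The `featureIndices.seq` replacement is in the same family, since `.seq` is a deprecated remnant of the old parallel-collections API. A minimal sketch, assuming a local SparkSession; names such as `demoData` are illustrative:

```scala
// Explicit Array-to-Seq copy sketch; `demoData` is a hypothetical value.
import org.apache.spark.sql.SparkSession

object ToIndexedSeqDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("demo").getOrCreate()
    val demoData = Array(1.0, 2.0, 3.0)

    // Passing `demoData` directly relies on an implicit Array => immutable.IndexedSeq
    // conversion that Scala 2.13 deprecates because it copies behind the scenes.
    val rdd = spark.sparkContext.parallelize(demoData.toIndexedSeq) // copy made explicit

    println(rdd.sum())
    spark.stop()
  }
}
```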
diff --git a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/BaggedPointTest.scala b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/BaggedPointTest.scala
index 0e52016..ff3f638 100644
--- a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/BaggedPointTest.scala
+++ b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/BaggedPointTest.scala
@@ -45,7 +45,7 @@ class BaggedPointTest {
       numCols: Int,
       expectedMean: Double,
       expectedStddev: Double,
-      epsilon: Double) {
+      epsilon: Double): Unit = {
 
     val values = new mutable.ArrayBuffer[Double]()
     data.foreach { row =>
@@ -63,7 +63,7 @@ class BaggedPointTest {
     val spark = getSparkSession
 
     val dataPointArray = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(dataPointArray)
+    val rdd = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
     val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
     baggedRDD.collect().foreach { baggedPoint =>
       assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
@@ -80,7 +80,7 @@ class BaggedPointTest {
     val seeds = Array(123, 5354, 230, 349867, 23987)
 
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -101,7 +101,7 @@ class BaggedPointTest {
     val seeds = Array(123, 5354, 230, 349867, 23987)
 
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -121,7 +121,7 @@ class BaggedPointTest {
     val seeds = Array(123, 5354, 230, 349867, 23987)
 
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -142,7 +142,7 @@ class BaggedPointTest {
     val seeds = Array(123, 5354, 230, 349867, 23987)
 
     val arr = generateDataPoints(1, 1000)
-    val rdd = spark.sparkContext.parallelize(arr)
+    val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
     seeds.foreach { seed =>
       val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
       val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -168,7 +168,7 @@ class BaggedPointTest {
       (1, dataPointArray(1)),
       (1, dataPointArray(1)))
 
-    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray)
+    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
     val baggedPointRDD = dataPointRDD.map(x => new BaggedPoint(x, subsampleWeights))
     val flattenedBaggedPointRDD = BaggedPoint.flattenBaggedRDD(baggedPointRDD, 1L)
     val flattenedBaggedPointArray = flattenedBaggedPointRDD.collect()
@@ -187,7 +187,7 @@ class BaggedPointTest {
     val dataPointArray = generateDataPoints(10, numRecords)
     val subsampleWeights = Array(1.3, 1.75)
 
-    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray)
+    val dataPointRDD = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
     val baggedPointRDD = dataPointRDD.map(x => new BaggedPoint(x, subsampleWeights))
     val flattenedBaggedPointRDD = BaggedPoint.flattenBaggedRDD(baggedPointRDD, 1L)
     val flattenedBaggedPointArray = flattenedBaggedPointRDD.collect()
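The signature change in the first hunk above drops procedure syntax (`def f(...) { ... }`), which Scala 2.13 deprecates and Scala 3 removes; the method body must be introduced by an explicit `: Unit =`. A minimal sketch with a hypothetical helper, not taken from the test suite:

```scala
// Procedure-syntax sketch; `logTotal` is illustrative only.
object ProcedureSyntaxDemo {
  // Before (procedure syntax, removed in Scala 3):
  //   def logTotal(values: Seq[Double]) { println(values.sum) }

  // After: explicit result type and `=` before the body.
  def logTotal(values: Seq[Double]): Unit = {
    println(s"total = ${values.sum}")
  }

  def main(args: Array[String]): Unit = logTotal(Seq(1.0, 2.0, 3.0))
}
```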
diff --git a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestModelWriteReadTest.scala b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestModelWriteReadTest.scala
index 96d3d8c..dd94e09 100644
--- a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestModelWriteReadTest.scala
+++ b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestModelWriteReadTest.scala
@@ -38,14 +38,14 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModel"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
 
     // Assert that all parameter values are equal
     Assert.assertEquals(
-      isolationForestModel1.extractParamMap.toString,
-      isolationForestModel2.extractParamMap.toString)
+      isolationForestModel1.extractParamMap().toString,
+      isolationForestModel2.extractParamMap().toString)
     Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
     Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
     Assert.assertEquals(
@@ -64,8 +64,8 @@ class IsolationForestModelWriteReadTest extends Logging {
     Assert.assertEquals(auroc1, auroc2)
 
     // Assert the predicted labels are equal
-    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect
-    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect
+    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect()
+    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect()
     Assert.assertEquals(predictedLabels1.toSeq, predictedLabels2.toSeq)
 
     // Compare each tree in the original and saved/loaded model and assert they are equal
@@ -102,14 +102,14 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModelZeroContamination"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
 
     // Assert that all parameter values are equal
     Assert.assertEquals(
-      isolationForestModel1.extractParamMap.toString,
-      isolationForestModel2.extractParamMap.toString)
+      isolationForestModel1.extractParamMap().toString,
+      isolationForestModel2.extractParamMap().toString)
     Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
     Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
     Assert.assertEquals(
@@ -128,8 +128,8 @@ class IsolationForestModelWriteReadTest extends Logging {
     Assert.assertEquals(auroc1, auroc2)
 
     // Assert the predicted labels are equal and always 0.0
-    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect
-    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect
+    val predictedLabels1 = scores1.map(x => x.predictedLabel).collect()
+    val predictedLabels2 = scores2.map(x => x.predictedLabel).collect()
     val expectedLabels = Array.fill[Double](predictedLabels1.length)(0.0)
     Assert.assertEquals(predictedLabels1.toSeq, predictedLabels2.toSeq)
     Assert.assertEquals(predictedLabels2.toSeq, expectedLabels.toSeq)
@@ -182,7 +182,7 @@ class IsolationForestModelWriteReadTest extends Logging {
 
     // Write the trained model to disk and then read it back from disk
     val savePath = System.getProperty("java.io.tmpdir") +
       "/savedIsolationForestModelIdenticalFeatures"
-    isolationForestModel1.write.overwrite.save(savePath)
+    isolationForestModel1.write.overwrite().save(savePath)
     val isolationForestModel2 = IsolationForestModel.load(savePath)
     deleteDirectory(new File(savePath))
"/savedIsolationForestModelIdenticalFeatures" - isolationForestModel1.write.overwrite.save(savePath) + isolationForestModel1.write.overwrite().save(savePath) val isolationForestModel2 = IsolationForestModel.load(savePath) deleteDirectory(new File(savePath)) @@ -197,8 +197,8 @@ class IsolationForestModelWriteReadTest extends Logging { val scores2 = isolationForestModel2.transform(data).as[ScoringResult] Assert.assertEquals( - scores1.map(x => x.outlierScore).collect.toSeq, - scores2.map(x => x.outlierScore).collect.toSeq) + scores1.map(x => x.outlierScore).collect().toSeq, + scores2.map(x => x.outlierScore).collect().toSeq) spark.stop() } @@ -214,14 +214,14 @@ class IsolationForestModelWriteReadTest extends Logging { // Write the trained model to disk and then read it back from disk val savePath = System.getProperty("java.io.tmpdir") + "/emptyIsolationForestModelWriteReadTest" - isolationForestModel1.write.overwrite.save(savePath) + isolationForestModel1.write.overwrite().save(savePath) val isolationForestModel2 = IsolationForestModel.load(savePath) deleteDirectory(new File(savePath)) // Assert that all parameter values are equal Assert.assertEquals( - isolationForestModel1.extractParamMap.toString, - isolationForestModel2.extractParamMap.toString) + isolationForestModel1.extractParamMap().toString, + isolationForestModel2.extractParamMap().toString) Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples) Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures) Assert.assertEquals( diff --git a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestTest.scala b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestTest.scala index 0ba5371..10435ca 100644 --- a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestTest.scala +++ b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/IsolationForestTest.scala @@ -32,13 +32,13 @@ class IsolationForestTest { .setContaminationError(contamination * 0.01) .setRandomSeed(1) - isolationForest1.write.overwrite.save(savePath) + isolationForest1.write.overwrite().save(savePath) val isolationForest2 = IsolationForest.load(savePath) deleteDirectory(new File(savePath)) Assert.assertEquals( - isolationForest1.extractParamMap.toString, - isolationForest2.extractParamMap.toString) + isolationForest1.extractParamMap().toString, + isolationForest2.extractParamMap().toString) spark.stop() } @@ -148,7 +148,7 @@ class IsolationForestTest { // Calculate area under ROC curve and assert val scores = isolationForestModel.transform(data).as[ScoringResult] - val predictedLabels = scores.map(x => x.predictedLabel).collect + val predictedLabels = scores.map(x => x.predictedLabel).collect() val expectedLabels = Array.fill[Double](predictedLabels.length)(0.0) Assert.assertEquals( @@ -195,10 +195,10 @@ class IsolationForestTest { val labeledOutlierScoresMean = labeledOutlierScores .map(_.outlierScore) - .reduce(_+_) / labeledOutlierScores.count + .reduce(_+_) / labeledOutlierScores.count() val labeledInlierScoresMean = labeledInlierScores .map(_.outlierScore) - .reduce(_+_) / labeledInlierScores.count + .reduce(_+_) / labeledInlierScores.count() val uncert = 0.02 val expectedOutlierScoreMean = 0.61 diff --git a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/TestUtils.scala b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/TestUtils.scala 
index 608dca4..463423b 100644
--- a/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/TestUtils.scala
+++ b/isolation-forest/src/test/scala/com/linkedin/relevance/isolationforest/TestUtils.scala
@@ -28,7 +28,7 @@ object TestUtils {
   }
 
   // local context with 4 threads
-  SparkSession.builder
+  SparkSession.builder()
     .master("local[4]")
     .appName("testing-spark")
     .config(sparkConf)
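The remaining hunks, including the `SparkSession.builder()` change above, are further instances of the same auto-application rule. Warnings of all three kinds can be surfaced ahead of a Scala 3 migration by compiling with deprecation warnings and the Scala 2.13 `-Xsource:3` flag. A sketch in sbt syntax; sbt and the version number are assumptions, and this project's actual build may configure scalac differently:

```scala
// build.sbt sketch (assumes an sbt build; the Scala version is illustrative).
scalaVersion := "2.13.14"
scalacOptions ++= Seq(
  "-deprecation", // report uses of deprecated APIs, e.g. auto-application and `.seq`
  "-Xsource:3"    // flag Scala 2 constructs that Scala 3 no longer accepts
)
```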