diff --git a/apache-spark/data/iris.data b/apache-spark/data/iris.data new file mode 100644 index 0000000000..396653cc98 --- /dev/null +++ b/apache-spark/data/iris.data @@ -0,0 +1,150 @@ +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/apache-spark/model/logistic-regression/data/._SUCCESS.crc b/apache-spark/model/logistic-regression/data/._SUCCESS.crc new file mode 100644 index 0000000000..3b7b044936 Binary files /dev/null and b/apache-spark/model/logistic-regression/data/._SUCCESS.crc differ diff --git a/apache-spark/model/logistic-regression/data/.part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet.crc b/apache-spark/model/logistic-regression/data/.part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet.crc new file mode 100644 index 0000000000..46311024cf Binary files /dev/null and b/apache-spark/model/logistic-regression/data/.part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet.crc differ diff --git a/apache-spark/model/logistic-regression/data/_SUCCESS b/apache-spark/model/logistic-regression/data/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apache-spark/model/logistic-regression/data/part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet b/apache-spark/model/logistic-regression/data/part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet new file mode 100644 index 0000000000..07f442ef34 Binary files /dev/null and b/apache-spark/model/logistic-regression/data/part-00000-f3a3ee61-f200-41ff-80d9-8e1edf399601-c000.snappy.parquet differ diff --git a/apache-spark/model/logistic-regression/metadata/._SUCCESS.crc b/apache-spark/model/logistic-regression/metadata/._SUCCESS.crc new file mode 100644 index 0000000000..3b7b044936 Binary files /dev/null and b/apache-spark/model/logistic-regression/metadata/._SUCCESS.crc differ diff --git a/apache-spark/model/logistic-regression/metadata/.part-00000.crc b/apache-spark/model/logistic-regression/metadata/.part-00000.crc new file mode 100644 index 0000000000..35d5043fbe Binary files /dev/null and b/apache-spark/model/logistic-regression/metadata/.part-00000.crc differ diff --git a/apache-spark/model/logistic-regression/metadata/_SUCCESS b/apache-spark/model/logistic-regression/metadata/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apache-spark/model/logistic-regression/metadata/part-00000 b/apache-spark/model/logistic-regression/metadata/part-00000 new file mode 100644 index 0000000000..e3f27fec10 --- /dev/null +++ b/apache-spark/model/logistic-regression/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.mllib.classification.LogisticRegressionModel","version":"1.0","numFeatures":4,"numClasses":3} diff --git a/apache-spark/pom.xml b/apache-spark/pom.xml index b8c1962dd4..3df81e5aee 100644 --- a/apache-spark/pom.xml +++ b/apache-spark/pom.xml @@ -1,31 +1,32 @@ - - 4.0.0 - com.baeldung - apache-spark - 1.0-SNAPSHOT - apache-spark - jar - http://maven.apache.org + + 4.0.0 + com.baeldung + apache-spark + 1.0-SNAPSHOT + apache-spark + jar + http://maven.apache.org - - com.baeldung - parent-modules - 1.0.0-SNAPSHOT - + + com.baeldung + parent-modules + 1.0.0-SNAPSHOT + - - - org.apache.spark - spark-core_2.11 - ${org.apache.spark.spark-core.version} - provided - + - org.apache.spark - spark-sql_2.11 - ${org.apache.spark.spark-sql.version} - provided + org.apache.spark + spark-core_2.11 + ${org.apache.spark.spark-core.version} + provided + + + org.apache.spark + spark-sql_2.11 + ${org.apache.spark.spark-sql.version} + provided org.apache.spark @@ -33,6 +34,12 @@ ${org.apache.spark.spark-streaming.version} provided + + org.apache.spark + spark-mllib_2.11 + ${org.apache.spark.spark-mllib.version} + provided + org.apache.spark spark-streaming-kafka-0-10_2.11 @@ -48,46 +55,47 @@ spark-cassandra-connector-java_2.11 ${com.datastax.spark.spark-cassandra-connector-java.version} - + - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${java.version} - ${java.version} - - - - maven-assembly-plugin - - - package - - single - - - - - - jar-with-dependencies - - - - - + + + org.apache.maven.plugins + maven-compiler-plugin + ${maven-compiler-plugin.version} + + ${java.version} + ${java.version} + + + + maven-assembly-plugin + + + package + + single + + + + + + jar-with-dependencies + + + + + - - 2.3.0 - 2.3.0 - 2.3.0 - 2.3.0 - 2.3.0 - 1.5.2 + + 2.3.0 + 2.3.0 + 2.3.0 + 2.3.0 + 2.3.0 + 2.3.0 + 1.5.2 3.2 - + diff --git a/apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java b/apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java new file mode 100644 index 0000000000..6094683031 --- /dev/null +++ b/apache-spark/src/main/java/com/baeldung/ml/MachineLearningApp.java @@ -0,0 +1,111 @@ +package com.baeldung.ml; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.classification.LogisticRegressionModel; +import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS; +import org.apache.spark.mllib.evaluation.MulticlassMetrics; +import org.apache.spark.mllib.linalg.Matrix; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.stat.MultivariateStatisticalSummary; +import org.apache.spark.mllib.stat.Statistics; + +import scala.Tuple2; + +public class MachineLearningApp { + + public static void main(String[] args) { + + // 1. Setting the Spark Context + SparkConf conf = new SparkConf().setAppName("Main") + .setMaster("local[2]") + .set("spark.executor.memory", "3g") + .set("spark.driver.memory", "3g"); + JavaSparkContext sc = new JavaSparkContext(conf); + Logger.getLogger("org") + .setLevel(Level.OFF); + Logger.getLogger("akka") + .setLevel(Level.OFF); + + // 2. Loading the Data-set + String dataFile = "data\\iris.data"; + JavaRDD data = sc.textFile(dataFile); + + // 3. Exploratory Data Analysis + // 3.1. Creating Vector of Input Data + JavaRDD inputData = data.map(line -> { + String[] parts = line.split(","); + double[] v = new double[parts.length - 1]; + for (int i = 0; i < parts.length - 1; i++) { + v[i] = Double.parseDouble(parts[i]); + } + return Vectors.dense(v); + }); + // 3.2. Performing Statistical Analysis + MultivariateStatisticalSummary summary = Statistics.colStats(inputData.rdd()); + System.out.println("Summary Mean:"); + System.out.println(summary.mean()); + System.out.println("Summary Variance:"); + System.out.println(summary.variance()); + System.out.println("Summary Non-zero:"); + System.out.println(summary.numNonzeros()); + // 3.3. Performing Correlation Analysis + Matrix correlMatrix = Statistics.corr(inputData.rdd(), "pearson"); + System.out.println("Correlation Matrix:"); + System.out.println(correlMatrix.toString()); + + // 4. Data Preparation + // 4.1. Creating Map for Textual Output Labels + Map map = new HashMap(); + map.put("Iris-setosa", 0); + map.put("Iris-versicolor", 1); + map.put("Iris-virginica", 2); + // 4.2. Creating LabeledPoint of Input and Output Data + JavaRDD parsedData = data.map(line -> { + String[] parts = line.split(","); + double[] v = new double[parts.length - 1]; + for (int i = 0; i < parts.length - 1; i++) { + v[i] = Double.parseDouble(parts[i]); + } + return new LabeledPoint(map.get(parts[parts.length - 1]), Vectors.dense(v)); + }); + + // 5. Data Splitting into 80% Training and 20% Test Sets + JavaRDD[] splits = parsedData.randomSplit(new double[] { 0.8, 0.2 }, 11L); + JavaRDD trainingData = splits[0].cache(); + JavaRDD testData = splits[1]; + + // 6. Modeling + // 6.1. Model Training + LogisticRegressionModel model = new LogisticRegressionWithLBFGS().setNumClasses(3) + .run(trainingData.rdd()); + // 6.2. Model Evaluation + JavaPairRDD predictionAndLabels = testData.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label())); + MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); + double accuracy = metrics.accuracy(); + System.out.println("Model Accuracy on Test Data: " + accuracy); + + // 7. Model Saving and Loading + // 7.1. Model Saving + model.save(sc.sc(), "model\\logistic-regression"); + // 7.2. Model Loading + LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc.sc(), "model\\logistic-regression"); + // 7.3. Prediction on New Data + Vector newData = Vectors.dense(new double[] { 1, 1, 1, 1 }); + double prediction = sameModel.predict(newData); + System.out.println("Model Prediction on New Data = " + prediction); + + // 8. Clean-up + sc.close(); + } + +}