I tak skończę pewnie jako kierowca tira albo kasjer w Żabce, ale chętnie bym się dowiedział jak użyć w sparku spark.read.csv(...)
na pliku lokalnym.
Kod na githubie jaki znalazłem ma nieosiągalne zależności i nie jestem w stanie tego odpalić: https://github.com/sryza/aas/tree/master/ch02-intro
Spróbowałem tak:
import org.apache.spark.sql.SparkSession
object Main extends Serializable {
def main(args: Array[String]): Unit = {
val title = "Advance Analitycs with Spark 02";
val pathIn: String = "file:///D:/myfolder/block_1.csv"
val spark = SparkSession.builder
.master("local")
.appName(title)
.getOrCreate()
val csv = spark.read.csv(pathIn)
}
}
pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.julian.spark</groupId>
<artifactId>spark-dataframe-start</artifactId>
<version>1.0-SNAPSHOT</version>
<inceptionYear>2008</inceptionYear>
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.12.1</scala.version>
<scala.compat.version>2.12</scala.compat.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.12.1</version>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.12</artifactId>
<version>3.0.8</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>2.4.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>2.4.4</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.1</version>
<configuration>
<mainClass>App</mainClass>
<scalaCompatVersion>2.11</scalaCompatVersion>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- disable surefire -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.7</version>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
<!-- enable scalatest -->
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
<junitxml>.</junitxml>
<filereports>WDF TestSuite.txt</filereports>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
błąd dostaję taki:
Exception in thread "main" java.lang.NoSuchMethodError: 'boolean scala.util.Properties$.coloredOutputEnabled()'
at scala.reflect.internal.TypeDebugging$typeDebug$.<init>(TypeDebugging.scala:69)
at scala.reflect.internal.SymbolTable.typeDebug$lzycompute$1(SymbolTable.scala:27)
at scala.reflect.internal.SymbolTable.typeDebug(SymbolTable.scala:27)
at scala.reflect.runtime.JavaUniverseForce.force(JavaUniverseForce.scala:67)
at scala.reflect.runtime.JavaUniverseForce.force$(JavaUniverseForce.scala:18)
at scala.reflect.runtime.JavaUniverse.force(JavaUniverse.scala:30)
at scala.reflect.runtime.JavaUniverse.init(JavaUniverse.scala:162)
at scala.reflect.runtime.JavaUniverse.<init>(JavaUniverse.scala:93)
at scala.reflect.runtime.package$.universe$lzycompute(package.scala:29)
at scala.reflect.runtime.package$.universe(package.scala:29)
at org.apache.spark.sql.catalyst.ScalaReflection$.<init>(ScalaReflection.scala:50)
at org.apache.spark.sql.catalyst.ScalaReflection$.<clinit>(ScalaReflection.scala)
at org.apache.spark.sql.catalyst.encoders.RowEncoder$.serializerFor(RowEncoder.scala:74)
at org.apache.spark.sql.catalyst.encoders.RowEncoder$.apply(RowEncoder.scala:61)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:79)
at org.apache.spark.sql.SparkSession.baseRelationToDataFrame(SparkSession.scala:432)
at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.createBaseDataset(CSVDataSource.scala:280)
at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.infer(CSVDataSource.scala:235)
at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:68)
at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:63)
at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$12(DataSource.scala:183)
at scala.Option.orElse(Option.scala:289)
at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:180)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:373)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:618)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:467)
at com.julian.scalastarter.Main$.main(Main.scala:18)
at com.julian.scalastarter.Main.main(Main.scala)