$ pip install pyspark
$ pyspark
$ docker run -it --rm spark:python3 /opt/spark/bin/pyspark
# Read a JSON file into a DataFrame
df = spark.read.json("logs.json")
# Filter rows and select a nested field
df.where("age > 21").select("name.first").show()
from pyspark.ml.regression import RandomForestRegressor

# Every record contains a label and feature vector
df = spark.createDataFrame(data, ["label", "features"])
# Split the data into train/test datasets
train_df, test_df = df.randomSplit([.80, .20], seed=42)
# Set hyperparameters for the algorithm
rf = RandomForestRegressor(numTrees=100)
# Fit the model to the training data
model = rf.fit(train_df)
# Generate predictions on the test dataset
model.transform(test_df).show()
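To gauge how well the fitted model generalizes, a quick follow-up can score the held-out split. This is a minimal sketch assuming MLlib's default "label" and "prediction" column names.
# A minimal sketch, assuming the default "label" and "prediction" column names
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
print("Test RMSE:", evaluator.evaluate(model.transform(test_df)))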
# Read the CSV, inferring column types so numeric comparisons work as expected
df = spark.read.csv("accounts.csv", header=True, inferSchema=True)
# Select a subset of features and filter for balance > 0
filtered_df = df.select("AccountBalance", "CountOfDependents").filter("AccountBalance > 0")
# Generate summary statistics
filtered_df.summary().show()
$ docker run -it --rm spark /opt/spark/bin/spark-sql
spark-sql>
SELECT
name.first AS first_name,
name.last AS last_name,
age
FROM json.`logs.json`
WHERE age > 21;
$ docker run -it --rm spark /opt/spark/bin/spark-shell
scala>
val df = spark.read.json("logs.json")
df.where("age > 21")
.select("name.first").show()
$ docker run -it --rm spark /opt/spark/bin/spark-shell
scala>
Dataset<Row> df = spark.read().json("logs.json");
df.where("age > 21")
.select("name.first").show();
$ docker run -it --rm spark:r /opt/spark/bin/sparkR
>
df <- read.json(path = "logs.json")
df <- filter(df, df$age > 21)
head(select(df, df$name.first))
Spark SQL adapts the execution plan at runtime, for example by automatically choosing the number of reducers and the join algorithm.
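Adaptive query execution is driven by configuration. As a rough sketch, the Spark 3.x settings below control it; exact defaults vary by release.
# Adaptive query execution (on by default in recent Spark 3.x releases)
spark.conf.set("spark.sql.adaptive.enabled", "true")
# Coalesce shuffle partitions (the "number of reducers") at runtime
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
# Mitigate skewed joins by splitting oversized partitions at runtime
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")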
Use the same SQL you’re already comfortable with.
Spark SQL works on structured tables as well as semi-structured and unstructured data such as JSON files or images.
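As a sketch of that range, the same DataFrame reader handles both; the file paths below are placeholders.
# Semi-structured JSON is read into a DataFrame with an inferred schema
logs = spark.read.json("logs.json")
# The built-in image data source loads image files into a DataFrame
images = spark.read.format("image").load("images/")
images.select("image.origin", "image.width", "image.height").show()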