
Commit 90eccfd

Luis Yamada authored and committed
(develop): prep env to run spark over k8s cluster
1 parent c8e6a19 commit 90eccfd

File tree

15 files changed: +1079 -0 lines changed


.DS_Store

8 KB
Binary file not shown.

.gitignore

+15
@@ -0,0 +1,15 @@
# Whole Spark files
devops/spark/

# IntelliJ
.idea
*.ipr
*.iml
*.iws

# Maven
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
release.properties

devops/.DS_Store

8 KB
Binary file not shown.

devops/Dockerfile

+48
@@ -0,0 +1,48 @@
# Java 8
# ARG java_image_tag=8-jre-slim

# Java 11
ARG java_image_tag=11.0.8-jre-slim
ARG spark_uid=185

FROM openjdk:${java_image_tag}

RUN set -ex && \
    sed -i 's/http:\/\/deb.\(.*\)/https:\/\/deb.\1/g' /etc/apt/sources.list && \
    apt-get update && \
    ln -s /lib /lib64 && \
    apt install -y bash tini libc6 libpam-modules krb5-user libnss3 procps && \
    mkdir -p /opt/spark && \
    mkdir -p /opt/spark/examples && \
    mkdir -p /opt/spark/work-dir && \
    touch /opt/spark/RELEASE && \
    rm /bin/sh && \
    ln -sv /bin/bash /bin/sh && \
    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
    rm -rf /var/cache/apt/*


COPY spark/dist/jars /opt/spark/jars
COPY spark/dist/bin /opt/spark/bin
COPY spark/dist/sbin /opt/spark/sbin
COPY spark/dist/kubernetes/dockerfiles/spark/entrypoint.sh /opt/
COPY spark/dist/kubernetes/dockerfiles/spark/decom.sh /opt/
COPY spark/dist/examples /opt/spark/examples
# ------------------
# insert my custom jars
COPY custom_jars/ /opt/spark/examples/
# ------------------
COPY spark/dist/kubernetes/tests /opt/spark/tests
COPY spark/dist/data /opt/spark/data

ENV SPARK_HOME /opt/spark

WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
RUN chmod a+x /opt/decom.sh

ENTRYPOINT [ "/opt/entrypoint.sh" ]

# Specify the User that the actual main process will run as
USER ${spark_uid}
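The java_image_tag and spark_uid ARGs above can be overridden at build time instead of editing the Dockerfile (the commented-out 8-jre-slim line hints at this). A minimal, hedged sketch, run from the devops/ directory against the minikube Docker daemon as img_builder.sh does; the spark:java8 tag is just an illustrative name:

#!/bin/bash
# Build inside the minikube Docker daemon so the cluster can use the image locally
eval $(minikube docker-env)

# Default build: Java 11 base image, Spark user UID 185
docker build . -t spark:latest

# Illustrative variant: switch to the Java 8 base image via the java_image_tag ARG
docker build . -t spark:java8 --build-arg java_image_tag=8-jre-slim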

devops/custom_jars/spark_sql_project/Datasets/Valorant.csv

+790
Large diffs are not rendered by default.
Binary file not shown.
Binary file not shown.

devops/img_builder.sh

+5
@@ -0,0 +1,5 @@
#!/bin/bash
echo "building new image for spark:latest"
eval $(minikube docker-env)
docker build . -t spark:latest
echo "image built"

devops/k8s_svc_accounts.sh

+4
@@ -0,0 +1,4 @@
#!/bin/bash
echo "creating service account and rolebind"
kubectl create serviceaccount spark
kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default

devops/setup.md

+20
@@ -0,0 +1,20 @@
# Setup Spark (it may take several minutes)
$ git clone https://github.com/apache/spark.git
$ cd spark
$ dev/make-distribution.sh -Pkubernetes

# Setup and start minikube
$ minikube start --memory 6144 --cpus 4

# Building the Spark image
You can run "img_builder.sh" or execute the commands below:
$ eval $(minikube docker-env)
$ docker build . -t spark:latest

# Creating the service account and role binding
You can run "k8s_svc_accounts.sh" or execute the commands below:
$ kubectl create serviceaccount spark
$ kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default

# Now you're good to go!
Now you can run spark-submit against your k8s cluster (minikube).
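Before running spark-submit you also need the Kubernetes API server address for the --master k8s://https://<host:port> URL (sql_project.sh below assumes 192.168.64.6:8443). A hedged sketch of how to look it up on your own minikube and double-check the service account created above:

#!/bin/bash
# Print the cluster endpoints; the control-plane URL is what goes after k8s://
kubectl cluster-info

# Or pull just the API server URL from the current kubeconfig context
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}'; echo

# Confirm the objects created by k8s_svc_accounts.sh exist
kubectl get serviceaccount spark
kubectl get clusterrolebinding spark-role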

devops/spark_ignition/sql_project.sh

+20
@@ -0,0 +1,20 @@
#!/bin/bash
k8s_cluster=192.168.64.6:8443
job_name=spark-lyamada
class=com.lyamada.spark.SparkSQLJob
docker_img=spark:latest
jar_path=opt/spark/examples/spark_sql_project/spark-lyamada-1.0-SNAPSHOT.jar

./spark/bin/spark-submit \
  --master k8s://https://$k8s_cluster \
  --deploy-mode cluster \
  --name $job_name \
  --class $class \
  --conf spark.executor.instances=2 \
  --conf spark.executor.memory=512m \
  --conf spark.driver.memory=1g \
  --conf spark.driver.memoryOverhead=512 \
  --conf spark.kubernetes.container.image=$docker_img \
  --conf spark.kubernetes.container.image.pullPolicy=Never \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
  local:///$jar_path
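In cluster deploy mode the driver runs as a pod on minikube, so the job is monitored with kubectl rather than from the submitting shell. A hedged sketch (Spark labels its pods with spark-role, and the driver pod name is generated at submit time, so <driver-pod> is a placeholder):

#!/bin/bash
# List the driver and executor pods created by the submit above
kubectl get pods -l spark-role=driver
kubectl get pods -l spark-role=executor

# Follow the driver output; substitute the pod name printed by the previous command
kubectl logs -f <driver-pod>

# Remove finished driver pods once you are done inspecting them
kubectl delete pod -l spark-role=driver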

pom.xml

+79
@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.lyamada.spark</groupId>
  <artifactId>spark-lyamada</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spark-lyamada</name>
  <url>https://lyamada.tech.blog/</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.3.0</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
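This pom produces the plain jar referenced by sql_project.sh (target/spark-lyamada-1.0-SNAPSHOT.jar, from the artifactId and version); Spark itself is already on the image, so no shading is needed for these examples. A hedged sketch of building the jar and staging it where the Dockerfile's COPY custom_jars/ step will pick it up (the devops/custom_jars/spark_sql_project/ directory matches the paths used elsewhere in this commit, but verify it in your checkout):

#!/bin/bash
# Build the application jar from the project root
mvn clean package

# Stage it so "COPY custom_jars/ /opt/spark/examples/" places it at
# /opt/spark/examples/spark_sql_project/spark-lyamada-1.0-SNAPSHOT.jar inside the image
cp target/spark-lyamada-1.0-SNAPSHOT.jar devops/custom_jars/spark_sql_project/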
+38
@@ -0,0 +1,38 @@
package com.lyamada.spark;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.SparkSession;

import java.util.ArrayList;
import java.util.List;

public class App
{
    public static void main( String[] args ) throws InterruptedException {
        System.out.println( "Hello World! :)" );

        SparkSession session = SparkSession.builder().appName("spark-lyamada-test").getOrCreate();

        JavaSparkContext sparkContext = new JavaSparkContext(session.sparkContext());

        // Build a small list of integers to distribute across the cluster
        List<Integer> integers = new ArrayList<>();
        for (int counter = 1; counter <= 15; counter++) {
            integers.add(counter);
        }

        // Parallelize into 2 partitions and print each element on the executors
        JavaRDD<Integer> javaRDD = sparkContext.parallelize(integers, 2);
        javaRDD.foreach((VoidFunction<Integer>) integer -> {
            System.out.println("Java RDD: " + integer);
            Thread.sleep(2000);
        });

        // Keep the driver alive for a minute so the pods can be inspected before shutdown
        System.out.println("SaulGoodman ;) - 1m to turn off");

        Thread.sleep(60000);

        sparkContext.stop();
        sparkContext.close();
    }
}
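This App class is the plain RDD counterpart of SparkSQLJob; it can be launched with the same submit as sql_project.sh by swapping the --class (the job name spark-lyamada-rdd below is just an illustrative choice):

#!/bin/bash
# Variation of sql_project.sh pointed at the RDD example class
./spark/bin/spark-submit \
  --master k8s://https://192.168.64.6:8443 \
  --deploy-mode cluster \
  --name spark-lyamada-rdd \
  --class com.lyamada.spark.App \
  --conf spark.executor.instances=2 \
  --conf spark.kubernetes.container.image=spark:latest \
  --conf spark.kubernetes.container.image.pullPolicy=Never \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
  local:///opt/spark/examples/spark_sql_project/spark-lyamada-1.0-SNAPSHOT.jar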
@@ -0,0 +1,40 @@
package com.lyamada.spark;

import org.apache.spark.api.java.function.ForeachPartitionFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.util.Iterator;

public class SparkSQLJob
{
    public static void main( String[] args ) throws InterruptedException {
        System.out.println( "Hello World! :)" );

        SparkSession session = SparkSession.builder().appName("spark-lyamada-test").getOrCreate();

        // Read the user reviews dataset shipped inside the Spark image
        Dataset<Row> df = session
                .read()
                .parquet("/opt/spark/examples/spark_sql_project/Datasets/user_reviews.parquet");

        df.createOrReplaceTempView("reviews");

        // Show the review count per score
        session.sql("SELECT score, count(score) from reviews group by score order by score").show();

        // Print score and review text for every row, partition by partition, on the executors
        df.foreachPartition(new ForeachPartitionFunction<Row>() {
            public void call(Iterator<Row> iterator) throws Exception {
                while (iterator.hasNext()) {
                    Row row = iterator.next();
                    System.out.println("Score: " + row.getString(3) + ", Review: " + row.getString(5));
                }
            }
        });

        // Keep the driver alive for three minutes so the pods can be inspected before shutdown
        System.out.println("Waiting three minutes until the job finishes :)");
        Thread.sleep(180000);

        session.stop();
        session.close();
    }
}
@@ -0,0 +1,20 @@
package com.lyamada.spark;

import static org.junit.Assert.assertTrue;

import org.junit.Test;

/**
 * Unit test for simple App.
 */
public class AppTest
{
    /**
     * Rigorous Test :-)
     */
    @Test
    public void shouldAnswerWithTrue()
    {
        assertTrue( true );
    }
}
