<?xml version="1.0" encoding="UTF-8"?>
<!-- generator="FeedCreator 1.8" -->
<?xml-stylesheet href="https://wiki.korotkin.co.il/lib/exe/css.php?s=feed" type="text/css"?>
<rdf:RDF
    xmlns="http://purl.org/rss/1.0/"
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
    xmlns:dc="http://purl.org/dc/elements/1.1/">
    <channel rdf:about="https://wiki.korotkin.co.il/feed.php">
        <title>KB - kb:bigdata:spark</title>
        <description></description>
        <link>https://wiki.korotkin.co.il/</link>
        <image rdf:resource="https://wiki.korotkin.co.il/_media/wiki/logo.png" />
        <dc:date>2026-04-09T23:50:14+00:00</dc:date>
        <items>
            <rdf:Seq>
                <rdf:li rdf:resource="https://wiki.korotkin.co.il/kb/bigdata/spark/import_json?rev=1641225830&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.korotkin.co.il/kb/bigdata/spark/ml_save_and_load_model?rev=1641225830&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.korotkin.co.il/kb/bigdata/spark/pyspark_and_hbase?rev=1641225830&amp;do=diff"/>
                <rdf:li rdf:resource="https://wiki.korotkin.co.il/kb/bigdata/spark/spark_scala_and_maven_pom.xml?rev=1641225830&amp;do=diff"/>
            </rdf:Seq>
        </items>
    </channel>
    <image rdf:about="https://wiki.korotkin.co.il/_media/wiki/logo.png">
        <title>KB</title>
        <link>https://wiki.korotkin.co.il/</link>
        <url>https://wiki.korotkin.co.il/_media/wiki/logo.png</url>
    </image>
    <item rdf:about="https://wiki.korotkin.co.il/kb/bigdata/spark/import_json?rev=1641225830&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-01-03T16:03:50+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>import_json</title>
        <link>https://wiki.korotkin.co.il/kb/bigdata/spark/import_json?rev=1641225830&amp;do=diff</link>
        <description>Import JSON


import json


# import json
df = sc.wholeTextFiles(&#039;/user/yehuda/development/raw_data/*.json&#039;).flatMap(lambda x: json.loads(x[1])).toDF()




jsonRDD = sc.wholeTextFiles(&quot;/user/yehuda/development/raw_data/file-*.json&quot;).map(lambda x: json.loads(x[1]))
namesJson = sqlContext.read.json(jsonRDD)
namesJson.printSchema



## see https://stackoverflow.com/questions/46521235/hivecontext-object-has-no-attribute-jsonrdd-spark-2-1-1
import json
from pyspark.sql.types import StructField, Struc…</description>
    </item>
    <item rdf:about="https://wiki.korotkin.co.il/kb/bigdata/spark/ml_save_and_load_model?rev=1641225830&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-01-03T16:03:50+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>ml_save_and_load_model</title>
        <link>https://wiki.korotkin.co.il/kb/bigdata/spark/ml_save_and_load_model?rev=1641225830&amp;do=diff</link>
        <description>ML Save and Load model

Train &amp; Save


from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.session import SparkSession

# Create a local StreamingContext with two working thread and batch interval of 1 second
sc = SparkContext(&quot;local[2]&quot;, &quot;NetworkWordCountML&quot;)
ssc = StreamingContext(sc, 1)
spark = SparkSession(sc…</description>
    </item>
    <item rdf:about="https://wiki.korotkin.co.il/kb/bigdata/spark/pyspark_and_hbase?rev=1641225830&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-01-03T16:03:50+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>pyspark_and_hbase</title>
        <link>https://wiki.korotkin.co.il/kb/bigdata/spark/pyspark_and_hbase?rev=1641225830&amp;do=diff</link>
        <description>Spark HBase


import sys  
import json  
from pyspark import SparkContext  
from pyspark.streaming import StreamingContext  
  
  
def SaveRecord(rdd):  
  host = &#039;sparkmaster.example.com&#039;  
  table = &#039;cats&#039;  
  keyConv = &quot;org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter&quot;  
  valueConv = &quot;org.apache.spark.examples.pythonconverters.StringListToPutConverter&quot;  
  conf = {&quot;hbase.zookeeper.quorum&quot;: host,  
      &quot;hbase.mapred.outputtable&quot;: table,  
      &quot;mapreduce.o…</description>
    </item>
    <item rdf:about="https://wiki.korotkin.co.il/kb/bigdata/spark/spark_scala_and_maven_pom.xml?rev=1641225830&amp;do=diff">
        <dc:format>text/html</dc:format>
        <dc:date>2022-01-03T16:03:50+00:00</dc:date>
        <dc:creator>Anonymous (anonymous@undisclosed.example.com)</dc:creator>
        <title>spark_scala_and_maven_pom.xml</title>
        <link>https://wiki.korotkin.co.il/kb/bigdata/spark/spark_scala_and_maven_pom.xml?rev=1641225830&amp;do=diff</link>
        <description>Spark Scala and Maven pom.xml


&lt;project xmlns=&quot;http://maven.apache.org/POM/4.0.0&quot; xmlns:xsi=&quot;http://www.w3.org/2001/XMLSchema-instance&quot;
	xsi:schemaLocation=&quot;http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd&quot;&gt;
	&lt;modelVersion&gt;4.0.0&lt;/modelVersion&gt;
	&lt;groupId&gt;com.alefbt.bigdata.ml&lt;/groupId&gt;
	&lt;artifactId&gt;mlserv&lt;/artifactId&gt;
	&lt;version&gt;0.0.1-SNAPSHOT&lt;/version&gt;
	&lt;name&gt;MLServe&lt;/name&gt;



	&lt;properties&gt;
		&lt;maven.compiler.source&gt;1.8&lt;/maven.compiler.source&gt;
		&lt;maven.compiler.target…</description>
    </item>
</rdf:RDF>
