====== Spark HBase ====== import sys import json from pyspark import SparkContext from pyspark.streaming import StreamingContext def SaveRecord(rdd): host = 'sparkmaster.example.com' table = 'cats' keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter" valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter" conf = {"hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table, "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat", "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable", "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"} datamap = rdd.map(lambda x: (str(json.loads(x)["id"]),[str(json.loads(x)["id"]),"cfamily","cats_json",x])) datamap.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv) if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: StreamCatsToHBase.py ") exit(-1) sc = SparkContext(appName="StreamCatsToHBase") ssc = StreamingContext(sc, 1) lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2])) lines.foreachRDD(SaveRecord) ssc.start() # Start the computation ssc.awaitTermination() # Wait for the computation to terminate