Importing JSON files into Spark DataFrames (PySpark examples)

import json
 
 
# Read every *.json file in the directory whole (path, contents) pairs,
# parse each file's text as JSON, and flatten the parsed records into a
# DataFrame. Assumes each file contains a JSON array of objects — TODO confirm.
# NOTE(review): toDF() on an RDD of plain dicts relies on schema inference
# from dict keys, which newer Spark versions warn about — verify on your version.
raw_files = sc.wholeTextFiles('/user/yehuda/development/raw_data/*.json')
parsed_records = raw_files.flatMap(lambda path_and_text: json.loads(path_and_text[1]))
df = parsed_records.toDF()
 
 
 
 
# Read each matching file whole, parse its full text as JSON, then let
# Spark build a DataFrame from the parsed records and print its schema.
jsonRDD = sc.wholeTextFiles("/user/yehuda/development/raw_data/file-*.json").map(lambda x: json.loads(x[1]))
namesJson = sqlContext.read.json(jsonRDD)
# Bug fix: printSchema is a method — the original referenced it without
# calling it, so no schema was ever printed.
namesJson.printSchema()
 
 
 
## see https://stackoverflow.com/questions/46521235/hivecontext-object-has-no-attribute-jsonrdd-spark-2-1-1
import json
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

# Two sample records; note the second one carries an extra key 'd'.
r = [
    {'a': 'aaa', 'b': 'bbb', 'c': 'ccc'},
    {'a': 'aaaa', 'b': 'bbbb', 'c': 'cccc', 'd': 'dddd'},
]
# Serialize each record to its own JSON string.
r = [json.dumps(record) for record in r]

# Declare the schema up front: every column is a nullable StringType,
# so the record without 'd' simply gets null in that column.
schema = ['a', 'b', 'c', 'd']
fields = [StructField(column, StringType(), True) for column in schema]
schema = StructType(fields)

rdd = sc.parallelize(r)
df = sqlContext.read.schema(schema).json(rdd)
df.collect()
kb/bigdata/spark/import_json.txt · Last modified: 2022/01/03 16:03 by 127.0.0.1
Back to top
Driven by DokuWiki Recent changes RSS feed Valid CSS Valid XHTML 1.0