# Import JSON
import json

# Build a DataFrame from every *.json file under raw_data.
# wholeTextFiles yields (path, content) pairs; parse each file's content
# (assumed to be a JSON array of records — TODO confirm) and flatten.
df = sc.wholeTextFiles('/user/yehuda/development/raw_data/*.json').flatMap(lambda x: json.loads(x[1])).toDF()

# sqlContext.read.json expects an RDD of JSON *strings*, so keep the raw
# file contents rather than pre-parsing them with json.loads.
jsonRDD = sc.wholeTextFiles("/user/yehuda/development/raw_data/file-*.json").map(lambda x: x[1])
namesJson = sqlContext.read.json(jsonRDD)
# printSchema is a method — the bare attribute access did nothing.
namesJson.printSchema()
## see https://stackoverflow.com/questions/46521235/hivecontext-object-has-no-attribute-jsonrdd-spark-2-1-1
import json
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

# Sample records; the second one carries an extra key 'd'.
records = [{'a': 'aaa', 'b': 'bbb', 'c': 'ccc'},
           {'a': 'aaaa', 'b': 'bbbb', 'c': 'cccc', 'd': 'dddd'}]
json_strings = [json.dumps(record) for record in records]

# Known, explicit schema: every column is a nullable string, so records
# missing 'd' simply get null in that column.
field_names = ['a', 'b', 'c', 'd']
schema = StructType([StructField(name, StringType(), True) for name in field_names])

rdd = sc.parallelize(json_strings)
df = sqlContext.read.schema(schema).json(rdd)
df.collect()
# Back to top