# ====== Import JSON ======
import json

# --- Load every JSON file under raw_data straight into a DataFrame. ---
# NOTE(review): flatMap(json.loads) assumes each file's content parses to an
# iterable of records (e.g. a top-level JSON array) -- confirm the file layout.
df = (sc.wholeTextFiles('/user/yehuda/development/raw_data/*.json')
        .flatMap(lambda x: json.loads(x[1]))
        .toDF())

# --- Same data, but let Spark infer the schema via read.json. ---
# BUG FIX: DataFrameReader.json expects an RDD of JSON *strings*, so pass the
# raw file text (x[1]) instead of pre-parsing it into dicts with json.loads --
# the json.dumps example below shows the string form read.json requires.
jsonRDD = sc.wholeTextFiles("/user/yehuda/development/raw_data/file-*.json").map(lambda x: x[1])
namesJson = sqlContext.read.json(jsonRDD)
# BUG FIX: printSchema is a method; without () it evaluated to the bound
# method object and printed nothing.
namesJson.printSchema()

## see https://stackoverflow.com/questions/46521235/hivecontext-object-has-no-attribute-jsonrdd-spark-2-1-1
import json
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

# Sample records; the second one carries an extra 'd' key, which the explicit
# schema below makes visible (schema inference on the first record would drop it).
r = [{'a': 'aaa', 'b': 'bbb', 'c': 'ccc'},
     {'a': 'aaaa', 'b': 'bbbb', 'c': 'cccc', 'd': 'dddd'}]
r = [json.dumps(d) for d in r]  # read.json consumes JSON strings, not dicts

# known schema -- every column nullable StringType.
# (Renamed the name list so it no longer shadows the StructType built from it.)
field_names = ['a', 'b', 'c', 'd']
fields = [StructField(field_name, StringType(), True) for field_name in field_names]
schema = StructType(fields)

rdd = sc.parallelize(r)
df = sqlContext.read.schema(schema).json(rdd)
df.collect()