cat sql.py

from sql import *
import tempfile, shutil

# Load a local file of newline-delimited JSON strings into a SchemaRDD.
from array import array
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
srdd = sqlCtx.jsonFile('jsonFile')
srdd.collect() == [{"f1": 1, "f2": "row1", "f3": {"field4": 11}},
                   {"f1": 2, "f2": "row2", "f3": {"field4": 22}},
                   {"f1": 3, "f2": "row3", "f3": {"field4": 33}}]

# Infer a schema from an RDD of dicts: array.array values become array
# columns, dicts become map columns.
from array import array
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
rdd = sc.parallelize([
    {"f1": array('i', [1, 2]), "f2": {"row1": 1.0}},
    {"f1": array('i', [2, 3]), "f2": {"row2": 2.0}}])
srdd = sqlCtx.inferSchema(rdd)
srdd.collect() == [{"f1": array('i', [1, 2]), "f2": {"row1": 1.0}},
                   {"f1": array('i', [2, 3]), "f2": {"row2": 2.0}}]

# Nested lists, sets, and tuples are handled by inferSchema as well.
rdd = sc.parallelize([
    {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
    {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)}])
srdd = sqlCtx.inferSchema(rdd)
srdd.collect() == [{"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
                   {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)}]

# Round-trip the JSON strings through a temporary file and reload them.
jsonFile
!cat 'jsonFile'
jsonStrings = !cat jsonFile
ofn = open(jsonFile, 'w')
for json in jsonStrings:
    print>>ofn, json
ofn.close()
!cat '/var/folders/dj/92mp96m54d90mdpqlmwbz1m40000gn/T/tmpMFXhlP'
srdd = sqlCtx.jsonFile(jsonFile)
srdd.collect() == [{"f1": 1, "f2": "row1", "f3": {"field4": 11}},
                   {"f1": 2, "f2": "row2", "f3": {"field4": 22}},
                   {"f1": 3, "f2": "row3", "f3": {"field4": 33}}]

# Build a SchemaRDD from a text file, register it as a table, and query it
# with SQL.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
lines = sc.textFile("../../examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: {"name": p[0], "age": int(p[1])})
schemaPeople = sqlContext.inferSchema(people)
schemaPeople.registerAsTable("people")
teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenNames = teenagers.map(lambda p: "Name: " + p.name)
for teenName in teenNames.collect():
    print teenName
!cat ../../examples/src/main/resources/people.txt
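
The session above echoes and writes to a jsonFile variable without ever showing its assignment; the tempfile import and the /var/folders/.../tmpMFXhlP path strongly suggest it held the path of a temporary file. A minimal sketch of that missing setup, assuming mkstemp was used (the call is an assumption, not part of the original session):

import os, tempfile

# Assumed setup (not shown in the session): bind jsonFile to the path of a
# fresh temporary file, which on OS X lands under /var/folders/... as above.
fd, jsonFile = tempfile.mkstemp()
os.close(fd)  # keep only the path; the session reopens it with open(jsonFile, 'w')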
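
The JSON examples stop at collect(), but a SchemaRDD produced by jsonFile can be registered and queried exactly like schemaPeople in the last example. A minimal sketch, assuming the table name jsonTable is free and that this version of Spark SQL accepts the dotted syntax for the nested field f3.field4:

# Assumed follow-up: query the JSON-derived SchemaRDD with SQL.
srdd.registerAsTable("jsonTable")
rows = sqlCtx.sql("SELECT f2, f3.field4 FROM jsonTable WHERE f1 >= 2")
for row in rows.collect():
    print row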