cat sql.py

from sql import *
import tempfile, shutil

# Load a local file of newline-delimited JSON strings into a SchemaRDD.
from array import array
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
srdd = sqlCtx.jsonFile('jsonFile')
srdd.collect() == [{"f1": 1, "f2": "row1", "f3": {"field4": 11}},
                   {"f1": 2, "f2": "row2", "f3": {"field4": 22}},
                   {"f1": 3, "f2": "row3", "f3": {"field4": 33}}]

# Infer a schema from an RDD of dicts: array.array values become array
# columns, dicts become map columns.
from array import array
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
rdd = sc.parallelize([
    {"f1": array('i', [1, 2]), "f2": {"row1": 1.0}},
    {"f1": array('i', [2, 3]), "f2": {"row2": 2.0}}])
srdd = sqlCtx.inferSchema(rdd)
srdd.collect() == [{"f1": array('i', [1, 2]), "f2": {"row1": 1.0}},
                   {"f1": array('i', [2, 3]), "f2": {"row2": 2.0}}]

# Nested lists, sets, and tuples are handled by inferSchema as well.
rdd = sc.parallelize([
    {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
    {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)}])
srdd = sqlCtx.inferSchema(rdd)
srdd.collect() == [{"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)},
                   {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)}]

# Round-trip the JSON strings through a temporary file and reload them.
jsonFile
!cat 'jsonFile'
jsonStrings = !cat jsonFile
ofn = open(jsonFile, 'w')
for json in jsonStrings:
    print>>ofn, json
ofn.close()
!cat '/var/folders/dj/92mp96m54d90mdpqlmwbz1m40000gn/T/tmpMFXhlP'
srdd = sqlCtx.jsonFile(jsonFile)
srdd.collect() == [{"f1": 1, "f2": "row1", "f3": {"field4": 11}},
                   {"f1": 2, "f2": "row2", "f3": {"field4": 22}},
                   {"f1": 3, "f2": "row3", "f3": {"field4": 33}}]

# Build a SchemaRDD from a text file, register it as a table, and query it
# with SQL.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
lines = sc.textFile("../../examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: {"name": p[0], "age": int(p[1])})
schemaPeople = sqlContext.inferSchema(people)
schemaPeople.registerAsTable("people")
teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
teenNames = teenagers.map(lambda p: "Name: " + p.name)
for teenName in teenNames.collect():
    print teenName
!cat ../../examples/src/main/resources/people.txt
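
The session above echoes and writes to a jsonFile variable without ever showing its assignment; the tempfile import and the /var/folders/.../tmpMFXhlP path strongly suggest it held the path of a temporary file. A minimal sketch of that missing setup, assuming mkstemp was used (the call is an assumption, not part of the original session):

import os, tempfile

# Assumed setup (not shown in the session): bind jsonFile to the path of a
# fresh temporary file, which on OS X lands under /var/folders/... as above.
fd, jsonFile = tempfile.mkstemp()
os.close(fd)  # keep only the path; the session reopens it with open(jsonFile, 'w')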
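
The JSON examples stop at collect(), but a SchemaRDD produced by jsonFile can be registered and queried exactly like schemaPeople in the last example. A minimal sketch, assuming the table name jsonTable is free and that this version of Spark SQL accepts the dotted syntax for the nested field f3.field4:

# Assumed follow-up: query the JSON-derived SchemaRDD with SQL.
srdd.registerAsTable("jsonTable")
rows = sqlCtx.sql("SELECT f2, f3.field4 FROM jsonTable WHERE f1 >= 2")
for row in rows.collect():
    print row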