'''
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
'''
'''
{
"Fare": 71.2833,
"Name": "Cumings, Mrs. John Bradley (Florence Briggs Thayer)",
"Embarked": "C",
"Age": 38,
"Parch": 0,
"Pclass": 1,
"Sex": "female",
"Survived": 1,
"SibSp": 1,
"PassengerId": 2,
"Ticket": "PC 17599",
"Cabin": "C85"
}
'''
# AVRO SCHEMA
{
"namespace": "avro.example.titanic",
"type": "record",
"name": "demo",
"doc": "Titanic data set",
"fields": [
{"name": "PassengerId", "type": "int", "doc": "Passenger ID"},
{"name": "Survived", "type": "int", "doc": "Survival (0 = No; 1 = Yes)"},
{"name": "Pclass", "type": "int", "doc": "Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)"},
{"name": "Name", "type": "string", "doc": "Name"},
{"name": "Sex", "type": "string", "doc": "Sex"},
{"name": "Age", "type": "int", "doc": "Age"},
{"name": "SibSp", "type": "int", "doc": "Number of Siblings/Spouses Aboard"},
{"name": "Parch", "type": "int", "doc": "Number of Parents/Children Aboard"},
{"name": "Ticket", "type": "string", "doc": "Ticket Number"},
{"name": "Fare", "type": "float", "doc": "Passenger Fare"},
{"name": "Cabin", "type": "string", "doc": "Cabin"},
{"name": "Embarked", "type": {"type": "enum", "name": "EmbarkedPort", "symbols": ["C","S","Q"]}, "doc": "Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)"}
]
}
import avro.schema
import io, random
from avro.io import DatumWriter, BinaryEncoder
from avro.datafile import DataFileReader, DataFileWriter
# Path to the Titanic Avro schema (.avsc) file.
schema_path = "schemas/titanic.avsc"
# Read the schema text inside a context manager so the file handle is
# closed as soon as the schema has been parsed.
with open(schema_path) as schema_file:
    schema = avro.schema.parse(schema_file.read())
# Avro container files are binary, so the output must be opened in "wb" mode;
# text mode fails on Python 3 and can mangle bytes on some platforms.
writer = DataFileWriter(open("avroFiles/titanic.avro", "wb"), DatumWriter(), schema, codec="null")
try:
    # Each appended dict must match the fields declared in the schema.
    writer.append({"Fare": 71.2833, "Name": "Cumings, Mrs. John Bradley (Florence Briggs Thayer)", "Embarked": "C", "Age": 38, "Parch": 0, "Pclass": 1, "Sex": "female", "Survived": 1, "SibSp": 1, "PassengerId": 2, "Ticket": "PC 17599", "Cabin": "C85"})
    writer.append({"Fare": 7.925, "Name": "Heikkinen, Miss. Laina", "Embarked": "S", "Age": 26, "Parch": 0, "Pclass": 3, "Sex": "female", "Survived": 1, "SibSp": 0, "PassengerId": 3, "Ticket": "STON/O2. 3101282", "Cabin": ""})
finally:
    # Close the writer even if an append fails, so it is never left open.
    writer.close()
# Schema Version1
{
"type": "record",
"name": "Employee",
"fields": [
{"name": "name", "type": "string"},
{"name": "age", "type": "int"},
{"name": "emails", "type": {"type": "array", "items": "string"}},
{"name": "boss", "type": ["Employee","null"]}
]
}
# Schema Version2
{
"type": "record",
"name": "Employee",
"fields": [
{"name": "name", "type": "string"},
{"name": "yrs", "type": "int", "aliases": ["age"]},
{"name": "gender", "type": "string", "default":"unknown"},
{"name": "emails", "type": {"type": "array", "items": "string"}}
]
}
New Schema | Writer | Reader | Action |
---|---|---|---|
Added Field | Old | New | Reader uses default value |
Added Field | New | Old | Reader ignores it. |
Remove Field | Old | New | Reader ignores the removed field |
Remove Field | New | Old | Reader can read the data only if the field has a default value; otherwise it is an error. |
Change Name | Old | New | Define Aliases. |
For best results, always provide a default value for the fields in your schema. This makes it possible to delete fields later on if you decide it is necessary. If you do not provide a default value for a field, you cannot delete that field from your schema.
You cannot change a field's data type. If you have decided that a field should be some data type other than what it was originally created using, then add a whole new field to your schema that uses the appropriate data type.
You cannot rename an existing field. However, if you want to access the field by some name other than what it was originally created using, add and use aliases for the field.