%pip install --upgrade sqlite_utils
%pip install --upgrade csvs-to-sqlite
sqlite_utils
magic¶Wondering: what might sqlite_utils
magic look like, and why should we bother?
One way for it to look would be to start to replicate the CLI.
As to why bother? Maybe folk are using a notebook but they aren't a Python programmer, or maybe they just want some shortcut, bespoke magic to do common operations in a CLI rather than a Python function way.
Along the way, we might as well make use of other tools in @simonw's -to-sqlite
universe.
Let's have a quick hack at some magic..
from IPython.core.magic import magics_class, line_magic, cell_magic, line_cell_magic, Magics
from IPython.core import magic_arguments
from sqlite_utils import Database
from io import StringIO
import pandas as pd
import json
import requests
@magics_class
class SQLite_Utils(Magics):
def clean_string(self, string):
"""Clean string argument."""
if isinstance(string, str):
string = string.strip("\"\'")
return f'{string}'
return string
def download_file(self, url, filename=None, chunk_size=8192):
"""Download file."""
# We stream in case the file is a large one...
# Should we download to a temporary file?
# Or is persistence good for audit / if anything goes wrong?
fn = url.split('/')[-1] if not filename else filename
with requests.get(url, stream=True) as r:
#r.raise_for_status() # Error reporting
with open(fn, 'wb') as f:
for chunk in r.iter_content(chunk_size=chunk_size):
f.write(chunk)
return fn
# We'll simplify things for now by saying you MUST call this first
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--dbvar", "-d", default='_sqlite_utils_db', help="DB var.")
@magic_arguments.argument("--memory", "-m", action='store_true', help="DB var.")
@magic_arguments.argument("--recreate", "-r", action='store_true', help="DB var.")
@magic_arguments.argument(
"--filename", "-f", default="sqlite_utils_magic.db", help="SQLite database pathname.")
def sql_utils_create(self, line):
"Create a database."
args = magic_arguments.parse_argstring(self.sql_utils_create, line)
# TO DO - the --recreate flag seems to create a read only database?
if args.memory:
# This is a bit of trickery - keep the db reference in a variable
# in the parent Python environment
self.shell.user_ns[args.dbvar] = Database(memory=True, recreate=args.recreate)
else:
self.shell.user_ns[args.dbvar] = Database(args.filename, recreate=args.recreate)
self.shell.user_ns['_sql_utils_magic_db_path'] =args.filename
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--dbvar", "-d", default='_sqlite_utils_db', help="DB var.")
@magic_arguments.argument("--table", "-t", default=None, help="Table name.")
@magic_arguments.argument("--columns", "-c", action='store_true', help="Show columns.")
def sql_utils_tables(self, line):
"""Show tables."""
args = magic_arguments.parse_argstring(self.sql_utils_tables, line)
db = self.shell.user_ns[args.dbvar]
if args.table and args.columns:
return db[args.table].columns
return db.table_names()
@line_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--dbvar", "-d", default='_sqlite_utils_db', help="DB var.")
def sql_utils_views(self, line):
"""Show views."""
args = magic_arguments.parse_argstring(self.sql_utils_views, line)
db = self.shell.user_ns[args.dbvar]
return db.view_names()
# Remember for now we have simplified by requirig the db has been set up
@line_cell_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--csv", "-c", default=None, help="CSV path.")
@magic_arguments.argument("--filename", "-f", default=None, help="Filename.")
def sql_utils_import_csv(self, line, cell=None):
"""Import CSV file."""
args = magic_arguments.parse_argstring(self.sql_utils_import_csv, line)
if not args.csv or '_sql_utils_magic_db_path' not in self.shell.user_ns:
return
if args.csv.startswith('http'):
# It's a URL... so download it...
fn = self.download_file(args.csv, args.filename)
# Given we're in magic land!
db_path = self.shell.user_ns['_sql_utils_magic_db_path']
!csvs-to-sqlite {fn} {db_path}
@cell_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--dbvar", "-d", default='_sqlite_utils_db', help="DB var.")
@magic_arguments.argument("--table", "-t", default=None, help="Table name.")
def sql_utils_add_rows(self, line, cell):
"""Add rows from JSON."""
args = magic_arguments.parse_argstring(self.sql_utils_add_rows, line)
if not args.table:
return
db = self.shell.user_ns[args.dbvar]
df = pd.DataFrame(StringIO(cell))
df = pd.json_normalize(df[0].str.strip().apply(json.loads))
db[args.table].insert_all(df.to_dict(orient='records'))
@line_cell_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--dbvar", "-d", default='_sqlite_utils_db', help="DB var.")
@magic_arguments.argument("--create", "-k", action='store_true', help="Create.")
@magic_arguments.argument("--table", "-t", default=None, help="Table name.")
@magic_arguments.argument("--tabledef", "-T", default=None, help="Table definition.")
@magic_arguments.argument("--pk", "-p", default=None, help="Primary key.")
@magic_arguments.argument("--fk", "-f", default=None, help="Foreign keys.")
@magic_arguments.argument("--columnorder", "-c", default=None, help="Column order.")
@magic_arguments.argument("--notnull", "-n", default=None, help="Not null.")
def sql_utils_table(self, line, cell=None):
args = magic_arguments.parse_argstring(self.sql_utils_table, line)
db = self.shell.user_ns[args.dbvar]
tabledef = self.clean_string(args.tabledef)
pk = self.clean_string(args.pk)
fk = self.clean_string(args.fk)
columnorder = self.clean_string(args.columnorder)
if columnorder:
columnorder = tuple([f"{c.strip()}" for c in columnorder.split(',')])
notnull = self.clean_string(args.notnull)
if notnull:
notnull = set([f"{n.strip()}" for n in notnull.split(',')])
if cell is None and args.table:
if args.create:
db[args.table].create(tabledef,
pk=pk,
foreign_keys=fk,
column_order=columnorder,
not_null=notnull
)
else:
db.table(args.table,
pk=pk,
foreign_keys=fk,
column_order=columnorder,
not_null=notnull
)
else:
db[args.table].create(eval(cell),
pk=pk,
foreign_keys=fk,
column_order=columnorder,
not_null=notnull
)
@line_cell_magic
@magic_arguments.magic_arguments()
@magic_arguments.argument("--dbvar", "-d", default='_sqlite_utils_db', help="DB var.")
@magic_arguments.argument("--table", "-t", default=None, help="Table name.")
@magic_arguments.argument("--select", "-s", default=None, help="Query.")
@magic_arguments.argument("--where", "-w", default=None, help="Where.")
@magic_arguments.argument("--orderby", "-o", default=None, help="Order by.")
@magic_arguments.argument("--generator", "-G", default=None, help="Return generator.")
@magic_arguments.argument("--outvar", "-O", default=None, help="Output variable.")
@magic_arguments.argument("--query", "-q", default=None, help="Open query.")
@magic_arguments.argument("--get", "-g", default=None, help="Get record by primary key.")
def sql_utils_query(self, line, cell=None):
args = magic_arguments.parse_argstring(self.sql_utils_query, line)
db = self.shell.user_ns[args.dbvar]
if cell is None:
# Allow a quoted string to be passed via -s
if args.table:
if args.get:
# Does this need heuristic typecasting?
# Or can we pass vars that retain type?
return db[args.table].get(args.get)
else:
select = self.clean_string(args.select)
where = self.clean_string(args.where)
orderby = self.clean_string(args.orderby)
resp = db[args.table].rows_where(where=where,
select=select,
order_by=orderby)
if args.generator:
return resp
df = pd.DataFrame(resp)
if args.outvar:
self.shell.user_ns[args.outvar] = df
return pd.DataFrame(df)
elif args.query:
# TO DO - DRY - we should be able to abstract most
# of this into a separate utility function
query = self.clean_string(args.query)
df = pd.DataFrame(db.execute(query).fetchall())
if args.outvar:
self.shell.user_ns[args.outvar] = df
return pd.DataFrame(df)
else:
# For now assume this is just a query
df = pd.DataFrame(db.execute(cell).fetchall())
if args.outvar:
self.shell.user_ns[args.outvar] = df
return pd.DataFrame(df)
ip = get_ipython()
ip.register_magics(SQLite_Utils)
Start off by creating a db - at the moment this is a MUST first step.
Known issues:
recreate
seems to result in a read-only db?#%load_ext SQLite_Utils
%sql_utils_create --filename mydemo1.db
We actually have a reference to the database:
_sqlite_utils_db
<Database <sqlite3.Connection object at 0x116cd38f0>>
We can add some rows to a table:
%%sql_utils_add_rows -t test1
{"id": 1, "name": "Cleo", "twitter": "cleopaws", "age": 3,"is_good_dog": true}
{"id": 2, "name": "Marnie", "twitter": "MarnieTheDog", "age": 16,"is_good_dog": true}
{"id": 3, "name": "Cleo2", "twitter": "cleopaws2", "age": 3,"is_good_dog": true}
{"id": 4, "name": "Marnie2", "twitter": "MarnieTheDog2", "age": 16,"is_good_dog": true}
We can list the tables:
%sql_utils_tables
['test1', '?f=csv', 'mps']
We can query tables:
%sql_utils_query -t test1 -s name,age
name | age | |
---|---|---|
0 | Cleo | 3 |
1 | Marnie | 16 |
2 | Cleo2 | 3 |
3 | Marnie2 | 16 |
4 | Cleo | 3 |
5 | Marnie | 16 |
6 | Cleo2 | 3 |
7 | Marnie2 | 16 |
8 | Cleo | 3 |
9 | Marnie | 16 |
10 | Cleo2 | 3 |
11 | Marnie2 | 16 |
12 | Cleo | 3 |
13 | Marnie | 16 |
14 | Cleo2 | 3 |
15 | Marnie2 | 16 |
We can have more elaborate queries:
%sql_utils_query -t test1 -s 'name, age'
name | age | |
---|---|---|
0 | Cleo | 3 |
1 | Marnie | 16 |
2 | Cleo2 | 3 |
3 | Marnie2 | 16 |
4 | Cleo | 3 |
5 | Marnie | 16 |
6 | Cleo2 | 3 |
7 | Marnie2 | 16 |
8 | Cleo | 3 |
9 | Marnie | 16 |
10 | Cleo2 | 3 |
11 | Marnie2 | 16 |
12 | Cleo | 3 |
13 | Marnie | 16 |
14 | Cleo2 | 3 |
15 | Marnie2 | 16 |
We can import data from a CSV file downloaded from a URL:
%sql_utils_import_csv --csv https://www.theyworkforyou.com/mps/?f=csv --filename mps.csv
Loaded 1 dataframes /usr/local/lib/python3.7/site-packages/pandas/core/generic.py:2612: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores. method=method, Added 1 CSV file to mydemo1.db
Recheck the tables:
%sql_utils_tables
['test1', '?f=csv', 'mps']
%sql_utils_query -t mps -s '*'
Person ID | First name | Last name | Party | Constituency | URI | |
---|---|---|---|---|---|---|
0 | 10001 | Diane | Abbott | Labour | Hackney North and Stoke Newington | https://www.theyworkforyou.com/mp/10001/diane_... |
1 | 25034 | Debbie | Abrahams | Labour | Oldham East and Saddleworth | https://www.theyworkforyou.com/mp/25034/debbie... |
2 | 24878 | Nigel | Adams | Conservative | Selby and Ainsty | https://www.theyworkforyou.com/mp/24878/nigel_... |
3 | 25661 | Bim | Afolami | Conservative | Hitchin and Harpenden | https://www.theyworkforyou.com/mp/25661/bim_af... |
4 | 11929 | Adam | Afriyie | Conservative | Windsor | https://www.theyworkforyou.com/mp/11929/adam_a... |
... | ... | ... | ... | ... | ... | ... |
645 | 11791 | Jeremy | Wright | Conservative | Kenilworth and Southam | https://www.theyworkforyou.com/mp/11791/jeremy... |
646 | 25649 | Mohammad | Yasin | Labour | Bedford | https://www.theyworkforyou.com/mp/25649/mohamm... |
647 | 25806 | Jacob | Young | Conservative | Redcar | https://www.theyworkforyou.com/mp/25806/jacob_... |
648 | 24822 | Nadhim | Zahawi | Conservative | Stratford-on-Avon | https://www.theyworkforyou.com/mp/24822/nadhim... |
649 | 25386 | Daniel | Zeichner | Labour | Cambridge | https://www.theyworkforyou.com/mp/25386/daniel... |
650 rows × 6 columns
Run an arbitrary query:
df = %sql_utils_query -q "SELECT * FROM mps LIMIT 3;"
df
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 10001 | Diane | Abbott | Labour | Hackney North and Stoke Newington | https://www.theyworkforyou.com/mp/10001/diane_... |
1 | 25034 | Debbie | Abrahams | Labour | Oldham East and Saddleworth | https://www.theyworkforyou.com/mp/25034/debbie... |
2 | 24878 | Nigel | Adams | Conservative | Selby and Ainsty | https://www.theyworkforyou.com/mp/24878/nigel_... |
If we run the block magic we assume it's an open query:
%%sql_utils_query --outvar mydf
SELECT *
-- with a comment
FROM mps -- and another comment
LIMIT 10
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 10001 | Diane | Abbott | Labour | Hackney North and Stoke Newington | https://www.theyworkforyou.com/mp/10001/diane_... |
1 | 25034 | Debbie | Abrahams | Labour | Oldham East and Saddleworth | https://www.theyworkforyou.com/mp/25034/debbie... |
2 | 24878 | Nigel | Adams | Conservative | Selby and Ainsty | https://www.theyworkforyou.com/mp/24878/nigel_... |
3 | 25661 | Bim | Afolami | Conservative | Hitchin and Harpenden | https://www.theyworkforyou.com/mp/25661/bim_af... |
4 | 11929 | Adam | Afriyie | Conservative | Windsor | https://www.theyworkforyou.com/mp/11929/adam_a... |
5 | 25817 | Nickie | Aiken | Conservative | Cities of London and Westminster | https://www.theyworkforyou.com/mp/25817/nickie... |
6 | 24904 | Peter | Aldous | Conservative | Waveney | https://www.theyworkforyou.com/mp/24904/peter_... |
7 | 24958 | Rushanara | Ali | Labour | Bethnal Green and Bow | https://www.theyworkforyou.com/mp/24958/rushan... |
8 | 25888 | Tahir | Ali | Labour | Birmingham, Hall Green | https://www.theyworkforyou.com/mp/25888/tahir_... |
9 | 25337 | Lucy | Allan | Conservative | Telford | https://www.theyworkforyou.com/mp/25337/lucy_a... |
The --outvar
switch lets us assign the result to a variable:
mydf
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 10001 | Diane | Abbott | Labour | Hackney North and Stoke Newington | https://www.theyworkforyou.com/mp/10001/diane_... |
1 | 25034 | Debbie | Abrahams | Labour | Oldham East and Saddleworth | https://www.theyworkforyou.com/mp/25034/debbie... |
2 | 24878 | Nigel | Adams | Conservative | Selby and Ainsty | https://www.theyworkforyou.com/mp/24878/nigel_... |
3 | 25661 | Bim | Afolami | Conservative | Hitchin and Harpenden | https://www.theyworkforyou.com/mp/25661/bim_af... |
4 | 11929 | Adam | Afriyie | Conservative | Windsor | https://www.theyworkforyou.com/mp/11929/adam_a... |
5 | 25817 | Nickie | Aiken | Conservative | Cities of London and Westminster | https://www.theyworkforyou.com/mp/25817/nickie... |
6 | 24904 | Peter | Aldous | Conservative | Waveney | https://www.theyworkforyou.com/mp/24904/peter_... |
7 | 24958 | Rushanara | Ali | Labour | Bethnal Green and Bow | https://www.theyworkforyou.com/mp/24958/rushan... |
8 | 25888 | Tahir | Ali | Labour | Birmingham, Hall Green | https://www.theyworkforyou.com/mp/25888/tahir_... |
9 | 25337 | Lucy | Allan | Conservative | Telford | https://www.theyworkforyou.com/mp/25337/lucy_a... |
We can preconfigure table attributes for when a table is created by an insert:
%sql_utils_table --table "authors" --pk "id" --notnull "name,score" -c "id,name,score,url"
We can explictly create a table:
%%sql_utils_table --table cats --pk "id"
{
"id": int,
"name": str,
"weight": float,
}
We can view table columns:
%sql_utils_tables --table cats -c
[Column(cid=0, name='id', type='INTEGER', notnull=0, default_value=None, is_pk=1), Column(cid=1, name='name', type='TEXT', notnull=0, default_value=None, is_pk=0), Column(cid=2, name='weight', type='FLOAT', notnull=0, default_value=None, is_pk=0)]