Pyparsing - python parsing library

Pavel Tysliatski

Expansa Group

In [1]:
from pyparsing import *
from string import *

ParserElement subclasses

Basic subclasses

Literal
Word
Regex
SkipTo

etc.

Expression subclasses

MatchFirst - |
Or - ^
And - +
Optional
ZeroOrMore
OneOrMore

etc.

Positional subclasses

StringStart
StringEnd
LineStart
LineEnd

Converter subclasses

Suppress

etc.

Parser actions

setParseAction
addParseAction

Basic example

Build a parser for the pattern:

[0-9]
In [2]:
def action(s, loc, toks):
    """Debug parse action: show the three callback arguments and wrap the tokens.

    Prints the label, type and value of ``s``, ``loc`` and ``toks`` (one line
    each), then returns the tokens as a new list enclosed in ``'<'`` and ``'>'``.
    """
    for label, value in (('s', s), ('loc', loc), ('toks', toks)):
        print(label, type(value), value)
    return ['<', *toks, '>']


# Grammar for "[<digits>-<digits>]": both brackets are suppressed from the result,
# and the parse action is attached only to the second Word(digits).
pattern = Suppress('[') + Word(digits) + '-' + Word(digits).setParseAction(action) + Suppress(']')

Parsing methods

parseString

In [3]:
# parseString matches the grammar starting at the beginning of the input.
pattern.parseString('[0-9]')
s <class 'str'> [0-9]
loc <class 'int'> 3
toks <class 'pyparsing.ParseResults'> ['9']
Out[3]:
(['0', '-', '<', '9', '>'], {})
In [4]:
# >>> pattern.parseString('test[0-9]test')
# ParseException: Expected "[" (at char 0), (line:1, col:1)

searchString

In [5]:
# searchString looks for matches anywhere in the input and collects their results.
pattern.searchString('test[0-9]test')
s <class 'str'> test[0-9]test
loc <class 'int'> 7
toks <class 'pyparsing.ParseResults'> ['9']
Out[5]:
([(['0', '-', '<', '9', '>'], {})], {})
In [6]:
# Two occurrences in the input give two results (the parse action runs once per match).
pattern.searchString('test[0-9]test[0-9]')
s <class 'str'> test[0-9]test[0-9]
loc <class 'int'> 7
toks <class 'pyparsing.ParseResults'> ['9']
s <class 'str'> test[0-9]test[0-9]
loc <class 'int'> 16
toks <class 'pyparsing.ParseResults'> ['9']
Out[6]:
([(['0', '-', '<', '9', '>'], {}), (['0', '-', '<', '9', '>'], {})], {})

scanString

In [7]:
# scanString is lazy, so wrap it in list(); each item is (tokens, start, end).
list(pattern.scanString('test[0-9]test'))
s <class 'str'> test[0-9]test
loc <class 'int'> 7
toks <class 'pyparsing.ParseResults'> ['9']
Out[7]:
[((['0', '-', '<', '9', '>'], {}), 4, 9)]
In [8]:
# With two occurrences, each match is reported with its own start/end offsets.
list(pattern.scanString('test[0-9]test[0-9]'))
s <class 'str'> test[0-9]test[0-9]
loc <class 'int'> 7
toks <class 'pyparsing.ParseResults'> ['9']
s <class 'str'> test[0-9]test[0-9]
loc <class 'int'> 16
toks <class 'pyparsing.ParseResults'> ['9']
Out[8]:
[((['0', '-', '<', '9', '>'], {}), 4, 9),
 ((['0', '-', '<', '9', '>'], {}), 13, 18)]

transformString

In [9]:
# transformString returns the input with every match replaced by its resulting tokens.
pattern.transformString('test[0-9]test[0-9]')
s <class 'str'> test[0-9]test[0-9]
loc <class 'int'> 7
toks <class 'pyparsing.ParseResults'> ['9']
s <class 'str'> test[0-9]test[0-9]
loc <class 'int'> 16
toks <class 'pyparsing.ParseResults'> ['9']
Out[9]:
'test0-<9>test0-<9>'

More complex example

Build an iterator over all strings described by a template of the form:

text_block[numeric_block][numeric_block]text_block...

For example:

>>> te\[\]st\\\\[0-9]test[0-9]
te[]st\\0test0
te[]st\\0test1
te[]st\\0test2
...
te[]st\\9test7
te[]st\\9test8
te[]st\\9test9

Parser

In [10]:
# A text block is everything up to the next '[' or the end of the string.
text = SkipTo(StringEnd() | '[')
# A numeric block is "[<digits>-<digits>]"; brackets and dash are suppressed,
# leaving only the two bounds as tokens.
numeric = Suppress('[') + Word(digits) + Suppress('-') + Word(digits) + Suppress(']')
# The whole template is any sequence of text and numeric blocks.
pattern = ZeroOrMore(text | numeric)
In [11]:
def skip_empty(toks):
    """Parse action that rejects a match whose first token is empty.

    Returns None (tokens are left unchanged) when ``toks[0]`` is truthy;
    otherwise raises ParseException so the match is treated as a failure.
    """
    first = toks[0]
    if first:
        return
    raise ParseException('must be not empty')
        

# Attach the guard so that an empty SkipTo result counts as a failed match.
text.addParseAction(skip_empty)
Out[11]:
SkipTo:({StringEnd | "["})
In [12]:
pattern.parseString('test[0-9]test[0-9]')
Out[12]:
(['test', '0', '9', 'test', '0', '9'], {})
In [13]:
# Without escape handling the parse stops at the first (escaped) '['.
# Backslashes are doubled so the literal contains no invalid escape sequences.
pattern.parseString('te\\[\\]st\\\\[0-9]test[0-9]')
Out[13]:
(['te\\'], {})

Escaping

In [14]:
# Escape sequences: a backslash followed by a backslash, '[' or ']' stands for that
# literal character; each alternative is replaced by its unescaped form.
escape = (Literal('\\\\').addParseAction(replaceWith('\\')) | 
          Literal('\\[').addParseAction(replaceWith('[')) |
          Literal('\\]').addParseAction(replaceWith(']')))
# Rebuild `text` so SkipTo ignores escape sequences while looking for '[';
# `pattern` is rebuilt as well because `text` now names a new object.
text = SkipTo(StringEnd() | '[', ignore=escape).setParseAction(skip_empty)
pattern = ZeroOrMore(text | numeric)
In [15]:
# Escaped brackets are now skipped over, but the text token still contains the raw escapes.
# Backslashes are doubled so the literal contains no invalid escape sequences.
pattern.parseString('te\\[\\]st\\\\[0-9]test[0-9]')
Out[15]:
(['te\\[\\]st\\\\', '0', '9', 'test', '0', '9'], {})
In [16]:
def unescape(toks):
    """Parse action: return the tokens with escape sequences replaced.

    Each token is passed through ``escape.transformString`` and the results are
    returned as a new list in the same order.
    """
    unescaped = []
    for item in toks:
        unescaped.append(escape.transformString(item))
    return unescaped


# Add unescape as a further parse action on `text`, after skip_empty.
text.addParseAction(unescape)
Out[16]:
SkipTo:({StringEnd | "["})
In [17]:
# Text tokens now come back with the escape sequences resolved.
# Backslashes are doubled so the literal contains no invalid escape sequences.
pattern.parseString('te\\[\\]st\\\\[0-9]test[0-9]')
Out[17]:
(['te[]st\\', '0', '9', 'test', '0', '9'], {})

Iterator

In [18]:
def text_block(toks):
    """Parse action: replace the matched tokens with a single iterator over them."""
    token_iter = iter(toks)
    return [token_iter]
    
    
def numeric_block(toks):
    """Parse action: turn a ``[from, to]`` token pair into one iterator of strings.

    The two tokens are converted to integers and the returned single-element
    list holds a ``map`` yielding ``str(n)`` for every n from the first bound
    to the second bound, inclusive.
    """
    first, last = toks
    start = int(first)
    stop = int(last) + 1  # +1 makes the upper bound inclusive
    return [map(str, range(start, stop))]


# Append the block converters as further parse actions on the existing elements.
# NOTE(review): these calls modify `text` and `numeric` in place, so re-running
# this cell would presumably add the actions a second time - confirm before re-executing.
text.addParseAction(text_block)
numeric.addParseAction(numeric_block)
Out[18]:
{Suppress:("[") W:(0123...) Suppress:("-") W:(0123...) Suppress:("]")}
In [19]:
# Every block is now represented by an iterator of its possible values.
# Backslashes are doubled so the literal contains no invalid escape sequences.
pattern.parseString('te\\[\\]st\\\\[0-9]test[0-9]')
Out[19]:
([<list_iterator object at 0x7f35380c7fd0>, <map object at 0x7f35381252b0>, <list_iterator object at 0x7f3538125278>, <map object at 0x7f3538100668>], {})
In [20]:
from itertools import product


def iterator(string):
    """Return a generator over every string described by the template ``string``.

    The template is parsed immediately with ``pattern``; the returned generator
    then lazily joins each combination from the Cartesian product of the
    parsed blocks.
    """
    blocks = pattern.parseString(string)
    combinations = product(*blocks)
    return (''.join(combo) for combo in combinations)

Test

In [21]:
# Two [0-9] ranges give 10 * 10 combinations.
# Backslashes are doubled so the literal contains no invalid escape sequences.
len(list(iterator('te\\[\\]st\\\\[0-9]test[0-9]')))
Out[21]:
100
In [22]:
# First three generated strings.
# Backslashes are doubled so the literal contains no invalid escape sequences.
list(iterator('te\\[\\]st\\\\[0-9]test[0-9]'))[:3]
Out[22]:
['te[]st\\0test0', 'te[]st\\0test1', 'te[]st\\0test2']
In [23]:
# Last three generated strings.
# Backslashes are doubled so the literal contains no invalid escape sequences.
list(iterator('te\\[\\]st\\\\[0-9]test[0-9]'))[-3:]
Out[23]:
['te[]st\\9test7', 'te[]st\\9test8', 'te[]st\\9test9']