Posed here:
https://mail.python.org/pipermail/centraloh/2014-December/002232.html
from __future__ import print_function
import re
import numpy as np
import pandas as pd
Let's look at the source text, which shows a common pattern betwen H1
, H2
, H3
, and H4
origtext = open('formulas.txt').read()
print(origtext)
81.137644601433 + 36.6589616861417 * H1 + -31.1046441228118 * H2+ - 28.816467210923 * H3 + 29.9428662798989 * H4 H1: TanH(0.5 * ((-0.93608461826401) + 0.0279759444862288 * A + 0.00317168579532788 *B + -0.00427397729136767 * C + 0.0125193801427099 * D +0.00721746180478183 * E + 0.0170333918733729 * F + 0.39879852406802* G + -0.608563852112721 * H + 1.65930260987635 * J + 1.43483965025228 * K + -0.402472411054028 * L + 0.574879148093717 * M)) H2: TanH(0.5 * (2.54086700956048 + -0.0309791525829621 * A + 0.00237540217041462 *B + 0.000229714345399431 * C + 0.00656732772813456 * D +0.00858189985455511 * E + 0.0306145052833575 * F + 0.325324436920013* G + -0.0652380713439368 * H + 0.502808777611107 * J + 2.62515706116404 * K + 0.157259327153552 * L + - 0.0161835383305806 * M)) H3: TanH(0.5 * ((-0.215510423022572) + 0.00341969653419575 * A + 0.00595421019457033 * B + 0.00208464969350523 * C + 0.0439352627700078 * D + -0.00692214398088429 * E + 0.0651828126561487 * F + 0.688661878742455 * G + -1.18547032625557 * H + 2.44391733654282 * J + 1.26094801313471 * K + -0.504334460563991 * L + - 0.43207595594702 * M)) H4: TanH(0.5 * ((-3.1962875744613) + 0.0274843861132957 * A + 0.00431941802031792 *B + 0.000873073437785473 * C + 0.036096877808041 * D + -0.00616748059025476 * E + 0.0410646957742769 * F + 0.321067469333745 * G + -0.344778527751441 * H + 0.335721855268919 * J + 2.01028981590329 * K + 0.462469794510566 * L + 0.279409768455546 * M))
Since the formulas follow the same pattern, we can use regular expressions to extract the data that we want. Let's do this in three steps:
H#
equationsH#
equationOnce accomplished, it will be trivial to save the coefficient data to csv files (or similar) that can be reviewed, including loaded into Excel to validate the calculations.
# replace newlines with spaces for easier regex matching
text = origtext.replace('\n', ' ')
# 1. match two groups (key-val pairs); for example, "H1" and the equation excluding "Tanh(0.5 *"
equations = dict(re.findall('(H\d): +TanH.0.5[^(]+([^M]+M\)*)', text))
equations
{'H1': '((-0.93608461826401) + 0.0279759444862288 * A + 0.00317168579532788 *B + -0.00427397729136767 * C + 0.0125193801427099 * D +0.00721746180478183 * E + 0.0170333918733729 * F + 0.39879852406802* G + -0.608563852112721 * H + 1.65930260987635 * J + 1.43483965025228 * K + -0.402472411054028 * L + 0.574879148093717 * M))', 'H2': '(2.54086700956048 + -0.0309791525829621 * A + 0.00237540217041462 *B + 0.000229714345399431 * C + 0.00656732772813456 * D +0.00858189985455511 * E + 0.0306145052833575 * F + 0.325324436920013* G + -0.0652380713439368 * H + 0.502808777611107 * J + 2.62515706116404 * K + 0.157259327153552 * L + - 0.0161835383305806 * M))', 'H3': '((-0.215510423022572) + 0.00341969653419575 * A + 0.00595421019457033 * B + 0.00208464969350523 * C + 0.0439352627700078 * D + -0.00692214398088429 * E + 0.0651828126561487 * F + 0.688661878742455 * G + -1.18547032625557 * H + 2.44391733654282 * J + 1.26094801313471 * K + -0.504334460563991 * L + - 0.43207595594702 * M))', 'H4': '((-3.1962875744613) + 0.0274843861132957 * A + 0.00431941802031792 *B + 0.000873073437785473 * C + 0.036096877808041 * D + -0.00616748059025476 * E + 0.0410646957742769 * F + 0.321067469333745 * G + -0.344778527751441 * H + 0.335721855268919 * J + 2.01028981590329 * K + 0.462469794510566 * L + 0.279409768455546 * M))'}
Now let's get the coefficient data, which includes a single intercept and 12 variable coefficients for each H#
equation.
# construct the regex's and make sure they work on an equation
matchfloat = '-? *\d*\.\d*'
# match the intercept value
matchintercept = '({matchfloat}) *\)? *\+'.format(matchfloat=matchfloat)
# match both the coefficient value and the variable name [A-M]
matchvariable = '({matchfloat}) *\* *([A-M])'.format(matchfloat=matchfloat)
print('Intercept:')
print(re.findall(matchintercept, equations['H1']))
print('Coefficients:')
re.findall(matchvariable, equations['H1'])
Intercept: ['-0.93608461826401'] Coefficients:
[(' 0.0279759444862288', 'A'), (' 0.00317168579532788', 'B'), ('-0.00427397729136767', 'C'), (' 0.0125193801427099', 'D'), ('0.00721746180478183', 'E'), (' 0.0170333918733729', 'F'), (' 0.39879852406802', 'G'), ('-0.608563852112721', 'H'), (' 1.65930260987635', 'J'), (' 1.43483965025228', 'K'), ('-0.402472411054028', 'L'), (' 0.574879148093717', 'M')]
The coefficients above look like key-value pairs. Let's make dict of {varname: coeff}
and convert coeff
to float.
Use 'intercept'
as the key for the intercept value in the dict.
As a sanity check, also verify there is only one intercept and exactly 12 variables extracted by the regex.
def tofloat(sval):
return float(sval.replace(' ', ''))
def parsevars(eqtext):
d = dict(map(reversed, re.findall(matchvariable, eqtext)))
assert len(d) == 12
intercept = re.findall(matchintercept, eqtext)
assert len(intercept) == 1
d['intercept'] = intercept[0]
d = dict((key, tofloat(val))
for key, val in d.items())
return d
parsevars(equations['H1'])
{'A': 0.0279759444862288, 'B': 0.00317168579532788, 'C': -0.00427397729136767, 'D': 0.0125193801427099, 'E': 0.00721746180478183, 'F': 0.0170333918733729, 'G': 0.39879852406802, 'H': -0.608563852112721, 'J': 1.65930260987635, 'K': 1.43483965025228, 'L': -0.402472411054028, 'M': 0.574879148093717, 'intercept': -0.93608461826401}
Perfect!
Now do this for all the H#
equations and construct a pandas
DataFrame of coefficients.
eq_coeff = pd.DataFrame(map(parsevars, equations.values()), index=equations.keys()).sort()
eq_coeff.T # too wide for the screen, so take the transpose
H1 | H2 | H3 | H4 | |
---|---|---|---|---|
A | 0.027976 | -0.030979 | 0.003420 | 0.027484 |
B | 0.003172 | 0.002375 | 0.005954 | 0.004319 |
C | -0.004274 | 0.000230 | 0.002085 | 0.000873 |
D | 0.012519 | 0.006567 | 0.043935 | 0.036097 |
E | 0.007217 | 0.008582 | -0.006922 | -0.006167 |
F | 0.017033 | 0.030615 | 0.065183 | 0.041065 |
G | 0.398799 | 0.325324 | 0.688662 | 0.321067 |
H | -0.608564 | -0.065238 | -1.185470 | -0.344779 |
J | 1.659303 | 0.502809 | 2.443917 | 0.335722 |
K | 1.434840 | 2.625157 | 1.260948 | 2.010290 |
L | -0.402472 | 0.157259 | -0.504334 | 0.462470 |
M | 0.574879 | -0.016184 | -0.432076 | 0.279410 |
intercept | -0.936085 | 2.540867 | -0.215510 | -3.196288 |
Now that the coefficients are in a DataFrame, we can easily save the data to csv, excel, hdf5, a SQL database, etc. Another advantage of using a DataFrame is the equations can be calculated easily and efficiently.
The final regex operation below extracts the coefficients used in the final equation. Here again is the text we are matching against:
text[:text.index('H1:')]
'81.137644601433 + 36.6589616861417 * H1 + -31.1046441228118 * H2+ - 28.816467210923 * H3 + 29.9428662798989 * H4 '
coeff = re.findall(
'({matchfloat}){plus}'
'({matchfloat}){times}H1{plus}'
'({matchfloat}){times}H2{plus}'
'({matchfloat}){times}H3{plus}'
'({matchfloat}){times}H4'
.format(matchfloat=matchfloat, plus=' *\+ *', times=' *\* *'),
text)
coeff
[('81.137644601433', '36.6589616861417', '-31.1046441228118', '- 28.816467210923', '29.9428662798989')]
Perfect! Now load this to a pandas
Series and convert the values to float
H_coeff = pd.Series(map(tofloat, coeff[0]),
index=['intercept', 'H1', 'H2', 'H3', 'H4'])
H_coeff
intercept 81.137645 H1 36.658962 H2 -31.104644 H3 -28.816467 H4 29.942866 dtype: float64
We weren't given sample input, so let's make up our own input for demonstration
input_series = 0.1 + 0.1 * pd.Series(range(12), index=eq_coeff.columns[:-1]) # exclude 'intercept'
input_series
A 0.1 B 0.2 C 0.3 D 0.4 E 0.5 F 0.6 G 0.7 H 0.8 J 0.9 K 1.0 L 1.1 M 1.2 dtype: float64
Let's walk through the calculation one step at a time and inspect the results. We'll make the calculation cleaner and more flexible at the very end.
# calculate the value within the "0.5*(<value>)" parentheses
X = eq_coeff[input_series.index].dot(input_series) + eq_coeff['intercept']
X
H1 2.052557 H2 5.970386 H3 1.760772 H4 -0.051047 dtype: float64
# for sanity, let's look at the individual contributions of each factor
debug = eq_coeff[input_series.index] * input_series
debug['intercept'] = eq_coeff['intercept']
debug.T
H1 | H2 | H3 | H4 | |
---|---|---|---|---|
A | 0.002798 | -0.003098 | 0.000342 | 0.002748 |
B | 0.000634 | 0.000475 | 0.001191 | 0.000864 |
C | -0.001282 | 0.000069 | 0.000625 | 0.000262 |
D | 0.005008 | 0.002627 | 0.017574 | 0.014439 |
E | 0.003609 | 0.004291 | -0.003461 | -0.003084 |
F | 0.010220 | 0.018369 | 0.039110 | 0.024639 |
G | 0.279159 | 0.227727 | 0.482063 | 0.224747 |
H | -0.486851 | -0.052190 | -0.948376 | -0.275823 |
J | 1.493372 | 0.452528 | 2.199526 | 0.302150 |
K | 1.434840 | 2.625157 | 1.260948 | 2.010290 |
L | -0.442720 | 0.172985 | -0.554768 | 0.508717 |
M | 0.689855 | -0.019420 | -0.518491 | 0.335292 |
intercept | -0.936085 | 2.540867 | -0.215510 | -3.196288 |
# for kicks, let's use this debug DataFrame to perform a consistenty check
X.equals(debug.sum(axis=1))
True
# continuing with the calculation...
TANH_X = np.tanh(0.5 * X)
TANH_X
H1 0.772411 H2 0.994906 H3 0.706613 H4 -0.025518 dtype: float64
# and take the dot product again just as we did above
result = H_coeff[TANH_X.index].dot(TANH_X) + H_coeff['intercept']
result
57.381069996551943
Make the expressions more human-readable by introducing a helper function dotcoeff
. While we're at it, let's support both pandas
Series and DataFrames as input. By adding DataFrame support, the user can perform the calculation for multiple inputs simultaneously.
def dotcoeff(coeff, data):
"""Perform the dot product of coeff and data, and add coeff['intercept'].
Use the index (or columns) of the input data Series (or DataFrame).
"""
return coeff[data.T.index].dot(data.T).T + coeff['intercept']
def docalc(data):
X = dotcoeff(eq_coeff, data)
TANH_X = np.tanh(0.5 * X)
return dotcoeff(H_coeff, TANH_X)
# Make sure we still get the same result with Series input
docalc(input_series)
57.381069996551943
# Now try DataFrame input
input_df = pd.DataFrame([input_series, 2*input_series])
input_df
A | B | C | D | E | F | G | H | J | K | L | M | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1 | 1.1 | 1.2 |
1 | 0.2 | 0.4 | 0.6 | 0.8 | 1.0 | 1.2 | 1.4 | 1.6 | 1.8 | 2 | 2.2 | 2.4 |
docalc(input_df)
0 57.381070 1 86.097694 dtype: float64
And we're done! (well, I'm done anyway)
Hopefully any revisions of the coefficients will be done within a data store (such as csv or database) so we won't need to parse them again using regular expressions.