https://groups.google.com/forum/#!topic/python-etl/XRIJovpb6Qc

The conversion I'm trying to make is:

  • If nb_day is not integer:
    • if notes contains 'am':
      • split line: 1 with full days, modifying date_end = previous date_end - 1 1 with 0.5 day, modifying date_begin = date_end
    • if notes contacts 'pm':
      • split line: 1 with 0.5 day, modifying date_end = date_begin 1 with full days, modifying date_begin = previous date_begin + 1
In [1]:
import datetime
import petl.interactive as etl
print etl.__version__
0.26
In [2]:
data = """user_id,date_begin,date_end,nb_days,notes,projet_id
user1,2014-07-31,2014-08-07,5.5,5 days + am,cp
user2,2014-07-31,2014-08-07,5.5,5 days + pm,cp
user3,2014-07-31,2014-08-06,5,5 days,cp
"""
In [3]:
day = datetime.timedelta(days=1)


def split_partial_days(row):
    if isinstance(row.nb_days, float):
        # split out partial days into separate row
        if 'am' in row.notes:
            # full days
            yield (row.user_id, 
                   row.date_begin, 
                   row.date_end - day,
                   int(row.nb_days),
                   row.notes.split('+')[0].strip(), 
                   row.projet_id)
            # partial days
            yield (row.user_id, 
                   row.date_end, 
                   row.date_end, 
                   row.nb_days - int(row.nb_days),
                   row.notes.split('+')[1].strip(), 
                   row.projet_id)
        if 'pm' in row.notes:
            # partial days
            yield (row.user_id, 
                   row.date_begin, 
                   row.date_begin, 
                   row.nb_days - int(row.nb_days),
                   row.notes.split('+')[1].strip(), 
                   row.projet_id)
            # full days
            yield (row.user_id, 
                   row.date_begin + day, 
                   row.date_end,
                   int(row.nb_days),
                   row.notes.split('+')[0].strip(), 
                   row.projet_id)
    else:
        # do nothing
        yield row

        
tbl = (etl
    .fromcsv(etl.StringSource(data))
    .convert(('date_begin', 'date_end'), etl.dateparser('%Y-%m-%d'))
    .convert('nb_days', etl.parsenumber)
    .rowmapmany(split_partial_days, fields=['user_id', 'date_begin', 'date_end', 'nb_days', 'notes', 'projet_id'])
)
tbl
Out[3]:
user_id date_begin date_end nb_days notes projet_id
user1 2014-07-31 2014-08-06 5 5 days cp
user1 2014-08-07 2014-08-07 0.5 am cp
user2 2014-07-31 2014-07-31 0.5 pm cp
user2 2014-08-01 2014-08-07 5 5 days cp
user3 2014-07-31 2014-08-06 5 5 days cp
In [3]: