# pdftra.py
from io import StringIO
import logging
import pandas as pd
import camelot
from gramex.http import BAD_REQUEST
# Raise the 'pdfminer' logger's threshold so that only ERROR and above get
# through. NOTE(review): presumably camelot drives pdfminer and its
# lower-level messages are noise here -- confirm.
_pdfminer_logger = logging.getLogger('pdfminer')
_pdfminer_logger.setLevel(logging.ERROR)


def extract_table(path):
    """Read the first table camelot finds in a PDF and describe its columns.

    The first row of the extracted table is used as the header and the
    remaining rows as data. The result is written to CSV text and parsed
    back with ``pd.read_csv`` (presumably so pandas infers a dtype for each
    column -- confirm).

    :arg path: location of the PDF, passed as-is to ``camelot.read_pdf``.
    :return: dict with three keys:

        - ``'df'``: the re-parsed DataFrame
        - ``'columns'``: list of its column labels
        - ``'types'``: ``{'numeric': [...], 'categorical': [...]}`` -- column
          labels split by whether their dtype is a number or not
    """
    # Only the first detected table is used; any others are ignored.
    raw = camelot.read_pdf(path)[0].df
    header = raw.iloc[0].values
    body = raw.iloc[1:].values
    csv_text = pd.DataFrame(body, columns=header).to_csv(
        index=False, encoding='utf-8')
    frame = pd.read_csv(StringIO(csv_text), encoding='utf-8')
    numeric = frame.select_dtypes(include='number').columns.tolist()
    categorical = frame.select_dtypes(exclude='number').columns.tolist()
    return {
        'df': frame,
        'columns': frame.columns.tolist(),
        'types': {'numeric': numeric, 'categorical': categorical},
    }


def pdftojson(handler):
    """Return the first table of the requested PDF as JSON-ready data.

    Reads the ``filename`` request argument, extracts the table from the PDF
    at ``handler.kwargs.path + filename`` via ``extract_table`` and returns
    a dict with:

    - ``'data'``: the table as a list of ``{column: value}`` row dicts
    - ``'columns'``: list of column labels
    - ``'types'``: the numeric / categorical column split

    If ``filename`` is missing or empty, or contains a ``..`` path
    component, the response status is set to ``BAD_REQUEST`` and the result
    of ``handler.set_status`` is returned instead.
    """
    filename = handler.args.get('filename', [False])[0]
    if not filename:
        return handler.set_status(BAD_REQUEST)
    # ``filename`` comes straight from the request and is appended to the
    # path prefix below, so refuse parent-directory components that would
    # let a request reach files outside that prefix.
    if '..' in filename.replace('\\', '/').split('/'):
        return handler.set_status(BAD_REQUEST)
    result = extract_table(handler.kwargs.path + filename)
    return {
        # Spell out 'records': the abbreviated orient 'r' was deprecated in
        # pandas 1.x and is rejected by pandas 2.0 and later.
        'data': result['df'].to_dict(orient='records'),
        'columns': result['columns'],
        'types': result['types'],
    }