Commit b208ba82 authored by Pratap Vardhan's avatar Pratap Vardhan
Browse files

ADD: pdftra: Extract, Visualize data tables from your PDF files

Pipeline #114231 failed with stages
in 26 seconds
# .editorconfig maintains consistent coding styles between different editors.
# Get plugins at
# - Sublime text:
# - Notepad++:
root = true
# Apply common styles for most standard code files.
# Do not apply to * - that covers binary files as well
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
# Stick to 2-space indenting by default, to conserve space
indent_style = space
indent_size = 2
indent_size = 4
indent_style = tab
indent_size = 4
module.exports = {
"parserOptions": {
"ecmaVersion": 10 // Use the ES2019 (ES10) parser. Browsers other than IE support it
"plugins": [
"template" // Handle Tornado templates and JS in HTML files
"env": {
"es6": true, // Allow ES6 in JavaScript
"browser": true, // Include browser globals
"jquery": true, // Include jQuery and $
"mocha": true // Include it(), assert(), etc
"globals": {
"_": true, // underscore.js
"d3": true, // d3.js
"vg": true, // vega.js
"L": true, // leaflet.js
"ga": true, // Google analytics
"g1": true, // g1.min.js
"topojson": true, // topojson.js
"moment": true, // moment.js
"numeral": true, // numeral.js
"assert": true // chai.js
"extends": "eslint:recommended",
"rules": {
/* Override default rules */
"indent": ["off", 2], // We eventually want 2 space indentation
"linebreak-style": ["off", "unix"], // We eventually want UNIX style line endings
"quotes": ["off", "double"], // We may go for a double-quotes style
"semi": ["off", "never"] // We may go for a no-semicolon style
# Ignore files that should not be committed.
# Do not commit data, passwords, secret information or large files.
# Don't commit the assets/ folder except for the file
# Store them in a shared folder and sync here.
# Ignore npm and bower modules. These should be created by yarn/npm and bower.
# Filenames should NOT have spaces
* *
# Ignore byte-compiled / optimised / DLL files
# Do not commit data files.
# Ignore compressed files
# Avoid documents
# Avoid media files
# Unit test
# Ignore Gramex 0.x artefacts and log files
# Ignore backup files
# Sublime-text workspaces, etc
# IPython Notebook checkpoints
# Windows / Mac OS artefacts
# bash.exe.stackdump on Cygwin
# R history files
# For Linux FUSE file system
# See
- validate v1
# Deploy application to a server
stage: deploy
script: deploy
only: [master, dev]
URL: pdftra
SETUP: bash
VERSION: py3v1
PORT: 8050
// Gramex .htmllintrc v1.2
"plugins": [],
"attr-bans": [
// "frameborder", // frameborder is used in YouTube embeds
"style", // DO NOT USE style= attribute
"attr-name-style": false,
"attr-no-dup": false, // attr name may be computed, and get replaced by {}
"attr-no-unsafe-char": false, // title contains single quotes '
"attr-quote-style": "double", // attributes contain double quotes
"attr-req-value": false,
"class-no-dup": true, // no duplicate classes in a tag
"doctype-first": false, // snippet templates need not begin with doctype
"doctype-html5": true,
"fig-req-figcaption": false,
"focusable-tabindex-style": false,
"head-req-title": false, // title may be inside a
"href-style": false,
"html-req-lang": false,
"id-class-ignore-regex": "\\{ *\\}", // ignore tornado template id / class
"id-class-no-ad": false,
"id-class-style": false, // no styles enforced for now
"id-no-dup": false, // template replacement IDs { } cause duplication
"img-req-alt": "allownull", // <img alt=""> needed for dynamic image content
"img-req-src": false,
"indent-style": "spaces",
"indent-width": 2,
"label-req-for": false, // cannot use if multiple forms with same key
"line-end-style": false, // raises too many errors
"raw-ignore-regex": "<%.*?%>\\s*|{[%#{].*?[%#}]}\\s*", // ignore templates
"spec-char-escape": false, // using > or < is not that big a deal
"table-req-caption": false,
"tag-bans": [
// "b", // Bootstrap caret example uses <b>
// "i", // Font-awesome icons use <i>
"s", // avoid strike tag, deprecated
"br", // avoid break tag, bad practice, use margin/padding instead
"style", // Put styles into CSS files
"tag-name-lowercase": true,
"tag-name-match": true,
"tag-self-close": false,
"title-max-len": false, // we sometimes have tables inside the title=""
"title-no-dup": true
"use strict"
module.exports = {
rules: {
"at-rule-no-unknown": true,
"block-no-empty": true,
"color-no-invalid-hex": true,
"comment-no-empty": true,
"declaration-block-no-duplicate-properties": [
ignore: ["consecutive-duplicates-with-different-values"]
"declaration-block-no-shorthand-property-overrides": true,
"font-family-no-duplicate-names": true,
"font-family-no-missing-generic-family-keyword": true,
"function-calc-no-unspaced-operator": true,
"function-linear-gradient-no-nonstandard-direction": true,
"keyframe-declaration-no-important": true,
"media-feature-name-no-unknown": true,
"no-descending-specificity": true,
"no-duplicate-at-import-rules": true,
"no-duplicate-selectors": true,
"no-empty-source": true,
"no-extra-semicolons": true,
"no-invalid-double-slash-comments": true,
"property-no-unknown": true,
"selector-pseudo-class-no-unknown": true,
"selector-pseudo-element-no-unknown": true,
"selector-type-no-unknown": true,
"string-no-newline": true,
"unit-no-unknown": true
# pdftra: Extract, Visualize data tables from your PDF files
- Upload PDF files with data tables
- Extract and parse the first data table, and detect the datatype of each column
- Show a data table
- Draw a bar chart of the first dimension against the first metric
- ~150 lines of code (HTML, JS, Python) and gramex.yaml setup
## Setup
- Install Gramex 1.x
- `pip install opencv-python camelot-py` (the PyPI package for the `camelot` module is `camelot-py`)
- Clone this repository
- Setup assets -- `bash`
- From the repo folder, run `gramex`
## Contributions
- Pratap Vardhan <>

11.8 KB

# Generated by gramex init 1.58.0 (don't delete this line)
# on 2020-02-24 by Pratap Vardhan <>
# Configurations for app: pdftra
# ----------------------------------------------------------------------------
pattern: /$YAMLURL/
handler: FileHandler
path: $YAMLPATH/index.html
# See for auth rules
auth: true
template: true
Cache-Control: private, max-age=1
cache: {expiry: {duration: 1}}
pattern: /$YAMLURL/upload
handler: UploadHandler
path: $YAMLPATH/data/upload/
pattern: /$YAMLURL/pdftojson
handler: FunctionHandler
function: pdftra.pdftojson
path: $YAMLPATH/data/upload/
# Default login_url is /login/. If you change this, change auth.login_url
pattern: /$YAMLURL/login/
# You MUST change the auth before deploying. DBAuth is commonly used.
# See
handler: SimpleAuth
template: $YAMLPATH/login.html
credentials: { pdfmaker: pdf2020pass }
pattern: /$YAMLURL/logout/
handler: LogoutHandler
# Gramex init configurations for app: pdftra
# ----------------------------------------------------------------------------
# /ui/ has Gramex UI components -- use this like the node_modules/ directory
path: $GRAMEXAPPS/ui/gramex.yaml
user: pdfmaker
password: pdf2020pass
<!doctype html>
<html lang="en">
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title>pdftra | Extract, Visualize data tables from your PDF files</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="style.css">
<link rel="stylesheet" href="ui/dropzone/dist/min/dropzone.min.css">
{% set base = '.' %}
{% include template-navbar.html %}
<div class="container-fluid py-4">
<div class="row vh-100">
<div class="col-6">
<form action="upload" class="dropzone"></form>
<div id="pdf-preview" class="h-75">
<iframe src="./data/upload/demo.pdf" class="w-100 h-100">This browser does not support PDFs. Download PDF</iframe>
<div class="col-6">
<div id="pdf-chart">
<h3>Visual Summary</h3>
<div class="viewbox w-100"></div>
<div id="pdf-table">
<h3>Extracted Data Table</h3>
<div class="viewbox"></div>
</div><!-- .container-fluid -->
<script src="ui/jquery/dist/jquery.min.js"></script>
<script src="ui/bootstrap/dist/js/bootstrap.bundle.min.js"></script>
<script src="ui/dropzone/dist/min/dropzone.min.js"></script>
<script src="ui/vega/build/vega.min.js"></script>
<script src="ui/vega-lite/build/vega-lite.min.js"></script>
<script src="ui/vega-embed/build/vega-embed.min.js"></script>
<script src="js/utils.js"></script>
<script src="js/app.js"></script>
/* global utils, Dropzone */
// draw
/**
 * Rendering helpers. Each one updates a region of the page.
 */
const draw = {
  /**
   * Point the PDF preview iframe at an uploaded file.
   * @param {{file: {file: string}}} opts - `opts.file.file` is the stored file name.
   */
  iframe: function (opts) {
    const { file } = opts
    // Encode the name so spaces, '#', '?' etc. in a file name do not corrupt the URL
    $('#pdf-preview iframe').attr('src', `./data/upload/${encodeURIComponent(file.file)}`)
  },
  /** Render the extracted table into the table viewbox. */
  datatable: opts => $('#pdf-table .viewbox').html(utils.makeTable(opts)),
  /** Render the bar chart into the chart viewbox; `opts` is forwarded to utils.barChart. */
  chart: opts => utils.barChart({el: '#pdf-chart .viewbox', ...opts}),
  /** Replace both viewboxes with the loading spinner markup. */
  loading: () => $('#pdf-chart .viewbox, #pdf-table .viewbox').html(utils.loadElement),
  /** Request the table for the bundled demo PDF. */
  init: () => {
    apis.pdftojson({ filename: 'demo.pdf' })
  }
}
// APIs
/**
 * Server API wrappers.
 */
const apis = {
  /**
   * Ask the server to extract the table from an uploaded PDF.
   * On success, triggers a `data-table` event on window with the JSON response.
   * @param {{filename: string}} opts - name of the uploaded file to parse.
   * @returns the jQuery request object returned by `$.getJSON`, so callers can chain on it.
   */
  pdftojson: function (opts) {
    return $.getJSON('./pdftojson', {filename: opts.filename})
      .done(data => $(window).trigger('data-table', data))
      // Report failures as errors, with whatever status details jQuery passes along
      .fail((xhr, status, error) => console.error('ERROR: pdftojson', status, error))
  }
}
// setup elements
// Turn off Dropzone auto-discovery (per Dropzone docs this stops it attaching
// to `.dropzone` elements on its own), so the upload widget is configured explicitly.
Dropzone.autoDiscover = false;
// Upload widget options: POST to `upload`, accept PDFs only.
// NOTE(review): the call that receives these options (presumably a Dropzone
// constructor) is not visible in this view — confirm against the full file.
url: 'upload', acceptedFiles: 'application/pdf',
dictDefaultMessage: 'Drop your PDF file here.',
// Show the loading state as soon as a file starts uploading
sending: () => $(window).trigger('data-loading'),
// Hand the server response to the `upload-file` listener below
success: (file, response) => $(window).trigger('upload-file', {response})
// initial draw
// events handler
// `upload-file`: preview the first uploaded file and request its extracted table.
$(window).on('upload-file', function(e, data) {
// First entry of the upload response; its `file` property is used as the file name below
const file = data.response.upload[0]
draw.iframe({ file })
apis.pdftojson({ filename: file.file })
// `data-table`: fired by apis.pdftojson on success.
// NOTE(review): the handler body is empty in this view — presumably it renders
// the table and chart; confirm against the full file.
}).on('data-table', function(e, data) {
// `data-loading`: fired when an upload starts; shows the loading state.
}).on('data-loading', draw.loading)
/* global vegaEmbed */
/* exported utils */
/**
 * Rendering utilities: HTML table markup, Vega-Lite bar chart, loading spinner.
 */
const utils = {
  /**
   * Escape a value for safe insertion into HTML text.
   * @param {*} value - any value; it is converted with String() first.
   * @returns {string} the text with &, <, >, " and ' replaced by entities.
   */
  escapeHtml: function(value) {
    return String(value).replace(/[&<>"']/g, ch => ({
      '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'
    }[ch]))
  },
  /**
   * Build the HTML for a data table.
   * @param {{data: Object[], columns: string[]}} opts - rows keyed by column name,
   *   and the column names in display order.
   * @returns {string} `<table>` markup with one header row and one row per record.
   */
  makeTable: function(opts) {
    const { data, columns } = opts
    // Header and cell text comes from uploaded PDFs, so escape it before it reaches .html()
    const header = columns.map(col => `<th>${utils.escapeHtml(col)}</th>`).join('')
    const body = data.map(row => `
        <tr>${columns.map(col => `<td>${utils.escapeHtml(row[col])}</td>`).join('')}</tr>`
    ).join('')
    return `
      <table class="table table-bordered table-sm table-striped">
        <tr>${header}</tr>${body}
      </table>
    `
  },
  /**
   * Embed a bar chart of the first categorical column against the first numeric column.
   * @param {{el: string, data: Object[], types: {categorical: string[], numeric: string[]}}} opts
   * @returns whatever `vegaEmbed` returns for the target element and spec.
   */
  barChart: function(opts) {
    const { el, data, types } = opts
    // Build the spec as an object rather than templated JSON text, so column
    // names containing quotes or backslashes cannot break parsing.
    const spec = {
      // NOTE(review): $schema is empty here — set it to the Vega-Lite schema URL the app targets
      '$schema': '',
      width: 'container', height: 200,
      // Shallow-copy rows so the chart does not share row objects with the caller
      data: { values: data.map(row => ({ ...row })) },
      mark: 'bar',
      encoding: {
        x: { field: types.categorical[0], type: 'ordinal' },
        y: { field: types.numeric[0], type: 'quantitative' }
      }
    }
    return vegaEmbed(el, spec)
  },
  /** Spinner markup shown while data is loading. */
  loadElement: `
    <div class="spinner-border" role="status">
      <span class="sr-only">Loading...</span>
    </div>
  `
}
<!doctype html>
<html lang="en">
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title>pdftra Login</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="../style.css">
<body class="bg-primary gradient-tc bg-no-repeat text-white">
{% set base = '..' %}
{% include template-navbar.html %}
{% set kwargs = handler.kwargs %}
{% try %}{% set user = kwargs.user.arg %}{% except %}{% set user = 'user' %}{% end %}
{% try %}{% set password = kwargs.password.arg %}{% except %}{% set password = 'password' %}{% end %}
<div class="container d-flex flex-column align-items-center">
<div class="card shadow text-dark mx-auto my-4 px-5 py-3 col-md-6">
{% if error %}
<div class="alert alert-danger mx-n3">
<h1 class="h4">Error logging in</h1>
<p>{{ error['error'] }}</p>
<div><small><strong>code</strong>: {{ error['code'] }}</small></div>
{% end %}
<form method="POST">
<div class="form-group">
<label for="{{ user }}">Login</label>
<input type="text" class="form-control" name="{{ user }}" id="{{ user }}" value="{{ handler.get_argument(user, '') }}" placeholder="Login ID" autofocus required>
<div class="form-group">
<label for="{{ password }}">Password</label>
<input type="password" class="form-control" name="{{ password }}" id="{{ password }}" placeholder="Password" required>
<input type="hidden" name="_xsrf" value="{{ handler.xsrf_token }}">
<p><button type="submit" class="btn btn-primary w-100 small">Login</button></p>
{% if kwargs.get('forgot') %}
<p class="small"><a href="?{{ kwargs.forgot.key }}">Forgot password</a></p>
{% end %}
<div>Default login: pdfmaker (password: pdf2020pass)</div>
</div><!-- .card -->
<script src="../ui/jquery/dist/jquery.min.js"></script>
<script src="../ui/bootstrap/dist/js/bootstrap.bundle.min.js"></script>
{% if 'hash' in kwargs.get('password', {}) %}
<script src=""></script>
/* globals sha256 */
// hash the password before submitting
$('form').on('submit', function() {
var $password = $('#{{ password }}').get(0)
$password.value = sha256($password.value)
{% end %}
from io import StringIO
import logging
import pandas as pd
import camelot
def extract_table(path):
    """Extract the first table of a PDF as a typed DataFrame.

    Args:
        path: location of the PDF, passed straight to ``camelot.read_pdf``.

    Returns:
        dict with keys:
            ``df``: DataFrame of the first detected table, with the table's
                first row used as the column header.
            ``columns``: list of column names.
            ``types``: ``{'numeric': [...], 'categorical': [...]}`` column names
                split by whether pandas considers the column a number dtype.

    Raises whatever ``camelot.read_pdf`` raises; indexing ``tables[0]`` fails if
    no table was detected.
    """
    tables = camelot.read_pdf(path)
    # Only the first detected table is used
    df = tables[0].df
    # Promote the first row to the header, then round-trip through CSV text;
    # presumably so that read_csv infers a dtype for each column.
    buf = pd.DataFrame(
        df.iloc[1:].values, columns=df.iloc[0].values
    ).to_csv(index=False, encoding='utf-8')
    df = pd.read_csv(StringIO(buf), encoding='utf-8')
    columns = df.columns.tolist()
    types = {
        'numeric': list(df.select_dtypes(include='number')),
        'categorical': list(df.select_dtypes(exclude='number'))
    }
    return {'df': df, 'columns': columns, 'types': types}
def pdftojson(handler):
    """FunctionHandler entry point: return the first table of an uploaded PDF.

    Reads the first ``filename`` value from ``handler.args`` and resolves it
    inside the directory given by ``handler.kwargs.path``.

    Returns:
        dict with ``data`` (list of row dicts), ``columns`` and ``types`` as
        produced by ``extract_table``. If ``filename`` is missing, or resolves
        outside the upload directory, sets HTTP status 400 and returns the
        result of ``handler.set_status``.
    """
    import os

    filename = handler.args.get('filename', [False])[0]
    if not filename:
        return handler.set_status(400)
    # filename comes from the request: resolve it and refuse anything that
    # escapes the upload directory (e.g. "../../etc/passwd" or absolute paths).
    root = os.path.realpath(handler.kwargs.path)
    target = os.path.realpath(os.path.join(root, filename))
    if not target.startswith(root + os.sep):
        return handler.set_status(400)
    result = extract_table(target)
    return {
        # 'records' spelled out: the short form 'r' is rejected by pandas >= 2.0
        'data': result['df'].to_dict(orient='records'),
        'columns': result['columns'],
        'types': result['types']}
wget -O
unzip; rm;
/* UI component styles. Customize via ?bootstrap-variable=encoded-value. Example:
Colors. Can be a name or a number (e.g. %23aabbcc). Preserve the hues below.
Fonts. Can be a system font or Open+Sans, Roboto, Lato, Anton, Monserrat
@import url("ui/bootstraptheme.css?body-bg=white&navbar-dark-color=rgba(255%2C255%2C255%2C.8)&navbar-dark-hover-color=white");
/* For v4 icons, use url("ui/font-awesome/css/font-awesome.min.css") */
@import url("ui/@fortawesome/fontawesome-free/css/all.min.css");
/* custom styles for app: pdftra */
<nav class="navbar navbar-expand-lg navbar-dark bg-dark py-0">
<a class="navbar-brand" href="{{ base }}">
<!-- Replace with your logo. Just specify height, not width. -->
<img height="36" src="" alt="Logo">
<span class="text-uppercase">pdftra</span>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<!-- Navbar middle -->
<div class="navbar-nav mr-auto">
<span class="nav-item nav-link active mb-0">Extract, Visualize data tables from your PDF files.</span>
<!-- Navbar right -->
<div class="navbar-nav mr-2">
{% if handler.current_user %}
<a class="nav-item nav-link" href="{{ base }}/login/" title="Log in again" data-placement="bottom">
{{ }}
<a class="nav-item nav-link" href="{{ base }}/logout/" title="Log out" data-placement="bottom"><span class="fas fa-arrow-right bg-light round text-dark p-1"></span></a>
{% else %}
<a class="nav-item nav-link" href="{{ base }}/login/" title="Log in again" data-placement="bottom">
Log in
{% end %}
Markdown is supported