Commit b208ba82 authored by Pratap Vardhan's avatar Pratap Vardhan
Browse files

ADD: pdftra: Extract, Visualize data tables from your PDF files

parents
Pipeline #114231 failed with stages
in 26 seconds
# .editorconfig maintains consistent coding styles between different editors.
# Get plugins at http://editorconfig.org/
# - Sublime text: https://github.com/sindresorhus/editorconfig-sublime
# - Notepad++: https://github.com/editorconfig/editorconfig-notepad-plus-plus
root = true
# Apply common styles for most standard code files.
# Do not apply to * - that covers binary files as well
[*.{js,html,php,py,css,svg,json,less,yaml,yml,scss,xml,sh,java,bat,R}]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
# Stick to 2-space indenting by default, to conserve space
indent_style = space
indent_size = 2
[*.py]
indent_size = 4
[Makefile]
indent_style = tab
indent_size = 4
// ESLint configuration, exported as a CommonJS module.
module.exports = {
"parserOptions": {
"ecmaVersion": 10 // Parse up to ES2019 syntax (10 = ES2019, not ES6). Browsers other than IE support it
},
"plugins": [
"template" // Handle Tornado templates and JS in HTML files
],
"env": {
"es6": true, // Allow ES6 in JavaScript
"browser": true, // Include browser globals
"jquery": true, // Include jQuery and $
"mocha": true // Include it(), assert(), etc
},
// Names of libraries loaded via <script> tags; declared here so that no-undef does not flag them
"globals": {
"_": true, // underscore.js
"d3": true, // d3.js
"vg": true, // vega.js
"L": true, // leaflet.js
"ga": true, // Google analytics
"g1": true, // g1.min.js
"topojson": true, // topojson.js
"moment": true, // moment.js
"numeral": true, // numeral.js
"assert": true // chai.js
},
"extends": "eslint:recommended",
"rules": {
/* Override default rules */
// Every rule below has severity "off", so the option after it is NOT enforced yet;
// it only records the style we intend to switch on later.
"indent": ["off", 2], // We eventually want 2 space indentation
"linebreak-style": ["off", "unix"], // We eventually want UNIX style line endings
"quotes": ["off", "double"], // We may go for a double-quotes style
"semi": ["off", "never"] // We may go for a no-semicolon style
}
};
[flake8]
exclude=build,dist
max-line-length=99
# Ignore files that should not be committed.
# Do not commit data, passwords, secret information or large files.
# Don't commit the assets/ folder except for the README.md file
# Store them in a shared folder and sync here.
assets/**
!assets/README*
# Ignore npm and bower modules. These should be created by yarn/npm and bower.
node_modules/
bower_components/
# Filenames should NOT have spaces
* *
# Ignore byte-compiled / optimised / DLL files
__pycache__/
*.py[cod]
# Do not commit data files.
*.csv
*.db
*.sqlite3
*.sqlite3-journal
*.dat
*.mdb
# Ignore compressed files
*.7z
*.zip
# Avoid documents
*.doc*
*.pdf
*.ppt*
*.xls*
# Avoid media files
*.avi
*.mp*
*.wmv
# Unit test
.pytest_cache/
# Ignore Gramex 0.x artefacts and log files
.cache/
.gramex-compiled/
*.xhtml
*.log
# Ignore backup files
~$*
*~
*.bak*
# Sublime-text workspaces, etc
*.sublime-*
.vscode/
# IPython Notebook checkpoints
.ipynb_checkpoints
# Windows / Mac OS artefacts
*.lnk
Desktop.ini
$RECYCLE.BIN/
*[Tt]humbs.db
*.DS_Store
# bash.exe.stackdump on Cygwin
*.stackdump
# R history files
.RHistory
# For Linux FUSE file system
.fuse_hidden*
# See http://doc.gitlab.com/ce/ci/yaml/README.html
validate:
script:
- validate v1
# Deploy application to a server
deploy:
stage: deploy
script: deploy
only: [master, dev]
variables:
SERVER: ubuntu@uat.gramener.com
URL: pdftra
SETUP: bash setup.sh
VERSION: py3v1
PORT: 8050
// Gramex .htmllintrc v1.2
{
"plugins": [],
"attr-bans": [
"align",
"background",
"bgcolor",
"border",
// "frameborder", // frameborder is used in YouTube embeds
"longdesc",
"marginwidth",
"marginheight",
"scrolling",
"style", // DO NOT USE style= attribute
"onclick",
"onload"
],
"attr-name-style": false,
"attr-no-dup": false, // attr name may be computed, and get replaced by {}
"attr-no-unsafe-char": false, // title contains single quotes '
"attr-quote-style": "double", // attributes contain double quotes
"attr-req-value": false,
"class-no-dup": true, // no duplicate classes in a tag
"doctype-first": false, // snippet templates need not begin with doctype
"doctype-html5": true,
"fig-req-figcaption": false,
"focusable-tabindex-style": false,
"head-req-title": false, // title may be inside a Block.run()
"href-style": false,
"html-req-lang": false,
"id-class-ignore-regex": "\\{ *\\}", // ignore tornado template id / class
"id-class-no-ad": false,
"id-class-style": false, // no styles enforced for now
"id-no-dup": false, // template replacement IDs { } cause duplication
"img-req-alt": "allownull", // <img alt=""> needed for dynamic image content
"img-req-src": false,
"indent-style": "spaces",
"indent-width": 2,
"label-req-for": false, // cannot use if multiple forms with same key
"line-end-style": false, // raises too many errors
"raw-ignore-regex": "<%.*?%>\\s*|{[%#{].*?[%#}]}\\s*", // ignore templates
"spec-char-escape": false, // using > or < is not that big a deal
"table-req-caption": false,
"tag-bans": [
// "b", // Bootstrap caret example uses <b>
// "i", // Font-awesome icons use <i>
"s", // avoid strike tag, deprecated
"br", // avoid break tag, bad practice, use margin/padding instead
"style", // Put styles into CSS files
"u",
"strike",
"font",
"center"
],
"tag-name-lowercase": true,
"tag-name-match": true,
"tag-self-close": false,
"title-max-len": false, // we sometimes have tables inside the title=""
"title-no-dup": true
}
"use strict"
// stylelint configuration, exported as a CommonJS module.
// There is no "extends" key, so only the rules listed here are active.
module.exports = {
rules: {
"at-rule-no-unknown": true,
"block-no-empty": true,
"color-no-invalid-hex": true,
"comment-no-empty": true,
// Duplicate properties are reported, except when the same property is repeated
// on consecutive lines with different values.
"declaration-block-no-duplicate-properties": [
true,
{
ignore: ["consecutive-duplicates-with-different-values"]
}
],
"declaration-block-no-shorthand-property-overrides": true,
"font-family-no-duplicate-names": true,
"font-family-no-missing-generic-family-keyword": true,
"function-calc-no-unspaced-operator": true,
"function-linear-gradient-no-nonstandard-direction": true,
"keyframe-declaration-no-important": true,
"media-feature-name-no-unknown": true,
"no-descending-specificity": true,
"no-duplicate-at-import-rules": true,
"no-duplicate-selectors": true,
"no-empty-source": true,
"no-extra-semicolons": true,
"no-invalid-double-slash-comments": true,
"property-no-unknown": true,
"selector-pseudo-class-no-unknown": true,
"selector-pseudo-element-no-unknown": true,
"selector-type-no-unknown": true,
"string-no-newline": true,
"unit-no-unknown": true
}
}
# pdftra: Extract, Visualize data tables from your PDF files
![demo](https://cloud.gramener.com/f/04fb44304ec946348fe5/?dl=1)
- Upload PDF files with data tables
- Extract, parse, detect datatypes of first data table
- Show a data table
- Visual barchart of first dimension and metric
- ~150 lines of code (HTML, JS, Python) and gramex.yaml setup
## Setup
- [Install Gramex 1.x](https://learn.gramener.com/guide/install/)
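- `pip install opencv-python camelot-py` (the PDF table library is published on PyPI as `camelot-py`; the package named `camelot` is an unrelated project)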
- Clone this repository
- Setup assets -- `bash setup.sh`
- From the repo folder, run `gramex`
## Contributions
- Pratap Vardhan <pratap.vardhan@gramener.com>
favicon.ico

11.8 KB

# Generated by gramex init 1.58.0 (don't delete this line)
# on 2020-02-24 by Pratap Vardhan <pratap.vardhan@gramener.com>
# Configurations for app: pdftra
# ----------------------------------------------------------------------------
# URL routes for the app: home page, upload, PDF-to-JSON API, login and logout.
url:
pdftra-home:
pattern: /$YAMLURL/
handler: FileHandler
kwargs:
path: $YAMLPATH/index.html
# See https://learn.gramener.com/guide/auth/#authorization for auth rules
auth: true
template: true
headers:
Cache-Control: private, max-age=1
cache: {expiry: {duration: 1}}
# NOTE(review): unlike pdftra-home, this route sets no "auth:" -- confirm that
# uploads are meant to be accepted without logging in.
pdftra-upload:
pattern: /$YAMLURL/upload
handler: UploadHandler
kwargs:
path: $YAMLPATH/data/upload/
# NOTE(review): no "auth:" here either -- confirm. The "path" below is what
# pdftra.pdftojson reads as handler.kwargs.path; it is the same folder the
# upload route writes to.
pdftra-pdftojson:
pattern: /$YAMLURL/pdftojson
handler: FunctionHandler
kwargs:
function: pdftra.pdftojson
path: $YAMLPATH/data/upload/
pdftra-login:
# Default login_url is /login/. If you change this, change auth.login_url
# https://learn.gramener.com/guide/auth/#login-urls
pattern: /$YAMLURL/login/
# You MUST change the auth before deploying. DBAuth is commonly used.
# See https://learn.gramener.com/guide/auth/#database-auth
handler: SimpleAuth
kwargs:
template: $YAMLPATH/login.html
# NOTE(review): plain-text credentials committed to the repository (login.html
# also displays them). Replace before any real deployment.
credentials: { pdfmaker: pdf2020pass }
pdftra-logout:
pattern: /$YAMLURL/logout/
handler: LogoutHandler
# Gramex init configurations for app: pdftra
# ----------------------------------------------------------------------------
import:
# /ui/ has Gramex UI components -- use this like the node_modules/ directory
ui:
path: $GRAMEXAPPS/ui/gramex.yaml
YAMLURL: $YAMLURL/ui/
test:
auth:
user: pdfmaker
password: pdf2020pass
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title>pdftra | Extract, Visualize data tables from your PDF files</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="style.css">
<link rel="stylesheet" href="ui/dropzone/dist/min/dropzone.min.css">
</head>
<body>
{% set base = '.' %}
{% include template-navbar.html %}
<div class="container-fluid py-4">
<div class="row vh-100">
<div class="col-6">
<form action="upload" class="dropzone"></form>
<div id="pdf-preview" class="h-75">
<iframe src="./data/upload/demo.pdf" class="w-100 h-100">This browser does not support PDFs. Download PDF</iframe>
</div>
</div>
<div class="col-6">
<div id="pdf-chart">
<h3>Visual Summary</h3>
<div class="viewbox w-100"></div>
</div>
<div id="pdf-table">
<h3>Extracted Data Table</h3>
<div class="viewbox"></div>
</div>
</div>
</div>
</div><!-- .container-fluid -->
<script src="ui/jquery/dist/jquery.min.js"></script>
<script src="ui/bootstrap/dist/js/bootstrap.bundle.min.js"></script>
<script src="ui/dropzone/dist/min/dropzone.min.js"></script>
<script src="ui/vega/build/vega.min.js"></script>
<script src="ui/vega-lite/build/vega-lite.min.js"></script>
<script src="ui/vega-embed/build/vega-embed.min.js"></script>
<script src="js/utils.js"></script>
<script src="js/app.js"></script>
</body></html>
/* global utils, Dropzone */
// draw: renderers for each region of the page
const draw = {
  // Point the PDF preview iframe at an uploaded file.
  // opts.file.file is used as the file name under ./data/upload/.
  iframe: function (opts) {
    const { file } = opts
    // Encode the name: characters such as "#", "?", "%" or spaces would otherwise be
    // read as URL syntax and the preview would load the wrong resource (or none).
    $('#pdf-preview iframe').attr('src', `./data/upload/${encodeURIComponent(file.file)}`)
  },
  // Replace the table area with the HTML produced by utils.makeTable(opts)
  datatable: opts => $('#pdf-table .viewbox').html(utils.makeTable(opts)),
  // Draw the bar chart into the chart area; opts is forwarded to utils.barChart
  chart: opts => utils.barChart({el: '#pdf-chart .viewbox', ...opts}),
  // Show the loading spinner in both the chart and the table areas
  loading: () => $('#pdf-chart .viewbox, #pdf-table .viewbox').html(utils.loadElement),
  // First render: show spinners, then request the data for demo.pdf
  init: () => {
    draw.loading()
    apis.pdftojson({ filename: 'demo.pdf' })
  }
}
// APIs: wrappers around the server endpoints
const apis = {
  // GET ./pdftojson?filename=<opts.filename>.
  // On success the parsed response is published as a 'data-table' event on window;
  // on failure a message is written to the console.
  // Returns whatever the .done().fail() chain returns.
  pdftojson (opts) {
    const query = { filename: opts.filename }
    const publish = function (data) {
      return $(window).trigger('data-table', data)
    }
    const reportFailure = function () {
      return console.log('ERROR: pdftojson')
    }
    return $.getJSON('./pdftojson', query).done(publish).fail(reportFailure)
  }
}
// setup elements
// Turn off Dropzone's automatic discovery; the form is configured explicitly below.
Dropzone.autoDiscover = false;
$('.dropzone').dropzone({
  url: 'upload',
  acceptedFiles: 'application/pdf',
  dictDefaultMessage: 'Drop your PDF file here.',
  // Register listeners in init() instead of passing `sending` / `success` as options.
  // Options with those names REPLACE Dropzone's built-in handlers (the built-in
  // `success` marks the file preview as done), whereas .on() adds to them.
  // A regular function is required: Dropzone calls init() with `this` set to the instance.
  init: function () {
    // Upload is starting: show the spinners
    this.on('sending', () => $(window).trigger('data-loading'))
    // Upload finished: hand the server response to the 'upload-file' handler
    this.on('success', (file, response) => $(window).trigger('upload-file', {response}))
  }
})
// Initial render, using the bundled demo.pdf
draw.init()

// Application events, all published on window:
//   upload-file  - a PDF was uploaded; data.response is the upload response
//   data-table   - extracted table data arrived from the server
//   data-loading - an upload started; show the spinners
$(window).on({
  'upload-file': function (e, data) {
    const [file] = data.response.upload
    draw.iframe({ file })
    apis.pdftojson({ filename: file.file })
  },
  'data-table': function (e, data) {
    draw.datatable(data)
    draw.chart(data)
  },
  'data-loading': draw.loading
})
/* global vegaEmbed */
/* exported utils */
const utils = {
  // Build the HTML for a Bootstrap-styled table.
  //   opts.columns: array of column names, in display order
  //   opts.data:    array of row objects keyed by column name
  // Returns an HTML string. Headers and cell values are HTML-escaped because they
  // come from the uploaded PDF and must not be interpreted as markup.
  // null / undefined cells render as empty cells.
  makeTable: function(opts) {
    const { data, columns } = opts
    const entities = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'}
    const esc = value => value == null ? '' : String(value).replace(/[&<>"']/g, ch => entities[ch])
    return `
      <table class="table table-bordered table-sm table-striped">
        <thead>
          <tr>${columns.map(col => `<th>${esc(col)}</th>`).join('')}</tr>
        </thead>
        <tbody>
          ${data.map(row => `
          <tr>${columns.map(col => `<td>${esc(row[col])}</td>`).join('')}</tr>
          `).join('')}
        </tbody>
      </table>`
  },
  // Draw a Vega-Lite bar chart of the first categorical column (x) against the
  // first numeric column (y).
  //   opts.el:    selector or element passed to vegaEmbed
  //   opts.data:  array of row objects
  //   opts.types: { categorical: [...column names], numeric: [...column names] }
  // Returns whatever vegaEmbed returns.
  barChart: function(opts) {
    const { el, data, types } = opts
    // Build the spec as an object. Assembling JSON text by interpolation breaks
    // as soon as a column name contains a double quote or a backslash.
    const spec = {
      '$schema': 'https://vega.github.io/schema/vega-lite/v4.json',
      width: 'container',
      height: 200,
      // Copy the rows so the chart works on its own objects, as it did before
      data: { values: JSON.parse(JSON.stringify(data)) },
      mark: 'bar',
      encoding: {
        x: { field: types.categorical[0], type: 'ordinal' },
        y: { field: types.numeric[0], type: 'quantitative' }
      }
    }
    return vegaEmbed(el, spec)
  },
  // Spinner markup shown while data is loading
  loadElement: `
    <div class="spinner-border" role="status">
      <span class="sr-only">Loading...</span>
    </div>`
}
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title>pdftra Login</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="../style.css">
</head>
<body class="bg-primary gradient-tc bg-no-repeat text-white">
{% set base = '..' %}
{% include template-navbar.html %}
{% set kwargs = handler.kwargs %}
{% try %}{% set user = kwargs.user.arg %}{% except %}{% set user = 'user' %}{% end %}
{% try %}{% set password = kwargs.password.arg %}{% except %}{% set password = 'password' %}{% end %}
<div class="container d-flex flex-column align-items-center">
<div class="card shadow text-dark mx-auto my-4 px-5 py-3 col-md-6">
{% if error %}
<div class="alert alert-danger mx-n3">
<h1 class="h4">Error logging in</h1>
<p>{{ error['error'] }}</p>
<div><small><strong>code</strong>: {{ error['code'] }}</small></div>
</div>
{% end %}
<form method="POST">
<div class="form-group">
<label for="{{ user }}">Login</label>
<input type="text" class="form-control" name="{{ user }}" id="{{ user }}" value="{{ handler.get_argument(user, '') }}" placeholder="Login ID" autofocus required>
</div>
<div class="form-group">
<label for="{{ password }}">Password</label>
<input type="password" class="form-control" name="{{ password }}" id="{{ password }}" placeholder="Password" required>
</div>
<input type="hidden" name="_xsrf" value="{{ handler.xsrf_token }}">
<p><button type="submit" class="btn btn-primary w-100 small">Login</button></p>
{% if kwargs.get('forgot') %}
<p class="small"><a href="?{{ kwargs.forgot.key }}">Forgot password</a></p>
{% end %}
<div>Default login: pdfmaker (password: pdf2020pass)</div>
</form>
</div><!-- .card -->
</div>
<script src="../ui/jquery/dist/jquery.min.js"></script>
<script src="../ui/bootstrap/dist/js/bootstrap.bundle.min.js"></script>
{% if 'hash' in kwargs.get('password', {}) %}
<script src="https://cdnjs.cloudflare.com/ajax/libs/js-sha256/0.9.0/sha256.min.js"></script>
<script>
/* globals sha256 */
// hash the password before submitting
$('form').on('submit', function() {
var $password = $('#{{ password }}').get(0)
$password.value = sha256($password.value)
})
</script>
{% end %}
</body>
</html>
import logging
import os
from io import StringIO

import camelot
import pandas as pd
# Only let ERROR (and above) messages through from the 'pdfminer' logger.
logging.getLogger('pdfminer').setLevel(logging.ERROR)


def extract_table(path):
    """Read the first table found in a PDF and infer its column types.

    The first row of the extracted table is used as the header. The remaining
    rows are round-tripped through CSV text so that pandas infers a dtype for
    every column instead of leaving every cell as text.

    :arg path: path of the PDF file, passed to ``camelot.read_pdf``
    :return: dict with
        ``df``: the typed DataFrame,
        ``columns``: list of column names in order,
        ``types``: ``{'numeric': [...], 'categorical': [...]}`` column names
    :raises IndexError: if no table is detected in the PDF
    """
    tables = camelot.read_pdf(path)
    if len(tables) == 0:
        # Same exception type as indexing an empty result, but with a usable message
        raise IndexError('No tables found in PDF: %s' % path)
    raw = tables[0].df
    header, body = raw.iloc[0].values, raw.iloc[1:].values
    # to_csv() without a target returns a str and read_csv() reads it back from a
    # text buffer, so no encoding argument is needed on either side.
    csv_text = pd.DataFrame(body, columns=header).to_csv(index=False)
    df = pd.read_csv(StringIO(csv_text))
    types = {
        'numeric': list(df.select_dtypes(include='number')),
        'categorical': list(df.select_dtypes(exclude='number')),
    }
    return {'df': df, 'columns': df.columns.tolist(), 'types': types}
def pdftojson(handler):
    """Return the first table of an uploaded PDF as JSON-ready data.

    Reads the ``filename`` request argument and looks the file up directly
    inside ``handler.kwargs.path``.

    :arg handler: request handler exposing ``args`` (dict of value lists),
        ``kwargs.path`` (the upload folder) and ``set_status``
    :return: ``{'data': [row dicts], 'columns': [...], 'types': {...}}``, or
        ``None`` after setting the status to 400 (missing or disallowed
        ``filename``) or 404 (no such file)
    """
    filename = (handler.args.get('filename') or [''])[0]
    if not filename:
        handler.set_status(400)
        return None

    # The file name comes from the query string. Accept it only if it resolves to a
    # file directly inside the upload folder, so that values such as '../x' or
    # '/etc/passwd' cannot make the server open files elsewhere.
    upload_dir = os.path.abspath(handler.kwargs.path)
    target = os.path.abspath(os.path.join(upload_dir, filename))
    if os.path.dirname(target) != upload_dir:
        handler.set_status(400)
        return None
    if not os.path.isfile(target):
        handler.set_status(404)
        return None

    result = extract_table(target)
    # orient='records' is the full spelling; the abbreviation 'r' is rejected by
    # pandas 2.x. Missing cells are mapped to None because NaN is not valid JSON
    # and strict parsers (such as the browser's) reject it.
    records = [
        {key: (None if pd.isna(value) else value) for key, value in row.items()}
        for row in result['df'].to_dict(orient='records')
    ]
    return {
        'data': records,
        'columns': result['columns'],
        'types': result['types'],
    }
# Download the assets archive and unpack it into the current directory.
# Stop at the first failing command, so a failed download is never unzipped
# and the failure is reported to the caller.
set -e
# Remove the temporary archive however the script exits.
trap 'rm -f temp.zip' EXIT
# The URL is quoted because it contains "?", which the shell treats as a glob character.
wget -O temp.zip 'https://cloud.gramener.com/f/f38865cc7b9347f388bc/?dl=1'
unzip temp.zip
/* UI component styles. Customize via ?bootstrap-variable=encoded-value. Example:
Colors. Can be a name or a number (e.g. %23aabbcc). Preserve the hues below.
primary=blue
success=green
info=cyan
warning=orange
danger=red
secondary=grey
light=lightgrey
dark=darkgrey
body-bg=white
body-color=black
Fonts. Can be a system font or Open+Sans, Roboto, Lato, Anton, Montserrat
font-family-base=Segoe+UI
headings-font-family=Segoe+UI
Other
https://github.com/twbs/bootstrap/blob/v4-dev/scss/_variables.scss
*/
@import url("ui/bootstraptheme.css?body-bg=white&navbar-dark-color=rgba(255%2C255%2C255%2C.8)&navbar-dark-hover-color=white");
/* For v4 icons, use url("ui/font-awesome/css/font-awesome.min.css") */
@import url("ui/@fortawesome/fontawesome-free/css/all.min.css");
/* custom styles for app: pdftra */
<nav class="navbar navbar-expand-lg navbar-dark bg-dark py-0">
<a class="navbar-brand" href="{{ base }}">
<!-- Replace with your logo. Just specify height, not width. -->
<img height="36" src="" alt="Logo">
<span class="text-uppercase">pdftra</span>
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<!-- Navbar middle -->
<div class="navbar-nav mr-auto">
<span class="nav-item nav-link active mb-0">Extract, Visualize data tables from your PDF files.</span>
</div>
<!-- Navbar right -->
<div class="navbar-nav mr-2">
{% if handler.current_user %}
<a class="nav-item nav-link" href="{{ base }}/login/" title="Log in again" data-placement="bottom">
{{ handler.current_user.id }}
</a>
<a class="nav-item nav-link" href="{{ base }}/logout/" title="Log out" data-placement="bottom"><span class="fas fa-arrow-right bg-light round text-dark p-1"></span></a>
{% else %}
<a class="nav-item nav-link" href="{{ base }}/login/" title="Log in" data-placement="bottom">
Log in
</a>
{% end %}
</div>
</div>
</nav>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment