Skip to content

Commit

Permalink
Merge pull request #1051 from 18F/simple-pdf-import
Browse files Browse the repository at this point in the history
Add very rudimentary PDF import
  • Loading branch information
cmc333333 authored Mar 19, 2018
2 parents 7fec670 + b292b7b commit 2d8e21c
Show file tree
Hide file tree
Showing 12 changed files with 326 additions and 106 deletions.
13 changes: 0 additions & 13 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,6 @@ api: &API
environment:
DEBUG: "True"
DATABASE_URL: postgres://postgres@localhost/postgres
VCAP_SERVICES: |
{"s3": [{
"credentials": {
"access_key_id": "LOCAL_ID",
"bucket": "pdfs",
"region": "irrelevant fake region",
"endpoint": "http://localhost:9100",
"secret_access_key": "LOCAL_KEY"
},
"label": "s3",
"name": "storage-s3"
}]
}
admin-ui: &ADMIN_UI
docker:
- image: node:6
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ ui/lcov.info
.next
api/webpack-static/
api/collected-static/
api/media/

# Data
*.csv
Expand Down
40 changes: 19 additions & 21 deletions api/omb_eregs/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,30 +199,28 @@
]

# File storage

DEFAULT_FILE_STORAGE = 'storages.backends.s3boto3.S3Boto3Storage'

# https://django-storages.readthedocs.io/en/latest/backends/amazon-S3.html
# In addition to using django-storages, we're maintaining parity between the
# local docker-compose environment and the cloud.gov environments; see the
# environment section of the prod-api service in docker-compose.yml for where
# these values are coming from for local development.
s3service = env.get_service(label="s3")
AWS_ACCESS_KEY_ID = s3service.credentials.get("access_key_id")
AWS_SECRET_ACCESS_KEY = s3service.credentials.get("secret_access_key")
AWS_STORAGE_BUCKET_NAME = s3service.credentials.get("bucket")
AWS_AUTO_CREATE_BUCKET = True
if "endpoint" in s3service.credentials:
# For local development, we must override the endpoint, and region is
# irrelevant.
AWS_S3_ENDPOINT_URL = s3service.credentials["endpoint"]
if s3service:
AWS_ACCESS_KEY_ID = s3service.credentials.get("access_key_id")
AWS_SECRET_ACCESS_KEY = s3service.credentials.get("secret_access_key")
AWS_STORAGE_BUCKET_NAME = s3service.credentials.get("bucket")
AWS_AUTO_CREATE_BUCKET = True
if "endpoint" in s3service.credentials:
# For local development, we must override the endpoint, and region is
# irrelevant.
AWS_S3_ENDPOINT_URL = s3service.credentials["endpoint"]
else:
# On cloud.gov, we need region and we want django-storages to infer the
# correct URL for us rather than setting an endpoint ourselves.
AWS_S3_REGION_NAME = s3service.credentials["region"]
AWS_S3_OBJECT_PARAMETERS = {
'ContentDisposition': 'attachment', # Browsers should download files
}
DEFAULT_FILE_STORAGE = 'storages.backends.s3boto3.S3Boto3Storage'
else:
# On cloud.gov, we need region and we want django-storages to infer the
# correct URL for us rather than setting an endpoint ourselves.
AWS_S3_REGION_NAME = s3service.credentials["region"]
AWS_S3_OBJECT_PARAMETERS = {
'ContentDisposition': 'attachment', # Browsers should download files
}
MEDIA_ROOT = 'media'


TAGGIT_CASE_INSENSITIVE = True

Expand Down
16 changes: 11 additions & 5 deletions api/ombpdf/management/commands/migrate_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ def ensure_section_has_heading(doc: DocCursor) -> DocCursor:
]


def migrate_doc(doc: DocCursor) -> DocCursor:
"""Apply all transforms to a given document. Save it and return."""
for transform in transforms:
doc = transform(doc)
doc.nested_set_renumber(bulk_create=False)
for node in doc.walk():
node.save()
return doc


class Command(BaseCommand):
help = ( # noqa (overriding a builtin)
"Run through (idempotent) document migrations to mass-fixup docs.")
Expand All @@ -63,9 +73,5 @@ def handle(self, *args, **kwargs):
for root_docnode in roots:
with transaction.atomic():
doc = DocCursor.load_from_model(root_docnode)
for transform in transforms:
doc = transform(doc)
doc.nested_set_renumber(bulk_create=False)
for node in doc.walk():
node.save()
migrate_doc(doc)
pbar.update(1)
63 changes: 40 additions & 23 deletions api/reqs/admin.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,14 @@
from django import forms
from django.conf.urls import url
from django.contrib import admin
from django.core.exceptions import ValidationError
from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404, render
from django.urls import reverse

from ereqs_admin.revision_admin import EReqsVersionAdmin
from reqs.forms import DocumentUploadForm
from reqs.models import Agency, AgencyGroup, Office, Policy, Requirement, Topic


def is_extension_pdf(uploaded_file):
if (not uploaded_file.name.endswith('pdf')
or uploaded_file.content_type != 'application/pdf'):
raise ValidationError('The file must be a PDF.')


# This form is temporarily no-longer used. We'll add it back with the pdf
# upload workflow
class PolicyForm(forms.ModelForm):
document_source = forms.FileField(required=False,
validators=[is_extension_pdf])

class Meta:
model = Policy
fields = '__all__'


@admin.register(Policy)
class PolicyAdmin(EReqsVersionAdmin):
actions = None
Expand All @@ -42,14 +26,47 @@ class PolicyAdmin(EReqsVersionAdmin):
]

def response_post_save_change(self, request, obj):
"""Redirect to the document editor, if that's the button the user
clicked."""
if '_savethendoc' in request.POST:
policy_id = obj.omb_policy_id or obj.slug
"""Redirect to the document import or editor, if that's the button the
user clicked."""
policy_id = obj.omb_policy_id or obj.slug
if '_savethendoc' in request.POST and obj.has_no_document:
return HttpResponseRedirect(reverse(
'admin:document_upload', kwargs={'pk': obj.pk}))
elif '_savethendoc' in request.POST:
return HttpResponseRedirect(
reverse('document_editor', kwargs={'policy_id': policy_id}))
return super().response_post_save_change(request, obj)

def get_urls(self):
"""Add the document upload view"""
urls = super().get_urls()
return [
url('^(?P<pk>\d+)/document_upload$',
self.admin_site.admin_view(self.document_upload),
name='document_upload'),
] + urls

def document_upload(self, request, pk):
"""Handler for uploading new documents."""
policy = get_object_or_404(Policy, pk=pk)
if request.method == 'POST': # submitted form
form = DocumentUploadForm(request.POST, request.FILES,
instance=policy)
if form.is_valid():
form.save()
return HttpResponseRedirect(reverse(
'document_editor',
kwargs={'policy_id': policy.omb_policy_id or policy.slug},
))
else:
form = DocumentUploadForm(instance=policy)
context = dict(
self.admin_site.each_context(request),
form=form,
title='Document Upload',
)
return render(request, 'reqs/admin/document_upload.html', context)


@admin.register(Topic)
class TopicAdmin(EReqsVersionAdmin):
Expand Down
77 changes: 77 additions & 0 deletions api/reqs/forms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import logging

import reversion
from django import forms
from django.core.validators import FileExtensionValidator
from django.db import connection, transaction

from document.models import DocNode
from ombpdf.document import OMBDocument
from ombpdf.management.commands.migrate_documents import migrate_doc
from ombpdf.semdb import to_db
from reqs.models import Policy, Requirement, WorkflowPhases

logger = logging.getLogger(__name__)


def create_document(policy: Policy):
"""Attempt to parse the PDF associated with this policy, applying document
migrations for misparses."""
try:
DocNode.objects.filter(policy=policy).delete()
parsed_pdf = OMBDocument.from_file(policy.document_source.file)
cursor = to_db(parsed_pdf, policy)
policy.workflow_phase = WorkflowPhases.cleanup.name
policy.save()
migrate_doc(cursor)
except: # noqa - don't know the exceptions we'll raise
logger.exception('Error loading document')
policy.workflow_phase = WorkflowPhases.failed.name
policy.save()


def create_requirements(policy: Policy):
"""We're currently searching over *requirements* rather than policy text.
This is a big hack to keep the app functional as users upload new
documents. It creates a requirement for every doc node."""
Requirement.objects.filter(policy=policy).delete()
with connection.cursor() as cursor:
cursor.execute("""
INSERT INTO reqs_requirement
(policy_id, req_id, req_text, public,
-- blank text fields
policy_section, policy_sub_section, verb, impacted_entity,
req_deadline, citation, req_status, precedent, related_reqs,
omb_data_collection)
SELECT
document_docnode.policy_id,
-- construct a unique req_id
'pdf.' || CAST(document_docnode.id AS text),
document_docnode.text, -- req_text
true, -- public
-- blank text fields
'', '', '', '', '', '', '', '', '', ''
FROM document_docnode
WHERE document_docnode.policy_id = %s
AND document_docnode.text <> ''
""", [policy.pk])


class DocumentUploadForm(forms.ModelForm):
document_source = forms.FileField(
required=True, validators=[FileExtensionValidator(['pdf'])])

class Meta:
model = Policy
fields = ['document_source']

@transaction.atomic
def save(self, commit=True):
if commit:
with reversion.create_revision():
policy = super().save()
create_document(policy)
create_requirements(policy)
return policy
else:
return super().save(False)
7 changes: 7 additions & 0 deletions api/reqs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,13 @@ def requirements_with_topics(self):
def has_published_document(self):
return self.workflow_phase == WorkflowPhases.published.name

@property
def has_no_document(self):
return self.workflow_phase in (
WorkflowPhases.failed.name,
WorkflowPhases.no_doc.name,
)

def __str__(self):
text = self.title_with_number
if len(text) > 100:
Expand Down
23 changes: 23 additions & 0 deletions api/reqs/templates/reqs/admin/document_upload.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{% extends "admin/base_site.html" %}

{% block content %}
<div id="content-main">
<form enctype="multipart/form-data" method="POST">
{% csrf_token %}
<div>
{{ form.non_field_errors }}
<div class="fieldWrapper">
{{ form.document_source.errors }}
<label for="{{ form.document_source.id_for_label }}">Document:</label>
<input
accept=".pdf"
id="{{ form.document_source.id_for_label }}"
name="{{ form.document_source.html_name }}"
type="file"
/>
</div>
<input type="submit" value="Submit" />
</div>
</form>
</div>
{% endblock %}
52 changes: 51 additions & 1 deletion api/reqs/tests/admin_tests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from unittest.mock import Mock

from django.core.files.uploadedfile import SimpleUploadedFile
from model_mommy import mommy

from document.tree import DocCursor
from reqs import forms
from reqs.models import Policy


Expand All @@ -14,7 +18,25 @@ def test_policy_edit_doc_button(admin_client):
assert b'Save and edit document' in result.content


def test_policy_redirect(admin_client):
def test_policy_redirect_no_doc(admin_client):
policy = mommy.make(Policy, slug='some-policy')

result = admin_client.post(f'/admin/reqs/policy/{policy.pk}/change/', {
'title': policy.title,
'omb_policy_id': '',
'issuance': policy.issuance.isoformat(),
'sunset': '',
'public': 'on',
'workflow_phase': 'no_doc',
'_savethendoc': 'Save and edit document',
})

assert result.status_code == 302
assert result['Location'] == (f"/admin/reqs/policy/{policy.pk}/"
"document_upload")


def test_policy_redirect_to_editor(admin_client):
policy = mommy.make(Policy, slug='some-policy')
root = DocCursor.new_tree('root', policy=policy)
root.nested_set_renumber()
Expand All @@ -34,3 +56,31 @@ def test_policy_redirect(admin_client):

policy.refresh_from_db()
assert policy.title == 'Some new policy title'


def test_document_upload_404(admin_client):
result = admin_client.get(f'/admin/reqs/policy/010101/document_upload')
assert result.status_code == 404


def test_document_upload_get(admin_client):
policy = mommy.make(Policy)
result = admin_client.get(f'/admin/reqs/policy/{policy.pk}/'
'document_upload')
assert b'form' in result.content
assert b'accept=".pdf"' in result.content
assert b'document_source' in result.content


def test_document_upload_post(admin_client, monkeypatch):
monkeypatch.setattr(forms, 'create_document', Mock())

policy = mommy.make(Policy, slug='some-policy')
pdf = SimpleUploadedFile('a-file.pdf', b'contents')
result = admin_client.post(
f'/admin/reqs/policy/{policy.pk}/document_upload',
{'document_source': pdf},
)

assert result.status_code == 302
assert result['Location'] == '/admin/document-editor/some-policy'
Loading

0 comments on commit 2d8e21c

Please sign in to comment.