Merge pull request #1051 from 18F/simple-pdf-import

Add very rudimentary PDF import
18F · Mar 19, 2018 · 2d8e21c · 2d8e21c
2 parents 7fec670 + b292b7b
commit 2d8e21c
Show file tree

Hide file tree

Showing 12 changed files with 326 additions and 106 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -14,19 +14,6 @@ api: &API
   environment:
     DEBUG: "True"
     DATABASE_URL: postgres://postgres@localhost/postgres
-    VCAP_SERVICES: |
-      {"s3": [{
-         "credentials": {
-           "access_key_id": "LOCAL_ID",
-           "bucket": "pdfs",
-           "region": "irrelevant fake region",
-           "endpoint": "http://localhost:9100",
-           "secret_access_key": "LOCAL_KEY"
-         },
-         "label": "s3",
-         "name": "storage-s3"
-       }]
-      }
 admin-ui: &ADMIN_UI
   docker:
     - image: node:6

diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,7 @@ ui/lcov.info
 .next
 api/webpack-static/
 api/collected-static/
+api/media/
 
 # Data
 *.csv

diff --git a/api/omb_eregs/settings.py b/api/omb_eregs/settings.py
@@ -199,30 +199,28 @@
 ]
 
 # File storage
-
-DEFAULT_FILE_STORAGE = 'storages.backends.s3boto3.S3Boto3Storage'
-
 # https://django-storages.readthedocs.io/en/latest/backends/amazon-S3.html
-# In addition to using django-storages, we're maintaining parity between the
-# local docker-compose environment and the cloud.gov environments; see the
-# environment section of the prod-api service in docker-compose.yml for where
-# these values are coming from for local development.
 s3service = env.get_service(label="s3")
-AWS_ACCESS_KEY_ID = s3service.credentials.get("access_key_id")
-AWS_SECRET_ACCESS_KEY = s3service.credentials.get("secret_access_key")
-AWS_STORAGE_BUCKET_NAME = s3service.credentials.get("bucket")
-AWS_AUTO_CREATE_BUCKET = True
-if "endpoint" in s3service.credentials:
-    # For local development, we must override the endpoint, and region is
-    # irrelevant.
-    AWS_S3_ENDPOINT_URL = s3service.credentials["endpoint"]
+if s3service:
+    AWS_ACCESS_KEY_ID = s3service.credentials.get("access_key_id")
+    AWS_SECRET_ACCESS_KEY = s3service.credentials.get("secret_access_key")
+    AWS_STORAGE_BUCKET_NAME = s3service.credentials.get("bucket")
+    AWS_AUTO_CREATE_BUCKET = True
+    if "endpoint" in s3service.credentials:
+        # For local development, we must override the endpoint, and region is
+        # irrelevant.
+        AWS_S3_ENDPOINT_URL = s3service.credentials["endpoint"]
+    else:
+        # On cloud.gov, we need region and we want django-storages to infer the
+        # correct URL for us rather than setting an endpoint ourselves.
+        AWS_S3_REGION_NAME = s3service.credentials["region"]
+    AWS_S3_OBJECT_PARAMETERS = {
+        'ContentDisposition': 'attachment',    # Browsers should download files
+    }
+    DEFAULT_FILE_STORAGE = 'storages.backends.s3boto3.S3Boto3Storage'
 else:
-    # On cloud.gov, we need region and we want django-storages to infer the
-    # correct URL for us rather than setting an endpoint ourselves.
-    AWS_S3_REGION_NAME = s3service.credentials["region"]
-AWS_S3_OBJECT_PARAMETERS = {
-    'ContentDisposition': 'attachment',     # Browsers should download files
-}
+    MEDIA_ROOT = 'media'
+
 
 TAGGIT_CASE_INSENSITIVE = True
 

diff --git a/api/ombpdf/management/commands/migrate_documents.py b/api/ombpdf/management/commands/migrate_documents.py
@@ -53,6 +53,16 @@ def ensure_section_has_heading(doc: DocCursor) -> DocCursor:
 ]
 
 
+def migrate_doc(doc: DocCursor) -> DocCursor:
+    """Apply all transforms to a given document. Save it and return."""
+    for transform in transforms:
+        doc = transform(doc)
+    doc.nested_set_renumber(bulk_create=False)
+    for node in doc.walk():
+        node.save()
+    return doc
+
+
 class Command(BaseCommand):
     help = ( # noqa (overriding a builtin)
         "Run through (idempotent) document migrations to mass-fixup docs.")
@@ -63,9 +73,5 @@ def handle(self, *args, **kwargs):
             for root_docnode in roots:
                 with transaction.atomic():
                     doc = DocCursor.load_from_model(root_docnode)
-                    for transform in transforms:
-                        doc = transform(doc)
-                    doc.nested_set_renumber(bulk_create=False)
-                    for node in doc.walk():
-                        node.save()
+                    migrate_doc(doc)
                 pbar.update(1)
diff --git a/api/reqs/admin.py b/api/reqs/admin.py
@@ -1,30 +1,14 @@
-from django import forms
+from django.conf.urls import url
 from django.contrib import admin
-from django.core.exceptions import ValidationError
 from django.http import HttpResponseRedirect
+from django.shortcuts import get_object_or_404, render
 from django.urls import reverse
 
 from ereqs_admin.revision_admin import EReqsVersionAdmin
+from reqs.forms import DocumentUploadForm
 from reqs.models import Agency, AgencyGroup, Office, Policy, Requirement, Topic
 
 
-def is_extension_pdf(uploaded_file):
-    if (not uploaded_file.name.endswith('pdf')
-            or uploaded_file.content_type != 'application/pdf'):
-        raise ValidationError('The file must be a PDF.')
-
-
-# This form is temporarily no-longer used. We'll add it back with the pdf
-# upload workflow
-class PolicyForm(forms.ModelForm):
-    document_source = forms.FileField(required=False,
-                                      validators=[is_extension_pdf])
-
-    class Meta:
-        model = Policy
-        fields = '__all__'
-
-
 @admin.register(Policy)
 class PolicyAdmin(EReqsVersionAdmin):
     actions = None
@@ -42,14 +26,47 @@ class PolicyAdmin(EReqsVersionAdmin):
     ]
 
     def response_post_save_change(self, request, obj):
-        """Redirect to the document editor, if that's the button the user
-        clicked."""
-        if '_savethendoc' in request.POST:
-            policy_id = obj.omb_policy_id or obj.slug
+        """Redirect to the document import or editor, if that's the button the
+        user clicked."""
+        policy_id = obj.omb_policy_id or obj.slug
+        if '_savethendoc' in request.POST and obj.has_no_document:
+            return HttpResponseRedirect(reverse(
+                'admin:document_upload', kwargs={'pk': obj.pk}))
+        elif '_savethendoc' in request.POST:
             return HttpResponseRedirect(
                 reverse('document_editor', kwargs={'policy_id': policy_id}))
         return super().response_post_save_change(request, obj)
 
+    def get_urls(self):
+        """Add the document upload view"""
+        urls = super().get_urls()
+        return [
+            url('^(?P<pk>\d+)/document_upload$',
+                self.admin_site.admin_view(self.document_upload),
+                name='document_upload'),
+        ] + urls
+
+    def document_upload(self, request, pk):
+        """Handler for uploading new documents."""
+        policy = get_object_or_404(Policy, pk=pk)
+        if request.method == 'POST':    # submitted form
+            form = DocumentUploadForm(request.POST, request.FILES,
+                                      instance=policy)
+            if form.is_valid():
+                form.save()
+                return HttpResponseRedirect(reverse(
+                    'document_editor',
+                    kwargs={'policy_id': policy.omb_policy_id or policy.slug},
+                ))
+        else:
+            form = DocumentUploadForm(instance=policy)
+        context = dict(
+            self.admin_site.each_context(request),
+            form=form,
+            title='Document Upload',
+        )
+        return render(request, 'reqs/admin/document_upload.html', context)
+
 
 @admin.register(Topic)
 class TopicAdmin(EReqsVersionAdmin):

diff --git a/api/reqs/forms.py b/api/reqs/forms.py
@@ -0,0 +1,77 @@
+import logging
+
+import reversion
+from django import forms
+from django.core.validators import FileExtensionValidator
+from django.db import connection, transaction
+
+from document.models import DocNode
+from ombpdf.document import OMBDocument
+from ombpdf.management.commands.migrate_documents import migrate_doc
+from ombpdf.semdb import to_db
+from reqs.models import Policy, Requirement, WorkflowPhases
+
+logger = logging.getLogger(__name__)
+
+
+def create_document(policy: Policy):
+    """Attempt to parse the PDF associated with this policy, applying document
+    migrations for misparses."""
+    try:
+        DocNode.objects.filter(policy=policy).delete()
+        parsed_pdf = OMBDocument.from_file(policy.document_source.file)
+        cursor = to_db(parsed_pdf, policy)
+        policy.workflow_phase = WorkflowPhases.cleanup.name
+        policy.save()
+        migrate_doc(cursor)
+    except:     # noqa - don't know the exceptions we'll raise
+        logger.exception('Error loading document')
+        policy.workflow_phase = WorkflowPhases.failed.name
+        policy.save()
+
+
+def create_requirements(policy: Policy):
+    """We're currently searching over *requirements* rather than policy text.
+    This is a big hack to keep the app functional as users upload new
+    documents. It creates a requirement for every doc node."""
+    Requirement.objects.filter(policy=policy).delete()
+    with connection.cursor() as cursor:
+        cursor.execute("""
+            INSERT INTO reqs_requirement
+                (policy_id, req_id, req_text, public,
+                 -- blank text fields
+                 policy_section, policy_sub_section, verb, impacted_entity,
+                 req_deadline, citation, req_status, precedent, related_reqs,
+                 omb_data_collection)
+            SELECT
+                document_docnode.policy_id,
+                 -- construct a unique req_id
+                 'pdf.' || CAST(document_docnode.id AS text),
+                 document_docnode.text, -- req_text
+                 true, -- public
+                 -- blank text fields
+                 '', '', '', '', '', '', '', '', '', ''
+            FROM document_docnode
+            WHERE document_docnode.policy_id = %s
+            AND document_docnode.text <> ''
+        """, [policy.pk])
+
+
+class DocumentUploadForm(forms.ModelForm):
+    document_source = forms.FileField(
+        required=True, validators=[FileExtensionValidator(['pdf'])])
+
+    class Meta:
+        model = Policy
+        fields = ['document_source']
+
+    @transaction.atomic
+    def save(self, commit=True):
+        if commit:
+            with reversion.create_revision():
+                policy = super().save()
+                create_document(policy)
+                create_requirements(policy)
+                return policy
+        else:
+            return super().save(False)
diff --git a/api/reqs/models.py b/api/reqs/models.py
@@ -158,6 +158,13 @@ def requirements_with_topics(self):
     def has_published_document(self):
         return self.workflow_phase == WorkflowPhases.published.name
 
+    @property
+    def has_no_document(self):
+        return self.workflow_phase in (
+            WorkflowPhases.failed.name,
+            WorkflowPhases.no_doc.name,
+        )
+
     def __str__(self):
         text = self.title_with_number
         if len(text) > 100:

diff --git a/api/reqs/templates/reqs/admin/document_upload.html b/api/reqs/templates/reqs/admin/document_upload.html
@@ -0,0 +1,23 @@
+{% extends "admin/base_site.html" %}
+
+{% block content %}
+<div id="content-main">
+  <form enctype="multipart/form-data" method="POST">
+    {% csrf_token %}
+    <div>
+      {{ form.non_field_errors }}
+      <div class="fieldWrapper">
+        {{ form.document_source.errors }}
+        <label for="{{ form.document_source.id_for_label }}">Document:</label>
+        <input
+          accept=".pdf"
+          id="{{ form.document_source.id_for_label }}"
+          name="{{ form.document_source.html_name }}"
+          type="file"
+        />
+      </div>
+      <input type="submit" value="Submit" />
+    </div>
+  </form>
+</div>
+{% endblock %}
diff --git a/api/reqs/tests/admin_tests.py b/api/reqs/tests/admin_tests.py
@@ -1,6 +1,10 @@
+from unittest.mock import Mock
+
+from django.core.files.uploadedfile import SimpleUploadedFile
 from model_mommy import mommy
 
 from document.tree import DocCursor
+from reqs import forms
 from reqs.models import Policy
 
 
@@ -14,7 +18,25 @@ def test_policy_edit_doc_button(admin_client):
     assert b'Save and edit document' in result.content
 
 
-def test_policy_redirect(admin_client):
+def test_policy_redirect_no_doc(admin_client):
+    policy = mommy.make(Policy, slug='some-policy')
+
+    result = admin_client.post(f'/admin/reqs/policy/{policy.pk}/change/', {
+       'title': policy.title,
+       'omb_policy_id': '',
+       'issuance': policy.issuance.isoformat(),
+       'sunset': '',
+       'public': 'on',
+       'workflow_phase': 'no_doc',
+       '_savethendoc': 'Save and edit document',
+    })
+
+    assert result.status_code == 302
+    assert result['Location'] == (f"/admin/reqs/policy/{policy.pk}/"
+                                  "document_upload")
+
+
+def test_policy_redirect_to_editor(admin_client):
     policy = mommy.make(Policy, slug='some-policy')
     root = DocCursor.new_tree('root', policy=policy)
     root.nested_set_renumber()
@@ -34,3 +56,31 @@ def test_policy_redirect(admin_client):
 
     policy.refresh_from_db()
     assert policy.title == 'Some new policy title'
+
+
+def test_document_upload_404(admin_client):
+    result = admin_client.get(f'/admin/reqs/policy/010101/document_upload')
+    assert result.status_code == 404
+
+
+def test_document_upload_get(admin_client):
+    policy = mommy.make(Policy)
+    result = admin_client.get(f'/admin/reqs/policy/{policy.pk}/'
+                              'document_upload')
+    assert b'form' in result.content
+    assert b'accept=".pdf"' in result.content
+    assert b'document_source' in result.content
+
+
+def test_document_upload_post(admin_client, monkeypatch):
+    monkeypatch.setattr(forms, 'create_document', Mock())
+
+    policy = mommy.make(Policy, slug='some-policy')
+    pdf = SimpleUploadedFile('a-file.pdf', b'contents')
+    result = admin_client.post(
+        f'/admin/reqs/policy/{policy.pk}/document_upload',
+        {'document_source': pdf},
+    )
+
+    assert result.status_code == 302
+    assert result['Location'] == '/admin/document-editor/some-policy'