okfn-brasil · rodolfolottin · Feb 3, 2018 · Feb 19, 2018 · Feb 19, 2018 · Feb 24, 2018
diff --git a/Dockerfile b/Dockerfile
@@ -1,22 +1,12 @@
-FROM python:3.6.3-alpine
+FROM continuumio/anaconda3
 
-RUN apk add --no-cache --virtual build-base \
-  && apk add --no-cache --virtual libxml2-dev \
-  && apk add --no-cache --virtual libxslt-dev \
-  && mkdir -p /usr/include/libxml \
-  && ln -s /usr/include/libxml2/libxml/xmlexports.h /usr/include/libxml/xmlexports.h \
-  && ln -s /usr/include/libxml2/libxml/xmlversion.h /usr/include/libxml/xmlversion.h
-
-# RUN mkdir rosie
-# COPY rosie/config.ini.example ./config.ini
-# COPY rosie/requirements.txt ./rosie
-# RUN pip install -r rosie/requirements.txt
+# Refresh the package index in the same layer as the install; otherwise the
+# install depends on whatever (possibly stale or absent) index the base image
+# happens to ship. The index is removed afterwards to keep the layer small.
+RUN apt-get update \
+  && apt-get install -y libmagickwand-dev \
+  && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /usr/src/app
 COPY requirements.txt ./
 RUN pip install -r requirements.txt
 
-RUN adduser -S serenata_de_amor
+RUN adduser --system serenata_de_amor
 RUN chown -hR serenata_de_amor .
 USER serenata_de_amor
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,10 @@
 amqp==2.2.2
 celery==4.1.0
 ipdb==0.10.3
+opencv-python==3.4.0.12
 pandas==0.21.0
 pymongo==3.5.1
 python-twitter==3.3
 requests==2.18.4
 serenata-toolbox==12.2.2
+wand==0.4.4
diff --git a/tests/fixtures/10.pdf b/tests/fixtures/10.pdf
diff --git a/tests/fixtures/10.png b/tests/fixtures/10.png
diff --git a/tests/targets/test_twitter.py b/tests/targets/test_twitter.py
@@ -1,8 +1,10 @@
 import datetime
+from io import BytesIO, BufferedReader
 from unittest import TestCase, mock
 
 import pandas as pd
 from twitter import TwitterError
+import urllib.error
 
 from whistleblower.targets.twitter import Post, Twitter
 
@@ -108,6 +110,8 @@ def setUp(self):
         self.reimbursement = {
             'congressperson_name': 'Eduardo Cunha',
             'document_id': 10,
+            'applicant_id': 10,
+            'year': 2015,
             'state': 'RJ',
             'twitter_profile': 'DepEduardoCunha',
         }
@@ -117,19 +121,48 @@ def setUp(self):
 
     def test_publish(self):
+        # publish() must forward the text and image produced by tweet_data()
+        # to the Twitter API and then store the post in the database.
+        # NOTE(review): no urlopen patch is visible in this test, so
+        # tweet_data() -> tweet_image() may attempt a real download; confirm
+        # whether setUp isolates the network.
         self.subject.publish()
-        self.api.PostUpdate.assert_called_once_with(self.subject.text())
+        text, reimbursement_image = self.subject.tweet_data()
+        self.api.PostUpdate.assert_called_once_with(
+            media=reimbursement_image, status=text)
         dict_representation = dict(self.subject)
         self.database.posts.insert_one.assert_called_once_with(
             dict_representation)
 
-    def test_text(self):
+    def test_tweet_data(self):
+        # tweet_data() returns a (text, image) pair and raises ValueError when
+        # the reimbursement has no Twitter profile.
+        # NOTE(review): the expected image is None, which requires the
+        # download inside tweet_image() to fail; no urlopen patch is visible
+        # here, so the outcome may depend on the network - confirm.
         message = (
             '🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). '
             'Você pode me ajudar a verificar? '
             'https://jarbas.serenata.ai/layers/#/documentId/10 '
             '#SerenataDeAmor na @CamaraDeputados'
         )
-        self.assertEqual(message, self.subject.text())
+        self.assertEqual(
+            (message, None), self.subject.tweet_data())
         self.reimbursement['twitter_profile'] = None
         with self.assertRaises(ValueError):
-            self.subject.text()
+            self.subject.tweet_data()
+
+    def test_tweet_text(self):
+        message = (
+            '🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). '
+            'Você pode me ajudar a verificar? '
+            'https://jarbas.serenata.ai/layers/#/documentId/10 '
+            '#SerenataDeAmor na @CamaraDeputados'
+        )
+        self.assertEqual(message, self.subject.tweet_text())
+
+    def test_camara_image_url(self):
+        url = 'http://www.camara.gov.br/cota-parlamentar/documentos/publ/10/2015/10.pdf'
+        self.assertEqual(url, self.subject.camara_image_url())
+
+    @mock.patch('whistleblower.targets.twitter.urllib.request.urlopen')
+    def test_tweet_image_success(self, urlopen_mock):
+        with open('tests/fixtures/10.pdf', 'rb') as mock_response:
+            urlopen_mock.return_value = BytesIO(mock_response.read())
+        self.assertIsInstance(self.subject.tweet_image(), BufferedReader)
+
+    @mock.patch('whistleblower.targets.twitter.urllib.request.urlopen')
+    def test_tweet_image_error(self, urlopen_mock):
+        urlopen_mock.side_effect = urllib.error.HTTPError(
+            url='mock_url', code=404, msg='Not Found',
+            hdrs='mock_headers', fp=None)
+        self.assertIsNone(self.subject.tweet_image())
diff --git a/whistleblower/helpers/crop.py b/whistleblower/helpers/crop.py
@@ -0,0 +1,92 @@
+import sys
+
+import cv2
+import numpy
+
+# Size a connected component must exceed for crop() to keep it; crop() scales
+# both values by the image size relative to DEFAULT_WIDTH / DEFAULT_HEIGHT.
+TEXT_MIN_WIDTH = 35
+TEXT_MIN_HEIGHT = 10
+
+# Reference image size (in pixels) against which crop() scales the minimums.
+DEFAULT_WIDTH  = 850
+DEFAULT_HEIGHT = 1100
+
+# Size handed to cv2.getStructuringElement for the erosion kernel in crop().
+KERNEL_WIDTH  = 25
+KERNEL_HEIGHT = 15
+
+
+def remove_borders(image, threshold, max_width, max_height):
+    height, width = image.shape[:2]
+
+    for i in range(max_width):
+        total = image[:, i].sum() / 255
+        if total > threshold:
+            image[:, i] = numpy.ones(height) * 255
+
+        total = image[:, width - i - 1].sum() / 255
+        if total > threshold:
+            image[:, i - 1] = numpy.ones(height) * 255
+
+    for i in range(max_height):
+        total = image[i, :].sum() / 255
+        if total > threshold:
+            image[i, :] = numpy.ones(width) * 255
+
+        total = image[height - i - 1, :].sum()
+        if total > threshold:
+            image[height - i - 1, :] = numpy.ones(width) * 255
+
+    return image
+
+
+def crop(numpy_array, filename):
+    image = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR)
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+
+    gray = remove_borders(gray, 0.8, 15, 15)
+
+    adjusted_width  = image.shape[1] / DEFAULT_WIDTH
+    adjusted_height = image.shape[0] / DEFAULT_HEIGHT
+
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (KERNEL_WIDTH, KERNEL_HEIGHT))
+    eroded = cv2.erode(gray, kernel)
+
+    _, bw = cv2.threshold(eroded, 127, 255, cv2.THRESH_BINARY_INV)
+
+    total, markers = cv2.connectedComponents(bw)
+
+    images = [numpy.uint8(markers==i) * 255 for i in range(total) if numpy.uint8(markers==i).sum() > 10]
+
+    rectangles = []
+
+    for label in images:
+        countours = cv2.findContours(label, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+
+        (x,y,w,h) = cv2.boundingRect(countours[0])
+
+        rectangles.append((x, y, w, h, label.sum() / 255.0))
+
+    rectangles = sorted(rectangles, key=lambda x:x[4], reverse=True)
+
+    rectangles = rectangles[1:]
+
+    expanded = [sys.maxsize, sys.maxsize, -sys.maxsize, -sys.maxsize]
+
+    for rect in rectangles:
+
+        x0, y0, w0, h0 = expanded
+        x1, y1, w1, h1, _ = rect
+
+        if w1 <= (TEXT_MIN_WIDTH * adjusted_width):
+            continue
+
+        if h1 <= (TEXT_MIN_HEIGHT * adjusted_height):
+            continue
+
+        x = min(x0, x1)
+        y = min(y0, y1)
+
+        w = max(x0 + w0, x1 + w1) - x
+        h = max(y0 + h0, y1 + h1) - y
+
+        expanded = [x, y, w, h]
+
+    cv2.imwrite(filename, image[y:y+h, x:x+w])
diff --git a/whistleblower/targets/twitter.py b/whistleblower/targets/twitter.py
@@ -2,14 +2,18 @@
 import logging
 import os
 import re
+from tempfile import NamedTemporaryFile
 import urllib.request
+import urllib.error
 
 import numpy as np
 import pandas as pd
 from pymongo import MongoClient
 import twitter
+from wand.image import Image
 
 from whistleblower.suspicions import Suspicions
+from whistleblower.helpers.crop import crop
 
 ACCESS_TOKEN_KEY = os.environ['TWITTER_ACCESS_TOKEN_KEY']
 ACCESS_TOKEN_SECRET = os.environ['TWITTER_ACCESS_TOKEN_SECRET']
@@ -138,27 +142,73 @@ def __iter__(self):
         yield 'text', self.status.text
         yield 'document_id', self.reimbursement['document_id']
 
-    def text(self):
+    def tweet_data(self):
         """
-        Proper tweet message for the given reimbursement.
+        Proper tweet data for the given reimbursement.
+
+        Returns a tuple with the result of tweet_text() and the result of
+        tweet_image(), in that order.
+
+        Raises ValueError when the reimbursement has no (truthy)
+        'twitter_profile'.
         """
         profile = self.reimbursement['twitter_profile']
         if profile:
-            link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format(
-                self.reimbursement['document_id'])
-            message = (
-                '🚨Gasto suspeito de Dep. @{} ({}). '
-                'Você pode me ajudar a verificar? '
-                '{} #SerenataDeAmor na @CamaraDeputados'
-            ).format(profile, self.reimbursement['state'], link)
-            return message
+            return self.tweet_text(), self.tweet_image()
         else:
             raise ValueError(
                 'Congressperson does not have a registered Twitter account.')
+
+    def tweet_text(self):
+        link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format(
+            self.reimbursement['document_id'])
+        message = (
+            '🚨Gasto suspeito de Dep. @{} ({}). '
+            'Você pode me ajudar a verificar? '
+            '{} #SerenataDeAmor na @CamaraDeputados'
+        ).format(
+            self.reimbursement['twitter_profile'],
+            self.reimbursement['state'],
+            link
+        )
+        return message
+
+    def camara_image_url(self):
+        """
+        Proper image url for the given reimbursement.
+        """
+        url = (
+            'http://www.camara.gov.br/cota-parlamentar/documentos/publ/'
+            '{}/{}/{}.pdf'.format(
+                self.reimbursement['applicant_id'],
+                self.reimbursement['year'],
+                self.reimbursement['document_id'])
+        )
+
+        return url
+
+    def tweet_image(self):
+        """
+        Download, crop and open the image for the given reimbursement.
+        """
+        try:
+            response = urllib.request.urlopen(self.camara_image_url())
+        except urllib.error.HTTPError:
+            return None
+
+        image_bin = Image(file=response).make_blob('png')
+        numpy_array = np.frombuffer(image_bin, np.uint8)
+
+        with NamedTemporaryFile(suffix='.png') as temp:
+            crop(numpy_array, temp.name)
+
+            with open(temp.name, 'rb') as cropped_file:
+                cropped_image = cropped_file
+
+        return cropped_image
+
     def publish(self):
         """
         Post the update to Twitter's timeline.
+
+        The text and media come from tweet_data(); the resulting status is
+        kept in self.status and the post is stored in the database.
         """
-        self.status = self.api.PostUpdate(self.text())
+        text, reimbursement_image = self.tweet_data()
+
+        # NOTE(review): reimbursement_image may be a file object or None; it
+        # is not closed here - confirm who owns it.
+        self.status = self.api.PostUpdate(
+            status=text,
+            media=reimbursement_image)
         self.database.posts.insert_one(dict(self))