Revamp document hashing

This PS revamps document hashing. Instead of relying on Python's
built-in hash function to hash the contents of a document (i.e. its
metadata and data values), sha256 from hashlib is now used, primarily
for security reasons: the built-in hash is neither stable across
processes nor collision-resistant.
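
For illustration, a minimal sketch of the new approach, mirroring the
_make_hash helper added in this PS (stdlib json stands in here for the
oslo_serialization jsonutils used in the actual change):

    import hashlib
    import json

    # Python's built-in hash() is salted per process (PYTHONHASHSEED),
    # so equal documents can hash differently across API workers, and
    # it is not collision-resistant. sha256 over canonically serialized
    # JSON is deterministic and cryptographically strong.
    def make_hash(data):
        # sort_keys=True fixes key order so logically equal dicts
        # always produce the same hex digest.
        return hashlib.sha256(
            json.dumps(data, sort_keys=True).encode('utf-8')).hexdigest()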

Further, two new columns have been added to the document DB model,
data_hash and metadata_hash, and the old hash column has been dropped.
The column type for storing hashes has been changed from BigInteger to
String, since a sha256 hex digest is fixed-width text.
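
With separate digests, a document is unchanged only when both match; a
minimal sketch of the comparison performed during document creation
(illustrative only; the real check lives in _documents_create):

    def is_unchanged(existing_document, values):
        # A change to either data or metadata produces a new document
        # row; an unchanged document keeps its original revision_id.
        return (existing_document['data_hash'] == values['data_hash'] and
                existing_document['metadata_hash'] ==
                values['metadata_hash'])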

Finally, testing documentation was added.

Change-Id: I428ddcbce1007ea990ca0df1aa630072a050c722
Felipe Monteiro 2017-09-26 01:08:05 +01:00
parent 81b3e42013
commit 8bf4f7407d
13 changed files with 168 additions and 58 deletions

View File

@ -18,6 +18,7 @@
import ast
import copy
import functools
import hashlib
import threading
from oslo_config import cfg
@ -25,6 +26,7 @@ from oslo_db import exception as db_exception
from oslo_db import options
from oslo_db.sqlalchemy import session
from oslo_log import log as logging
from oslo_serialization import jsonutils as json
import six
import sqlalchemy.orm as sa_orm
@ -136,7 +138,8 @@ def documents_create(bucket_name, documents, session=None):
doc['name'] = d[1]
doc['data'] = {}
doc['_metadata'] = {}
doc['hash'] = utils.make_hash({})
doc['data_hash'] = _make_hash({})
doc['metadata_hash'] = _make_hash({})
doc['bucket_id'] = bucket['id']
doc['revision_id'] = revision['id']
@ -178,19 +181,12 @@ def _documents_create(bucket_name, values_list, session=None):
for values in values_list:
values['_metadata'] = values.pop('metadata')
values['name'] = values['_metadata']['name']
# Hash the combination of the document's metadata and data to later
# efficiently check whether those data have changed.
dict_to_hash = values['_metadata'].copy()
dict_to_hash.update(values['data'])
values['hash'] = utils.make_hash(dict_to_hash)
values['is_secret'] = 'secret' in values['data']
# Hash the combination of the document's metadata and data to later
# efficiently check whether those data have changed.
dict_to_hash = values['_metadata'].copy()
dict_to_hash.update(values['data'])
values['hash'] = utils.make_hash(dict_to_hash)
# Hash the document's metadata and data to later efficiently check
# whether those data have changed.
values['data_hash'] = _make_hash(values['data'])
values['metadata_hash'] = _make_hash(values['_metadata'])
try:
existing_document = document_get(
@ -211,7 +207,8 @@ def _documents_create(bucket_name, values_list, session=None):
name=existing_document['name'],
bucket=existing_document['bucket_name'])
if existing_document['hash'] == values['hash']:
if (existing_document['data_hash'] == values['data_hash'] and
existing_document['metadata_hash'] == values['metadata_hash']):
# Since the document has not changed, reference the original
# revision in which it was created. This is necessary so that
# the correct revision history is maintained.
@ -231,6 +228,11 @@ def _documents_create(bucket_name, values_list, session=None):
return changed_documents
def _make_hash(data):
return hashlib.sha256(
json.dumps(data, sort_keys=True).encode('utf-8')).hexdigest()
def document_get(session=None, raw_dict=False, **filters):
"""Retrieve a document from the DB.
@ -482,6 +484,7 @@ def _filter_revision_documents(documents, unique_only, **filters):
if unique_key not in filtered_documents:
filtered_documents[unique_key] = document
# TODO(fmontei): Sort by user-specified parameter.
return sorted(filtered_documents.values(), key=lambda d: d['created_at'])
@ -586,8 +589,8 @@ def revision_diff(revision_id, comparison_revision_id):
def _compare_buckets(b1, b2):
# Checks whether buckets' documents are identical.
return (sorted([d['hash'] for d in b1]) ==
sorted([d['hash'] for d in b2]))
return (sorted([(d['data_hash'], d['metadata_hash']) for d in b1]) ==
sorted([(d['data_hash'], d['metadata_hash']) for d in b2]))
# If the list of documents for each bucket is identical, then the result
# is "unmodified", else "modified".
@ -753,7 +756,9 @@ def revision_rollback(revision_id, session=None):
latest_revision = session.query(models.Revision)\
.order_by(models.Revision.created_at.desc())\
.first()
latest_revision_hashes = [d['hash'] for d in latest_revision['documents']]
latest_revision_hashes = [
(d['data_hash'], d['metadata_hash'])
for d in latest_revision['documents']]
# If the rollback revision is the same as the latest revision, then there's
# no point in rolling back.
@ -767,12 +772,13 @@ def revision_rollback(revision_id, session=None):
# it has changed, else False.
doc_diff = {}
for orig_doc in orig_revision['documents']:
if orig_doc['hash'] not in latest_revision_hashes:
if ((orig_doc['data_hash'], orig_doc['metadata_hash'])
not in latest_revision_hashes):
doc_diff[orig_doc['id']] = True
else:
doc_diff[orig_doc['id']] = False
# If no changges have been made between the target revision to rollback to
# If no changes have been made between the target revision to rollback to
# and the latest revision, raise an exception.
if set(doc_diff.values()) == set([False]):
raise errors.InvalidRollback(revision_id=revision_id)
@ -789,8 +795,8 @@ def revision_rollback(revision_id, session=None):
new_document = models.Document()
new_document.update({x: orig_document[x] for x in (
'name', '_metadata', 'data', 'hash', 'schema', 'bucket_id')})
'name', '_metadata', 'data', 'data_hash', 'metadata_hash',
'schema', 'bucket_id')})
new_document['revision_id'] = new_revision['id']
# If the document has changed, then use the revision_id of the new

View File

@ -15,7 +15,6 @@
from oslo_db.sqlalchemy import models
from oslo_db.sqlalchemy import types as oslo_types
from oslo_utils import timeutils
from sqlalchemy import BigInteger
from sqlalchemy import Boolean
from sqlalchemy import Column
from sqlalchemy import DateTime
@ -141,7 +140,8 @@ class Document(BASE, DeckhandBase):
# "metadata" is reserved, so use "_metadata" instead.
_metadata = Column(oslo_types.JsonEncodedDict(), nullable=False)
data = Column(oslo_types.JsonEncodedDict(), nullable=True)
hash = Column(BigInteger, nullable=False)
data_hash = Column(String, nullable=False)
metadata_hash = Column(String, nullable=False)
is_secret = Column(Boolean, nullable=False, default=False)
bucket_id = Column(Integer, ForeignKey('buckets.id', ondelete='CASCADE'),
nullable=False)

View File

@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import fixtures
import mock
from oslo_config import cfg
@ -76,6 +78,8 @@ class DeckhandWithDBTestCase(DeckhandTestCase):
def setUp(self):
super(DeckhandWithDBTestCase, self).setUp()
self.override_config('connection', "sqlite://", group='database')
self.override_config(
'connection', os.environ.get('PIFPAF_URL', 'sqlite://'),
group='database')
db_api.setup_db()
self.addCleanup(db_api.drop_db)

View File

@ -20,8 +20,8 @@ from deckhand.tests.unit import base
BASE_EXPECTED_FIELDS = ("created_at", "updated_at", "deleted_at", "deleted")
DOCUMENT_EXPECTED_FIELDS = BASE_EXPECTED_FIELDS + (
"id", "schema", "name", "metadata", "data", "hash", "revision_id",
"bucket_id")
"id", "schema", "name", "metadata", "data", "data_hash", "metadata_hash",
"revision_id", "bucket_id")
REVISION_EXPECTED_FIELDS = ("id", "documents", "tags")

View File

@ -44,7 +44,7 @@ class TestDocumentsNegative(base.TestDbBase):
def test_delete_document_invalid_id(self):
self.assertRaises(errors.DocumentNotFound,
self.show_document,
id=test_utils.rand_uuid_hex())
id=-1)
def test_create_bucket_conflict(self):
# Create the document in one bucket.

View File

@ -14,7 +14,6 @@
from deckhand.db.sqlalchemy import api as db_api
from deckhand import errors
from deckhand.tests import test_utils
from deckhand.tests.unit.db import base
@ -22,25 +21,20 @@ class TestRevisionTagsNegative(base.TestDbBase):
def test_create_tag_revision_not_found(self):
self.assertRaises(
errors.RevisionNotFound, db_api.revision_tag_create,
test_utils.rand_uuid_hex())
errors.RevisionNotFound, db_api.revision_tag_create, -1)
def test_show_tag_revision_not_found(self):
self.assertRaises(
errors.RevisionNotFound, db_api.revision_tag_get,
test_utils.rand_uuid_hex())
errors.RevisionNotFound, db_api.revision_tag_get, -1)
def test_delete_tag_revision_not_found(self):
self.assertRaises(
errors.RevisionNotFound, db_api.revision_tag_delete,
test_utils.rand_uuid_hex())
errors.RevisionNotFound, db_api.revision_tag_delete, -1)
def test_list_tags_revision_not_found(self):
self.assertRaises(
errors.RevisionNotFound, db_api.revision_tag_get_all,
test_utils.rand_uuid_hex())
errors.RevisionNotFound, db_api.revision_tag_get_all, -1)
def test_delete_all_tags_revision_not_found(self):
self.assertRaises(
errors.RevisionNotFound, db_api.revision_tag_delete_all,
test_utils.rand_uuid_hex())
errors.RevisionNotFound, db_api.revision_tag_delete_all, -1)

View File

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import re
import string
@ -61,21 +60,3 @@ def multi_getattr(multi_key, dict_data):
data = data.get(attr)
return data
def make_hash(o):
"""Makes a hash from a dictionary, list, tuple or set to any level, that
contains only other hashable types (including any lists, tuples, sets, and
dictionaries).
"""
if isinstance(o, (set, tuple, list)):
return tuple([make_hash(e) for e in o])
elif not isinstance(o, dict):
return hash(o)
new_o = copy.deepcopy(o)
for k, v in new_o.items():
new_o[k] = make_hash(v)
return hash(tuple(frozenset(sorted(new_o.items()))))

View File

@ -674,7 +674,7 @@ Supported query string parameters:
`metadata.label=key=value`. Repeating this parameter indicates all
requested labels must apply (AND not OR).
* `sort` - string, optional, repeatable - Defines the sort order for returning
results. Default is `metadata.name`. Repeating this parameter indicates use
results. Default is by creation date. Repeating this parameter indicates use
of multi-column sort with the most significant sorting column applied first.
* `status.bucket` - string, optional, repeatable - Used to select documents
only from a particular bucket. Repeating this parameter indicates documents

View File

@ -36,6 +36,7 @@ consumption by other UCP services.
:maxdepth: 2
HACKING
testing
.. toctree::
:maxdepth: 1

doc/source/testing.rst Normal file
View File

@ -0,0 +1,97 @@
..
Copyright 2017 AT&T Intellectual Property. All other rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=======
Testing
=======
Unit testing
============
Unit testing currently uses an in-memory sqlite database. Since Deckhand's
primary function is to serve as the back-end storage for UCP, the majority
of unit tests perform actual database operations. Mocking is used sparingly
because Deckhand is a fairly insular application that lives at the bottom
of a very deep stack and communicates only with Keystone and Barbican.
As such, validating database operations is paramount to correctly testing
Deckhand.
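As a hedged illustration of that pattern, the sketch below checks the
``_make_hash`` helper through the real test base class added in this
change; this particular test and its payload are invented, not part of
the PS::

    from deckhand.db.sqlalchemy import api as db_api
    from deckhand.tests.unit import base


    class TestDocumentHashing(base.DeckhandWithDBTestCase):

        def test_make_hash_is_deterministic(self):
            payload = {'metadata': {'name': 'doc-1'}, 'data': {'a': 1}}
            # sha256 over sorted-key JSON: equal dicts, equal digests.
            self.assertEqual(db_api._make_hash(payload),
                             db_api._make_hash(payload))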
To run unit tests using sqlite, execute::
$ tox -epy27
$ tox -epy35
against a py27- or py35-backed environment, respectively. To run individual
unit tests, run::
$ tox -e py27 -- deckhand.tests.unit.db.test_revisions
for example.
To run unit tests using postgresql, execute::
$ tox -epy27-postgresql
$ tox -epy35-postgresql
against a py27- or py35-backed environment, respectively. Individual unit tests
can be executed the same way as above.
`pifpaf <https://github.com/jd/pifpaf>`_ is used to spin up a temporary
postgresql database. The connection URL is exported to the tests via the
``PIFPAF_URL`` environment variable.
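The unit test base class reads this variable when configuring oslo.db,
as shown in the diff above; a minimal sketch of the lookup::

    import os

    # Prefer pifpaf's temporary postgresql; fall back to in-memory
    # sqlite when no pifpaf-managed database is running.
    db_url = os.environ.get('PIFPAF_URL', 'sqlite://')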
.. warning::
It is **not** recommended to run postgresql-backed unit tests concurrently;
run them serially instead. To guarantee true test isolation, the DB tables
are re-created for each test run, but only one postgresql instance is
shared across all threads, so a concurrency greater than 1 causes major
conflicts.
Functional testing
==================
Prerequisites
-------------
Deckhand requires Docker to run its functional tests. A basic installation
guide for Docker for Ubuntu can be found
`here <https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/>`_.
Overview
--------
Deckhand uses `gabbi <https://github.com/cdent/gabbi>`_ as its functional
testing framework. Functional tests can be executed via::
$ tox -e functional
You can also run a subset of tests via a regex::
$ tox -e functional -- gabbi.suitemaker.test_gabbi_document-crud-success-multi-bucket
The command executes ``tools/functional-tests.sh``, which:
1) Launches Postgresql inside a Docker container.
2) Sets up a basic Deckhand configuration file that uses Postgresql
in its ``oslo_db`` connection string.
3) Sets up a custom policy file with very liberal permissions so that
gabbi can talk to Deckhand without having to authenticate against
Keystone and pass an admin token to Deckhand.
4) Instantiates Deckhand via ``uwsgi``.
5) Calls gabbi, which runs a battery of functional tests (see the
sketch after this list).
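As a hedged sketch of step 5, gabbi suites are typically wired into the
Python test loader from YAML files; the directory name and port below
are assumptions, not values taken from this PS::

    import os

    from gabbi import driver

    TESTS_DIR = 'gabbits'  # hypothetical directory of YAML test files


    def load_tests(loader, tests, pattern):
        # Build one test case per YAML document, targeting the
        # uwsgi-hosted Deckhand instance the script starts.
        test_dir = os.path.join(os.path.dirname(__file__), TESTS_DIR)
        return driver.build_tests(test_dir, loader,
                                  host='localhost', port=9000)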
At this time, there are no functional tests for policy enforcement
verification. Negative tests will be added at a later date to confirm that
a 403 Forbidden is returned by each endpoint that enforces policy when the
required permissions are absent.

View File

@ -17,3 +17,4 @@ bandit>=1.1.0 # Apache-2.0
sphinx>=1.6.2 # BSD
gabbi==1.35.1
sphinx_rtd_theme==0.2.4
pifpaf==0.10.0

tools/pretty_tox.sh Executable file
View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -o pipefail
TESTRARGS=$1
# --until-failure is not compatible with --subunit see:
#
# https://bugs.launchpad.net/testrepository/+bug/1411804
#
# this work around exists until that is addressed
if [[ "$TESTARGS" =~ "until-failure" ]]; then
python setup.py testr --slowest --testr-args="$TESTRARGS"
else
python setup.py testr --slowest --testr-args="--subunit $TESTRARGS" | subunit-trace -f
fi

tox.ini
View File

@ -23,11 +23,21 @@ commands =
{[testenv]commands}
ostestr '{posargs}'
[testenv:py27-postgresql]
commands =
{[testenv]commands}
pifpaf run postgresql -- '{toxinidir}'/tools/pretty_tox.sh '--concurrency=1 {posargs}'
[testenv:py35]
commands =
{[testenv]commands}
ostestr '{posargs}'
[testenv:py35-postgresql]
commands =
{[testenv]commands}
pifpaf run postgresql -- '{toxinidir}'/tools/pretty_tox.sh '--concurrency=1 {posargs}'
[testenv:functional]
usedevelop = True
setenv = VIRTUAL_ENV={envdir}