deckhand/deckhand/engine/revision_diff.py

# Copyright 2018 AT&T Intellectual Property.  All other rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from deepdiff import DeepDiff
from deepdiff.helper import RemapDict
import jsonpickle

from deckhand.control import common
from deckhand.db.sqlalchemy import api as db_api
from deckhand.engine import utils
from deckhand import errors


def revision_diff(revision_id, comparison_revision_id, deepdiff=False):
    """Report how buckets differ between two revisions.

    The documents of `revision_id` are compared, bucket by bucket, against
    the documents of `comparison_revision_id`. A revision ID of 0 does not
    exist and is treated as a revision without any documents.

    Two modes are available:

    a. deepdiff=False: a basic comparison based on the document history of
       each revision. Only buckets that hold documents in at least one of
       the two revisions are reported.
    b. deepdiff=True: the rendered documents of each revision are compared
       and, in addition to the basic comparison, every modified bucket gets
       an extra "<bucket name> diff" entry holding the deep diff of that
       bucket.

    For the basic diff the order of the two revision IDs is interchangeable:
    the same result is produced either way.

    Each reported bucket carries one of these statuses:

        - "created": The bucket has been created between the revisions.
        - "deleted": The bucket has been deleted between the revisions.
        - "modified": The bucket has been modified between the revisions.
                      With deepdiff enabled the deep difference between
                      the revisions is included as well.
        - "unmodified": The bucket is identical in both revisions.

    :param revision_id: ID of the first revision.
    :param comparison_revision_id: ID of the second revision.
    :param deepdiff: Whether the deep diff of modified buckets is wanted.
    :returns: A dictionary, keyed with the bucket names, containing any of
        the statuses enumerated above (plus the "<bucket name> diff"
        entries when deepdiff is enabled).

    Examples Diff::

        # GET /api/v1.0/revisions/6/diff/3
        bucket_a: created
        bucket_b: deleted
        bucket_c: modified
        bucket_d: unmodified

        # GET /api/v1.0/revisions/0/diff/6
        bucket_a: created
        bucket_c: created
        bucket_d: created

        # GET /api/v1.0/revisions/6/diff/6
        bucket_a: unmodified
        bucket_c: unmodified
        bucket_d: unmodified

        # GET /api/v1.0/revisions/0/diff/0
        {}

    Examples DeepDiff::

        # GET /api/v1.0/revisions/3/deepdiff/4
        bucket_a: modified
        bucket_a diff:
          document_changed:
            count: 1
            details:
              ('example/Kind/v1', 'doc-b'):
                data_changed:
                  values_changed:
                    root['foo']: {new_value: 3, old_value: 2}
                metadata_changed: {}

        # GET /api/v1.0/revisions/2/deepdiff/3
        bucket_a: modified
        bucket_a diff:
          document_added:
            count: 1
            details:
            - [example/Kind/v1, doc-c]

        # GET /api/v1.0/revisions/0/deepdiff/0
        {}

        # GET /api/v1.0/revisions/0/deepdiff/3
        bucket_a: created
    """
    def _load_documents(rev_id):
        # Revision 0 doesn't exist, so it contributes no documents.
        if rev_id != 0:
            if deepdiff:
                return _rendered_doc(rev_id)
            return db_api.revision_documents_get(
                rev_id, include_history=True, unique_only=False)
        return []

    def _load_revision(rev_id):
        return db_api.revision_get(rev_id) if rev_id != 0 else None

    def _group_by_bucket(documents):
        # Maps each bucket name to the list of documents it contains.
        grouped = {}
        for document in documents:
            grouped.setdefault(document['bucket_name'], []).append(document)
        return grouped

    def _hash_pairs(documents):
        # Order-independent fingerprint of a bucket's documents.
        return sorted(
            (d['data_hash'], d['metadata_hash']) for d in documents)

    docs = _load_documents(revision_id)
    comparison_docs = _load_documents(comparison_revision_id)

    # Deleted documents (and their older counterparts) technically don't
    # exist, so they must not take part in the comparison.
    docs = utils.exclude_deleted_documents(docs)
    comparison_docs = utils.exclude_deleted_documents(comparison_docs)

    revision = _load_revision(revision_id)
    comparison_revision = _load_revision(comparison_revision_id)

    buckets = _group_by_bucket(docs)
    comparison_buckets = _group_by_bucket(comparison_docs)

    # Buckets present in both revisions, and buckets present in only one.
    shared_buckets = set(buckets).intersection(comparison_buckets)
    unshared_buckets = set(buckets).union(comparison_buckets) - shared_buckets

    result = {}

    # A shared bucket is "unmodified" when both revisions hold the same
    # collection of (data_hash, metadata_hash) pairs, otherwise "modified".
    for name in shared_buckets:
        own_docs = buckets[name]
        other_docs = comparison_buckets[name]
        if _hash_pairs(own_docs) == _hash_pairs(other_docs):
            result[name] = 'unmodified'
            continue
        result[name] = 'modified'
        if deepdiff:
            result[name + ' diff'] = _diff_buckets(own_docs, other_docs)

    # Nothing to report for unshared buckets when neither revision exists
    # (always the case for revision_id == comparison_revision_id == 0).
    if unshared_buckets and (revision or comparison_revision):
        if not (revision and comparison_revision):
            # Exactly one side is revision 0: whatever the other side holds
            # has been created, regardless of which side is zero.
            for name in unshared_buckets:
                result[name] = 'created'
        else:
            # An unshared bucket lives in exactly one revision. If that is
            # the newer revision the bucket was created, else it was deleted.
            if revision['created_at'] > comparison_revision['created_at']:
                newer_buckets = buckets
            else:
                newer_buckets = comparison_buckets
            for name in unshared_buckets:
                if name in newer_buckets:
                    result[name] = 'created'
                else:
                    result[name] = 'deleted'

    return result


def _diff_buckets(b1, b2):
    """Provide the deep diff between two versions of the same bucket.

    Documents are matched between ``b1`` and ``b2`` by their
    ``(schema, name)`` pair. The result may contain:

        - "document_added": keys found in ``b2`` but not in ``b1``.
        - "document_deleted": keys found in ``b1`` but not in ``b2``.
        - "document_changed": documents found in both whose ``data_hash`` or
          ``metadata_hash`` differ, with the DeepDiff of their ``metadata``
          and, unless hidden, of their ``data``.

    Each entry holds a ``count`` and the ``details``. The data diff is
    replaced by ``{'encrypted': True}`` when either version of the document
    reports ``is_encrypted``, so that the outcome does not depend on the
    order in which the two revisions are passed in.

    :param b1: List of documents of the bucket in the first revision.
    :param b2: List of documents of the bucket in the second revision.
        Documents are expected to expose ``is_encrypted`` as an attribute.
    :returns: Dictionary describing the differences, formatted by
        ``_format_diff_result``.
    :raises errors.DeepDiffException: If computing or decoding a DeepDiff
        fails for any reason.
    """
    def _doc_key(doc):
        return (doc['schema'], doc['name'])

    def _deep_diff_as_dict(old, new):
        # DeepDiff returns a DeepDiff object, so it is serialized to JSON
        # first and then deserialized into a dictionary.
        try:
            return jsonpickle.decode(DeepDiff(old, new).json)
        # deepdiff doesn't provide custom exceptions; have to use Exception.
        except Exception as ex:
            raise errors.DeepDiffException(details=str(ex))

    b1_docs = {_doc_key(d): d for d in b1}
    b2_docs = {_doc_key(d): d for d in b2}

    # Built in bucket order rather than from a set difference, so that the
    # reported details are not shuffled by set iteration order.
    doc_deleted = [k for k in b1_docs if k not in b2_docs]
    doc_added = [k for k in b2_docs if k not in b1_docs]

    diff_result = {}
    if doc_added:
        diff_result['document_added'] = {
            'count': len(doc_added),
            'details': doc_added}
    if doc_deleted:
        diff_result['document_deleted'] = {
            'count': len(doc_deleted),
            'details': doc_deleted}

    # Find the documents present on both sides whose content changed.
    change_count = 0
    change_details = {}
    for key, old_doc in b1_docs.items():
        if key not in b2_docs:
            continue
        new_doc = b2_docs[key]
        if ((old_doc['data_hash'], old_doc['metadata_hash']) ==
                (new_doc['data_hash'], new_doc['metadata_hash'])):
            continue

        change_count += 1

        # Hide the data diff if either version is encrypted: a DeepDiff of
        # the data carries values of both versions, so checking only one
        # side could expose the data of the encrypted one.
        if old_doc.is_encrypted or new_doc.is_encrypted:
            data_changed = {'encrypted': True}
        else:
            data_changed = _deep_diff_as_dict(old_doc['data'],
                                              new_doc['data'])
        metadata_changed = _deep_diff_as_dict(old_doc['metadata'],
                                              new_doc['metadata'])

        change_details[str(key)] = {'data_changed': data_changed,
                                    'metadata_changed': metadata_changed}

    if change_count > 0:
        diff_result['document_changed'] = {
            'count': change_count,
            'details': change_details}

    # yaml_safedump fails to parse arbitrary python objects; the diff result
    # needs formatting before it is passed to yaml_safedump.
    return _format_diff_result(diff_result)


def _format_diff_result(dr):
    """Formats diff result per yaml_safedump parsing."""
    if isinstance(dr, dict):
        for k, v in dr.items():
            if isinstance(v, RemapDict):
                v = dict(v)
                dr.update({k: v})
            if isinstance(v, type):
                dr.update({k: str(v)})
            if isinstance(v, dict):
                _format_diff_result(v)
    return dr


def _rendered_doc(revision_id):
    """Return the rendered documents for the given revision id.

    The documents are requested with the ``deleted=False`` filter.

    :param revision_id: ID of the revision whose documents are rendered.
    :returns: The first element of the pair returned by
        ``common.get_rendered_docs``; the second element is discarded.
    """
    documents, _ignored = common.get_rendered_docs(revision_id, deleted=False)
    return documents