#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2020-02-20 14:24:18 +0200 (Thu, 20 Feb 2020) $
#$Revision: 7746 $
#$URL: svn://www.crystallography.net/cod-tools/tags/v2.10/scripts/cif_validate $
#------------------------------------------------------------------------------
#*
#* Validate CIF files against DDL1-compliant CIF dictionaries.
#*
#* USAGE:
#*    $0 --dictionaries 'cif_core.dic,cif_cod.dic' --options input1.cif input*.cif
#**

use strict;
use warnings;
use Clone qw( clone );
use File::Basename qw( basename );
use List::MoreUtils qw( any none uniq );
use COD::CIF::ChangeLog qw( summarise_messages );
use COD::CIF::Parser qw( parse_cif ) ;
use COD::CIF::DDL qw( is_general_local_data_name );
use COD::CIF::DDL::DDL1 qw( canonicalise_value
                            get_category_name
                            get_data_type
                            get_enumeration_defaults
                            get_list_constraint_type
                            get_list_mandatory_flag );
use COD::CIF::DDL::Ranges qw( parse_range
                              range_to_string
                              is_in_range );
use COD::CIF::Tags::Manage qw( exclude_tag
                               has_special_value
                               has_numeric_value
                               get_item_loop_index
                               set_loop_tag
                               set_tag );
use COD::CIF::Tags::CanonicalNames qw( canonical_tag_name
                                       canonicalize_all_names );
use COD::CIF::DDL::Validate qw( check_enumeration_set );
use COD::SOptions qw( getOptions get_value get_int );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_warnings
                          process_parser_messages
                          report_message );
use COD::ToolsVersion;

my @dic_files;
my $use_parser = 'c';
my $enum_as_set_tags = ['_atom_site_refinement_flags'];
my $ignore_case = 0;
my $report_local_tags = 0;
my $report_deprecated = 0;
my $allow_double_precision_notation = 0;
my $max_message_count = -1;
my $debug = 0;

my $die_on_errors   = 1;
my $die_on_warnings = 0;
my $die_on_notes    = 0;

sub check_list_link_parent($$$);

#* OPTIONS:
#*   -d, --dictionaries 'cif_core.dic,cif_cod.dic'
#*                     A list of CIF dictionary files (according to DDL2)
#*                     to be used in CIF file validation. List elements
#*                     are separated either by ',' or by ' '. To include
#*                     dictionaries with filenames containing these symbols,
#*                     the --add-dictionary option is used.
#*   -D, --add-dictionary 'cif new dictionary.dic'
#*                     Add additional CIF dictionary to the list.
#*   --clear-dictionaries
#*                     Remove all CIF dictionaries from the list.
#*
#*   --max-message-count 5
#*                     Maximum number of validation messages that are reported
#*                     for each unique combination of validation criteria and
#*                     validated data items. Provide a negative value (i.e. -1)
#*                     to output all of the generated validation messages
#*                     (default: -1).
#*
#*   --treat-as-set _atom_site_refinement_flags
#*                     Treat values of given data items as a set. For example,
#*                     more than one enumeration value could be defined
#*                     for a single element. Any number of data item tags can
#*                     be provided in the following way:
#*                     $0 --treat-as-set _tag_1 --treat-as-set _tag_2
#*                     Default is the '_atom_site_refinement_flags' data item.
#*   --no-treat-as-set
#*                     Do not treat values of any data items as sets.
#*                     (see --treat-as-set).
#*
#*   --ignore-case
#*                     Ignore letter case while validating enumeration values.
#*                     For example, even though '_atom_site_adp_type' is
#*                     restricted to values ('Uani', 'Uiso', 'Uovl', ...),
#*                     value 'UANI' would still be treated as valid.
#*   --respect-case, --case-sensitive, --dont-ignore-case
#*                     Respect letter case while validating enumeration
#*                     values (default).
#*
#*   --report-deprecated
#*                     Report the presence of data items that are marked as
#*                     deprecated in the dictionaries. Data item deprecation
#*                     usually means that it has been replaced with other
#*                     data items.
#*   --no-report-deprecated, --ignore-deprecated
#*                     Do not report presence of data items that are marked
#*                     as deprecated in the dictionaries (default).
#*
#*   --report-local-tags
#*                     Report the presence of local data items.
#*   --no-report-local-tags, --ignore-local-tags
#*                     Do not report the presence of local data items (default).
#*
#*   --allow-double-precision-notation
#*                     Treat numbers expressed using the double precision
#*                     notation (i.e. 0.42D+7) as proper numbers in a way
#*                     that is compatible with DDL1, but not the CIF_1.1
#*                     syntax.
#*   --no-allow-double-precision-notation
#*                     Treat numbers expressed using the double precision
#*                     notation (i.e. 0.42D+7) as character strings in a
#*                     way compatible with the CIF_1.1 syntax, but does not
#*                     cover the full extent of the DDL1 numbers variations
#*                     (default).
#*
#*   --use-perl-parser
#*                     Use Perl parser to parse CIF files.
#*   --use-c-parser
#*                     Use C parser to parse CIF files (default)
#*
#*   -c, --always-continue
#*                     Continue processing and return successful return status
#*                     even if errors are diagnosed.
#*   -c-, --always-die
#*                     Stop and return error status if errors are diagnosed.
#*   --continue-on-errors
#*                     Do not terminate script if errors are raised (default).
#*   --die-on-errors
#*                     Terminate script immediately if errors are raised.
#*   --continue-on-warnings
#*                     Do not terminate script if warnings are raised (default).
#*   --die-on-warnings
#*                     Terminate script immediately if warnings are raised.
#*   --continue-on-notes
#*                     Do not terminate script if notes are raised (default).
#*   --die-on-notes
#*                     Terminate script immediately if notes are raised.
#*   --debug
#*                     Output extra information for debugging purposes.
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   -v, --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    '-d,--dictionaries'    => sub{ @dic_files = split m/,|\s+/, get_value() },
    '-D,--add-dictionary'  => sub{ push @dic_files, get_value() },
    '--clear-dictionaries' => sub{ @dic_files = () },

    '--max-message-count' => sub { $max_message_count = get_int() },

    '--treat-as-set'                    => $enum_as_set_tags,
    '--no-treat-as-set'                 => sub{ $enum_as_set_tags = [] },

    '--ignore-case'                     => sub{ $ignore_case = 1 },
    '--dont-ignore-case,--respect-case' => sub{ $ignore_case = 0 },
    '--case-sensitive'                  => sub{ $ignore_case = 0 },

    '--allow-double-precision-notation'
                           => sub { $allow_double_precision_notation = 1 },
    '--no-allow-double-precision-notation'
                           => sub { $allow_double_precision_notation = 0 },

    '--report-local-tags'               => sub{ $report_local_tags = 1 },
    '--no-report-local-tags'            => sub{ $report_local_tags = 0 },
    '--ignore-local-tags'               => sub{ $report_local_tags = 0 },

    '--report-deprecated'               => sub{ $report_deprecated = 1 },
    '--no-report-deprecated'            => sub{ $report_deprecated = 0 },
    '--ignore-deprecated'               => sub{ $report_deprecated = 0 },

    '--use-perl-parser'                 => sub{ $use_parser = 'perl' },
    '--use-c-parser'                    => sub{ $use_parser = 'c' },

    '-c,--always-continue'              => sub { $die_on_errors   = 0;
                                                 $die_on_warnings = 0;
                                                 $die_on_notes    = 0 },
    '-c-,--always-die'                  => sub { $die_on_errors   = 1;
                                                 $die_on_warnings = 1;
                                                 $die_on_notes    = 1 },

    '--continue-on-errors'          => sub { $die_on_errors = 0 },
    '--die-on-errors'               => sub { $die_on_errors = 1 },

    '--continue-on-warnings' => sub { $die_on_warnings = 0 },
    '--die-on-warnings'      => sub { $die_on_warnings = 1 },

    '--continue-on-notes'    => sub { $die_on_notes = 0 },
    '--die-on-notes'         => sub { $die_on_notes = 1 },

    '--options'         => sub{ options; exit },
    '--help,--usage'    => sub{ usage; exit; },
    '--debug'           => sub{ $debug = 1 },
    '-v,--version'      => sub { print 'cod-tools version ',
                                 $COD::ToolsVersion::Version, "\n";
                                 exit }
);

my $die_on_error_level = {
    'ERROR'   => $die_on_errors,
    'WARNING' => $die_on_warnings,
    'NOTE'    => $die_on_notes
};

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

# Reading dictionary files

my $ddl1_enum_defaults = get_enumeration_defaults();
my %validation_dics;
if( @dic_files ) {
    my $tag_count = 0;
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    for my $dic ( @dic_files ) {
        my ( $data, $err_count, $messages ) = parse_cif( $dic, $options );
        process_parser_messages( $messages, $die_on_error_level );

        local $SIG{__WARN__} = sub { process_warnings( {
                                       'message'       => @_,
                                       'program'       => $0,
                                       'filename'      => $dic,
                                     }, $die_on_error_level ) };

        my $ddl_generation = determine_ddl_generation( $data );
        if ( !defined $ddl_generation ) {
            warn 'file was not recognised as a proper DDL dictionary -- ' .
                 'file will be skipped' . "\n";
            next;
        }

        if ( $ddl_generation eq 'm' ) {
            warn 'file was recognised as a DDLm-conformant dictionary, ' .
                 'however, only DDL1 and DDL2 compliant dictionaries are ' .
                 'supported in the current version of the program -- file ' .
                 'will be skipped' . "\n";
            next;
        }

        if ( $ddl_generation eq '1' ) {
            $validation_dics{$ddl_generation}{'dictionaries'}{$dic} =
                            get_ddl1_dic( $data, $ddl1_enum_defaults );
        } elsif ( $ddl_generation eq '2' ) {
            $validation_dics{$ddl_generation}{'dictionaries'}{$dic} =
                            get_ddl2_dict( $data->[0] );
        }

        if ( scalar( keys %{$validation_dics{$ddl_generation}{'dictionaries'}{$dic}} ) == 0 ) {
            warn "no data item definitions found\n";
        }
        $tag_count += scalar( keys %{$validation_dics{$ddl_generation}{'dictionaries'}{$dic}} );
    }

    if( $tag_count == 0 ) {
        report_message( {
            'program'   => $0,
            'err_level' => 'ERROR',
            'message'   => 'no data item definitions were found in the '
                         . 'provided dictionary files '
                         . '(\'' . join( '\', \'', @dic_files ) . '\')'
        }, $die_on_errors );
    }

    $validation_dics{'1'}{'merged_properties'} =
                                    get_merged_dictionary_properties(
                                        $validation_dics{'1'}{'dictionaries'}
                                    );
} else {
    report_message( {
        'program'   => $0,
        'err_level' => 'ERROR',
        'message'   => 'at least one dictionary file should be provided by '
                     . 'using the \'--dictionaries\' option. Automatic '
                     . 'dictionary download is not implemented yet'
    }, $die_on_errors );
    my $dic_iucr_uri = 'ftp://ftp.iucr.org/pub/cif_core.dic';
}

# Iterating through the CIF files

@ARGV = ('-') unless @ARGV;

my $validation_options = {
    'report_deprecated' => $report_deprecated,
    'ignore_case'       => $ignore_case,
    'enum_as_set_tags'  => $enum_as_set_tags,
    'allow_double_precision_notation' => $allow_double_precision_notation,
    'max_issue_count'   => $max_message_count,
};

for my $filename ( @ARGV ) {
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    next if !defined $data;

    # convert all tags to a 'canonical' form
    canonicalize_all_names( $data );

    for my $block ( @{$data} ) {
        my $dataname = 'data_' . $block->{'name'};

        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => @_,
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => $dataname
            }, $die_on_error_level )
        };

        my @tag_value_notes;

        push @tag_value_notes,
             @{ ddl1_validate_data_block( $block,
                                          $validation_dics{'1'},
                                          $validation_options ) };

        my $ddl2_dics = $validation_dics{'2'}{'dictionaries'};
        my @ddl2_messages;
        for my $dic_f ( sort keys %{$ddl2_dics} ) {
            push @ddl2_messages,
                 @{ ddl2_validate_data_block( $block, $ddl2_dics->{$dic_f},
                                              $validation_options ) };
        };
        push @tag_value_notes, @{ summarise_messages(\@ddl2_messages) };

        my $ddl1_dics = $validation_dics{'1'}{'dictionaries'};
        push @tag_value_notes,
             @{ report_unrecognised_data_names(
                    $block,
                    {
                      defined $ddl1_dics ? %{$ddl1_dics} : (),
                      defined $ddl2_dics ? %{$ddl2_dics} : (),
                    },
                    $report_local_tags
                )
             };

        for my $note (sort @tag_value_notes) {
            warn "NOTE, $note" . "\n"
        }
    }
}

##
# Canonicalises a CIF data name to a standard form used in validation messages.
#
# @param $tag
#       Data name that should be canonicalised.
# @return
#       Canonicalised data name. 
##
sub canonicalise_tag
{
    my ($tag) = @_;

    return canonical_tag_name($tag);
}

##
# Builds a dictionary structure from a parsed DDL1 dictionary.
#
# @param $dic_data_blocks
#       Reference to a DDL1 dictionary structure as returned by the
#       COD::CIF::Parser. Normally, a DDL1 dictionary consists of
#       multiple data blocks each defining a data category or a
#       data item.
# @param $ddl1_defaults
#       Reference to a hash containing default values of data items
#       that appear in a DDL1 data item definitions.
# @return
#       Reference to a hash containing data item definitions.
##
sub get_ddl1_dic
{
    my ($dic_data_blocks, $ddl1_defaults) = @_;

    my %definitions;
    for my $data_block (@{$dic_data_blocks}) {
        # category definitions usually do no contain the '_type'
        # data item or have it set to 'null'
        next if !exists $data_block->{'values'}{'_type'};
        next if $data_block->{'values'}{'_type'}[0] eq 'null';
        $data_block = add_default_data_items( $data_block, $ddl1_defaults );
        for ( map { lc } @{$data_block->{'values'}{'_name'}} ) {
            $definitions{$_} = $data_block;
            $definitions{$_}{values}{'_dataname'} = $data_block->{'name'};
        }
    }

    return \%definitions;
}

##
# Builds a dictionary structure from a parsed DDL2 dictionary.
#
# @param $dic_data_blocks
#       Reference to a DDL2 dictionary structure as returned by the
#       COD::CIF::Parser. Normally, a DDL2 dictionary consists of
#       a single data block with multiple save frames each defining
#       a data category or a data item.
# @return
#       Reference to a hash containing data item definitions.
##
sub get_ddl2_dict
{
    my ( $dic_block ) = @_;

    my %definitions;
    for my $save_frame ( @{$dic_block->{'save_blocks'}} ) {
        next if !exists $save_frame->{'values'}{'_item.name'};
        for ( @{$save_frame->{'values'}{'_item.name'}} ) {
            $definitions{lc $_} = $save_frame;
            $definitions{lc $_}{'values'}{'_dataname'} = $_;
        }
    }

    return \%definitions;
}

##
# Adds data items with the default values to the given data frame
# if they are not already present in the data frame.
#
# @param $data_block
#       Reference to data block or a save frame as returned by the
#       COD::CIF::Parser that should be modified.
# @param $ddl1_defaults
#       Reference to a hash containing default values of data items
#       that appears in DDL1 data item definitions.
# @return
#       Reference to the data frame with the default data items added.
##
sub add_default_data_items
{
    my ($data_block, $default_values) = @_;

    for my $tag ( keys %{$default_values} ) {
        if ( !exists $data_block->{'values'}{$tag} ) {
            $data_block->{'values'}{$tag} = [ $default_values->{$tag} ];
        }
    }

    return $data_block;
}

##
# Validates a CIF data block against a set of DDL1 dictionaries.
#
# @param $data_block
#       Reference to data block or a save frame as returned by the
#       COD::CIF::Parser.
# @param $validation_resources
#       Reference to a validation resource data structure of
#       the following form:
#       {
#       # Reference to a hash of dictionary data structures as
#       # returned by the get_ddl1_dic() subroutine:
#           'dictionaries' => {
#               'path_to_dictionary_file_A' => {
#                   ...
#                },
#               'path_to_dictionary_file_B' => {
#                   ...
#                },
#                ...,
#               'path_to_dictionary_file_Z' => {
#                   ...
#                },
#            },
#       # Reference to a data structured of merged validation dictionary
#       # properties as returned by the get_merged_dictionary_properties()
#       # subroutine.
#           'merged_properties' => {
#               'item_to_category' => { ... },
#               'category_to_mandatory_items' => { ... },
#               'subcategory_key_to_category_key' => { ... },
#           }
#       }
# @param $options
#       Reference to a hash of options. The following options are recognised:
#       {
#       # Report data items that have been replaced by other data items
#           'report_deprecated' => 0,
#       # Ignore the case while matching enumerators
#           'ignore_case' => 0,
#       # Array reference to a list of data items that should be
#       # treated as potentially having values consisting of a
#       # combination of several enumeration values. Data items
#       # are identified by data names.
#           'enum_as_set_tags' => ['_atom_site_refinement_flags'],
#       # Treat numbers expressed using the double precision notation
#       # (i.e. 0.42D+7) as proper numbers
#           'allow_double_precision_notation'  => 0
#       # Maximum number of validation issues that are reported for
#       # each unique combination of validation criteria and validated
#       # data items. Negative values remove the limit altogether.
#           'max_issue_count' => 5
#       }
# @return
#       Array reference to a list of validation messages.
##
sub ddl1_validate_data_block
{
    my ( $data_block, $validation_resources, $options ) = @_;

    $options = {} if !defined $options;
    my $report_deprecated = exists $options->{'report_deprecated'} ?
                                   $options->{'report_deprecated'} : 0;
    my $ignore_case       = exists $options->{'ignore_case'} ?
                                   $options->{'ignore_case'} : 0;
    my $enum_as_set_tags  = exists $options->{'enum_as_set_tags'} ?
                                   $options->{'enum_as_set_tags'} : [];
    my $allow_d_notation  = exists $options->{'allow_double_precision_notation'} ?
                                   $options->{'allow_double_precision_notation'} : 0;
    my $max_issue_count   = exists $options->{'max_issue_count'} ?
                                   $options->{'max_issue_count'} : -1;

    my @issues;
    for my $loop_tags ( @{$data_block->{'loops'}} ) {
        push @issues,
             @{check_loop_category_homogeneity(
                $loop_tags,
                $validation_resources->{'merged_properties'}{'item_to_category'}
             )};
        push @issues,
             @{check_loop_mandatory_item_presence(
                 $loop_tags,
                 $validation_resources->{'merged_properties'}
             )};
    }

    my $dics = $validation_resources->{'dictionaries'};
    for my $dic_f ( sort keys %{$dics} ) {
        push @issues,
             @{ ddl1_validate_data_block_against_single_dic(
                    $data_block, $dics->{$dic_f}, $validation_options
                )
             };
    };

    my @validation_messages;
    if ( $max_issue_count < 0 ) {
        push @validation_messages, map { $_->{'message'} } @issues;
    } else {
        my %grouped_issues;
        for my $issue ( @issues ) {
            my $constraint = $issue->{'test_type'};
            my $data_name_key = join "\x{001E}", @{$issue->{'data_items'}};
            push @{$grouped_issues{$constraint}{$data_name_key}}, $issue;
        }

        # TODO: move hash out of the subroutine
        my %test_types = (
            'SIMPLE_KEY_UNIQUENESS'    =>
                'simple loop key uniqueness',
            'COMPOSITE_KEY_UNIQUENESS' =>
                'composite loop key uniqueness',
            'LOOP.INDIVIDUAL_UNIQUE_VALUES' =>
                'value uniqueness',
            'LOOP.COLLECTIVELY_UNIQUE_VALUES' =>
                'collective value uniqueness',
            'LOOP.INTEGRITY'      =>
                'loop integrity',
            'LOOP.CATEGORY_HOMOGENEITY' =>
                'items in a looped list all belonging to the same category',
            'LOOP.MANDATORY_ITEM_PRESENCE' =>
                'mandatory item presence in a category loop',
            'KEY_ITEM_PRESENCE'       =>
                'mandatory key item presence',
            'ITEM_REPLACEMENT.PRESENCE_OF_REPLACED' =>
                'replaced data item presence',
            'ITEM_REPLACEMENT.SIMULTANEOUS_PRESENCE' =>
                'simultaneous presence of replaced and replacing items',
            'LOOP_CONTEXT.MUST_APPEAR_IN_LOOP' =>
                'data items that incorrectly appear outside of a looped list',
            'LOOP_CONTEXT.MUST_NOT_APPEAR_IN_LOOP' =>
                'data items that incorrectly appear inside of a looped list',
            'PRESENCE_OF_PARENT_DATA_ITEM' =>
                'parent data item presence',
            'PRESENCE_OF_PARENT_DATA_ITEM_VALUE' =>
                'parent data item value presence',
            'ENUMERATION_SET' =>
                'data value belonging to the specified enumeration set',
            'SU_ELIGIBILITY' =>
                'data value standard uncertainty eligibility',
            'ENUM_RANGE.CHAR_STRING_LENGTH' =>
                'data value belonging to a character range and ' .
                'consisting of more than one symbol',
            'ENUM_RANGE.IN_RANGE' =>
                'data value belonging to the specified value range',
            'TYPE_CONSTRAINT.QUOTED_NUMERIC_VALUES' =>
                'proper quote usage with numeric values',
            'TYPE_CONSTRAINT.PROPER_NUMERIC_VALUES' =>
                'data value conformance to the numeric data type'
        );

        for my $constraint (sort keys %grouped_issues) {
            for my $data_name_key (sort keys %{$grouped_issues{$constraint}}) {
                my @group_issues = @{$grouped_issues{$constraint}{$data_name_key}};
                my $group_size = scalar(@group_issues);

                my $description;
                if ( defined $test_types{$constraint} ) {
                    $description = $test_types{$constraint};
                }

                if ( $group_size > $max_issue_count ) {
                    push @validation_messages,
                         'a test ' .
                         (defined $description ? "of $description " : '') .
                         'involving the [' .
                         ( join ', ',
                            map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                                @{$group_issues[0]->{'data_items'}} ) .
                        "] data items resulted in $group_size validation messages " .
                        '-- the number of reported messages is limited to ' .
                        "$max_issue_count";
                    $group_size = $max_issue_count;
                }

                push @validation_messages,
                        map { $_->{'message'} } @group_issues[0..($group_size - 1)];
            }
        }
    }

    return \@validation_messages;
}

##
# Validates a CIF data block against a single DDL1 dictionary.
#
# @param $data_block
#       Reference to data block or a save frame as returned by the
#       COD::CIF::Parser.
# @param $dic
#       Reference to a dictionary data structures as returned by
#       the get_ddl1_dic() subroutine.
# @param $options
#       Reference to a hash of options. The following options are recognised:
#       {
#       # Report data items that have been replaced by other data items
#           'report_deprecated' => 0,
#       # Ignore the case while matching enumerators
#           'ignore_case' => 0,
#       # Array reference to a list of data items that should be
#       # treated as potentially having values consisting of a
#       # combination of several enumeration values. Data items
#       # are identified by data names.
#           'enum_as_set_tags' => ['_atom_site_refinement_flags'],
#       # Treat numbers expressed using the double precision notation
#       # (i.e. 0.42D+7) as proper numbers
#           'allow_double_precision_notation'  => 0
#       }
# @return
#       Array reference to a list of validation message data structures
#       of the following form:
#       {
#       # Code of the data block that contains the offending entry
#           'data_block_code' => 'offending_block_code',
#       # Code of the save frame that contains the offending entry.
#       # Might be undefined
#           'save_frame_code' => 'offending_frame_code',
#       # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#       # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#       # Human-readable description of the issue
#           'message' => 'issue description'
#       }
##
sub ddl1_validate_data_block_against_single_dic
{
    my ( $data_block, $dic, $options ) = @_;

    $options = {} if !defined $options;
    my $report_deprecated = exists $options->{'report_deprecated'} ?
                                   $options->{'report_deprecated'} : 0;
    my $ignore_case       = exists $options->{'ignore_case'} ?
                                   $options->{'ignore_case'} : 0;
    my $enum_as_set_tags  = exists $options->{'enum_as_set_tags'} ?
                                   $options->{'enum_as_set_tags'} : [];
    my $allow_d_notation  = exists $options->{'allow_double_precision_notation'} ?
                                   $options->{'allow_double_precision_notation'} : 0;

    my @issues = @{ validate_block_loops($data_block, $dic) };

    for my $tag ( @{$data_block->{'tags'}} ) {
        my $lc_tag = lc $tag;

        next if !exists $dic->{$lc_tag};

        if( $report_deprecated ) {
            push @issues,
                 @{ report_deprecated( $data_block, $tag, $dic ) }
        };

        push @issues,
             @{ validate_list_unique_key( $data_block, $tag, $dic ) };

        my $dic_item = $dic->{$lc_tag};

        push @issues,
             @{ validate_list_mandatory( $data_block, $tag, $dic_item ) };

        push @issues,
             @{ check_list_link_parent( $data_block, $tag, $dic_item ) };

        push @issues,
             @{ validate_enumeration_set(
                    $data_block, $tag, $dic_item,
                    {
                        'ignore_case'  => $ignore_case,
                        'treat_as_set' => any { lc($_) eq $lc_tag }
                                                        @{$enum_as_set_tags}
                    }
             ) };

       push @issues,
            @{ validate_su( $data_block, $tag, $dic_item ) };

       push @issues,
            @{ validate_range( $data_block, $tag, $dic_item ) };

       push @issues,
            @{ validate_data_type(
                    $data_block, $tag, $dic_item,
                    {
                        'allow_double_precision_notation' => $allow_d_notation,
                    }
            ) };
    }

    @issues = @{ summarise_validation_issues( \@issues ) };

    return \@issues;
}

##
# Constructs a data structure of shared dictionary properties extracted
# from all provided DDL1 dictionaries. The data structure allows to properly
# resolve entities that are defined in one dictionary, but referenced
# in a different dictionary (i.e. categories).
#
# NOTE: a proper dictionary merging algorithm could be used instead.
#
# @param $dics
#       Reference to a hash of dictionary data structures as returned by
#       the get_ddl1_dic() subroutine.
# @param \%merged_properties
#       Reference to a data structure of the following form:
#       {
#       # Reference to a hash that maps each data item to the category
#       # that the it belongs to:
#           'item_to_category' => {
#               'data_name_a' => 'category_name_1',
#               'data_name_b' => 'category_name_1',
#               'data_name_c' => 'category_name_3',
#               ...
#               'data_name_f' => 'category_name_n',
#           },
#       # Reference to a hash that maps categories to data items that must
#       # appear in looped list of that category:
#           'category_to_mandatory_items' => {
#               'category_name_1' => [
#                   'data_name_a',
#                   'data_name_b',
#                   ...
#                   'data_name_c',
#               ],
#
#               ...,
#
#               'category_name_n' => [
#                   ...
#               ]
#           },
#       # Reference to a hash that maps subcategory reference data items to
#       # the reference items of the parent category:
#           'subcategory_key_to_category_key' => {
#               'child_reference_item_name_1' => 'parent_reference_item_name_1',
#               'child_reference_item_name_2' => 'parent_reference_item_name_1',
#               ...,
#               'child_reference_item_name_n' => 'parent_reference_item_name_m',
#           },
#       # Reference to a hash that maps data item names to the names of their
#       # alternate data items:
#           'item_to_alternate_items' => {
#               'item_1' => [ 'alternate_item_1', ],
#               'item_2' => [ 'alternate_item_2_a', ..., 'alternate_item_2_z', ],
#               ...,
#               'item_n' => [ 'alternate_item_n', ],
#           },
#       }
##
sub get_merged_dictionary_properties
{
    my ($dics) = @_;

    my %merged_properties;
    $merged_properties{'item_to_category'} =
                            get_item_to_category_mapping($dics);
    $merged_properties{'category_to_mandatory_items'} =
                            get_category_to_mandatory_items_mapping($dics);
    $merged_properties{'subcategory_key_to_category_key'} =
                            get_subcategory_key_to_category_key_mapping($dics);
    $merged_properties{'item_to_alternate_items'} =
                            get_item_to_alternate_items_mapping($dics);

    return \%merged_properties;
}

##
# Constructs a hash that maps data item names to the names of the categories
# that they belong to as defined in the defining DDL1 dictionaries.
#
# @param $dics
#       Reference to a hash of dictionary data structures as returned by
#       the get_ddl1_dic() subroutine.
# @return
#       Reference to a hash of the following form:
#       {
#           'data_name_a' => 'category_name_1',
#           'data_name_b' => 'category_name_1',
#           'data_name_c' => 'category_name_3',
#           ...
#           'data_name_f' => 'category_name_n'
#       }
##
sub get_item_to_category_mapping
{
    my ($dics) = @_;

    my %item_to_category;
    for my $dic_file_name (sort keys %{$dics}) {
        my $dic = $dics->{$dic_file_name};
        for my $tag (sort keys %{$dic}) {
            my $category = get_category_name( $dic->{$tag} );
            next if !defined $category;
            $item_to_category{$tag} = lc $category;
        }
    }

    return \%item_to_category;
}

##
# Constructs a hash that maps category names to the names of data items
# that must appear in looped list of that category.
#
# @param $dics
#       Reference to a hash of dictionary data structures as returned by
#       the get_ddl1_dic() subroutine.
# @return
#       Reference to a hash of the following form:
#       {
#         'category_name_1' => [
#               'data_name_a',
#               'data_name_b',
#               ...
#               'data_name_c',
#         ],
#
#         ...,
#
#         'category_name_n' => [
#           ...
#         ]
#       }
##
sub get_category_to_mandatory_items_mapping
{
    my ($dics) = @_;

    my %category_to_mandatory_items;
    for my $dic_file_name (sort keys %{$dics}) {
        my $dic = $dics->{$dic_file_name};
        for my $tag (sort keys %{$dic}) {
            my $category = get_category_name( $dic->{$tag} );
            next if !defined $category;
            if ( get_list_mandatory_flag($dic->{$tag}) eq 'yes' ) {
                push @{$category_to_mandatory_items{$category}}, $tag;
            }
        }
    }

    return \%category_to_mandatory_items;
}

##
# Constructs a hash that maps subcategory reference data items to
# the reference items of the parent category.
#
# The relationship of a parent category and child category (subcategory)
# is not formally established in DDL1. However, a posts in the official
# IUCr mailing list
# (https://www.iucr.org/__data/iucr/lists/cif-developers/msg00197.html)
# provides a clarification on implicit semantics of this kind:
#
#   "When the _list_link_parent data name is of the same category as the
#   defined data name, the _list_link_parent data name and the defined name
#   may be considered as a single combined definition which can be referred
#   to by either of the original data names for the purposes of resolving
#   _list_reference and _list_mandatory requirements."
#
# The official IUCr dictionaries currently contain a single instance of
# such relationship between the '_atom_site_label' and '_atom_site_aniso_label'
# data items.
#
# @param $dics
#       Reference to a hash of dictionary data structures as returned by
#       the get_ddl1_dic() subroutine.
# @return
#       Reference to a hash of the following form:
#       {
#         'child_reference_item_name_1' => 'parent_reference_item_name_1',
#         'child_reference_item_name_2' => 'parent_reference_item_name_1',
#         ...,
#         'child_reference_item_name_n' => 'parent_reference_item_name_m',
#       }
##
sub get_subcategory_key_to_category_key_mapping
{
    my ($dics) = @_;

    my %category;
    for my $dic_file_name (sort keys %{$dics}) {
        my $dic = $dics->{$dic_file_name};

        for my $tag (sort keys %{$dic}) {
            my $data_item = $dic->{$tag};
            my $name = get_category_name( $data_item );
            next if !defined $name;

            if (exists $category{$name}{'items'}{$tag}) {
                $data_item = merge_dic_item_definitions(
                    $category{$name}{'items'}{$tag},
                    $data_item
                )
            }
            $category{$name}{'items'}{$tag} = $data_item
        }
    }

    my %subcategory_key_to_category_key;
    for my $name (keys %category) {
        my $items = $category{$name}{'items'};
        my $list_reference_groups = get_all_list_references($items);
        next if scalar (@{$list_reference_groups} < 2);
        my @potential_sub_keys;
        my @potential_main_keys;
        for my $group (@{$list_reference_groups}) {
            next if scalar @{$group->{'key_data_items'}} != 1;
            my $key_item_name = $group->{'key_data_items'}[0];

            my $item = $items->{$key_item_name};

            if (exists $item->{'values'}{'_list_link_child'}) {
                push @potential_main_keys, $key_item_name;
            }

            if (exists $item->{'values'}{'_list_link_parent'}) {
                push @potential_sub_keys, $key_item_name;
            }
        }

        for my $key_name (@potential_sub_keys) {
            my $parent_name = $items->{$key_name}{'values'}{'_list_link_parent'}[0];
            next if none { $_ eq $parent_name } @potential_main_keys;
            $subcategory_key_to_category_key{$key_name} = $parent_name;
        }
    }

    return \%subcategory_key_to_category_key;
}

##
# Merges two DDL1 data item definitions into a single definition.
#
# The merging algorithm is implemented according to
# the dictionary merging protocol described in:
#   "International Tables for Crystallography Volume G:
#    Definition and exchange of crystallographic data",
#   2005, 87-89, doi: 10.1107/97809553602060000107
#
# The protocol describes three merging modes: STRICT, REPLACE and OVERLAY.
# This subroutine implements only the OVERLAY mode. A short description of
# the mode as provided in the protocol:
# "New attributes are added to those already stored for the data name;
#  conflicting attributes replace those already stored."
#
# For a more detailed description of the merging protocol and modes,
# consult the original source.
#
# @param $base_definition
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. Data item values provided in this definition
#       have a lower priority than those provided in the new definition.
# @param $new_definition
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. Data item values provided in this definition
#       have a higher priority than those provided in the base definition.
# @return
#       Reference to a merged DDL1 data item definition block.
##
sub merge_dic_item_definitions
{
    my ($base_definition, $new_definition) = @_;

    my $merged_definition = clone( $base_definition );

    $merged_definition = merge_scalar_items_in_overlay_mode(
                            $merged_definition,
                            $new_definition
                         );

    $merged_definition = merge_looped_items_in_overlay_mode(
                            $merged_definition,
                            $new_definition
                         );

    return $merged_definition;
}

##
# Merges scalar data items from the new definition into the base definition
# according to the rules of the OVERLAY mode.
#
# @param $base_definition
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. Data item values provided in this definition
#       have a lower priority than those provided in the new definition.
# @param $new_definition
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. Data item values provided in this definition
#       have a higher priority than those provided in the base definition.
# @return
#       Reference to the base definition block with the merged scalar items.
##
sub merge_scalar_items_in_overlay_mode
{
    my ($base_definition, $new_definition) = @_;

    ##
    # The list of scalar data item were manually compiled
    # from the 'ddl_core.dic' dictionary. Metadata of the
    # source dictionary: 
    #
    # Dictionary name: ddl_core.dic
    # Dictionary version: 1.4.1
    # Last updated on: 2005-06-29
    # Retrieved on: 2020-02-19
    # Retrieved from: ftp://ftp.iucr.org/pub/ddl_core.dic
    ##
    my @scalar_item_names = qw(
        _category
        _definition
        _enumeration_default
        _enumeration_range
        _list
        _list_level
        _list_mandatory
        _type
        _type_construct
        _units
        _units_detail
    );

    for my $tag ( @scalar_item_names ) {
        next if !exists $new_definition->{'values'}{$tag};
        set_tag($base_definition, $tag, $new_definition->{'values'}{$tag}[0]);
    }

    return $base_definition;
}

##
# Merges potentially looped data items from the new definition into the
# base definition according to the rules of the OVERLAY mode.
#
# @param $base_definition
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. Data item values provided in this definition
#       have a lower priority than those provided in the new definition.
# @param $new_definition
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. Data item values provided in this definition
#       have a higher priority than those provided in the base definition.
# @return
#       Reference to the base definition block with the merged scalar items.
##
sub merge_looped_items_in_overlay_mode
{
    my ($base_definition, $new_definition) = @_;

    ##
    # The list of potentially looped data item were manually compiled
    # from the 'ddl_core.dic' dictionary. Metadata of the source dictionary: 
    #
    # Dictionary name: ddl_core.dic
    # Dictionary version: 1.4.1
    # Last updated on: 2005-06-29
    # Retrieved on: 2020-02-19
    # Retrieved from: ftp://ftp.iucr.org/pub/ddl_core.dic
    ##
    my %category_key_to_items = (
        # NOTE: the '_name' property should have been used to identify
        # the definitions that are being merged so there is no need to
        # process it any further
        # '_name'               => '_name',
        '_enumeration'        => [ '_enumeration_detail' ],
        '_example'            => [ '_example_detail' ],
        '_list_link_child'    => [],
        '_list_link_parent'   => [],
        '_list_reference'     => [],
        '_list_uniqueness'    => [],
        '_related_item'       => [ '_related_function' ],
        '_type_conditions'    => [],
    );

    for my $key_tag (sort keys %category_key_to_items) {
        # TODO: values of the '$loop_tag' data item will not be included
        # into the merged item definition' 
        my $new_looped_items = classify_looped_items_by_mergeability(
                                  $new_definition,
                                  $key_tag,
                                  $category_key_to_items{$key_tag}
                               );
        # TODO: data items that were reported as unmergeable should be
        # properly reported
        next if !%{$new_looped_items->{'mergeable'}};

        # NOTE: potential warning message text: values of the '$loop_tag'
        # data item will not be included into the merged item definition'
        my $base_looped_items = classify_looped_items_by_mergeability(
                                  $base_definition,
                                  $key_tag,
                                  $category_key_to_items{$key_tag}
                                );

        my $merged_loop = merge_looped_values(
                                    $base_looped_items->{'mergeable'},
                                    $new_looped_items->{'mergeable'}
                                );
        # TODO: silently remove duplicates from the merged loop
        # TODO: detect duplicate keys that point to different data values

        exclude_tag($base_definition, $key_tag);
        my $is_looped = (@{$merged_loop->{$key_tag}} > 1);
        if ($is_looped) {
            set_loop_tag(
                $base_definition,
                $key_tag,
                $key_tag,
                $merged_loop->{$key_tag}
            );
        } else {
            set_tag(
                $base_definition,
                $key_tag,
                $merged_loop->{$key_tag}[0]
            );
        }
        delete $merged_loop->{$key_tag};
        for my $tag (sort keys %{$merged_loop}) {
            exclude_tag($base_definition, $tag);
            if ($is_looped) {
                set_loop_tag(
                    $base_definition,
                    $key_tag,
                    $key_tag,
                    $merged_loop->{$tag}
                );
            } else {
                set_tag(
                    $base_definition,
                    $key_tag,
                    $merged_loop->{$tag}[0]
                );
            }
        }
    }

    return $base_definition;
}

##
# Identifies which of the data items from a looped category are mergeable
# and which are not.
#
# @param $definition_block
#       Reference to a DDL1 data item definition block as returned by
#       the COD::CIF::Parser. 
# @param $key_tag
#       Name of the data item that acts as the looped list reference
#       of the checked category.
# @param $category_tags
#       Reference to an array of data items that belong to the
#       same category as the looped list reference item. Data
#       items are identified by their names.
# @return
#       Reference to a data structure of the following form:
#       {
#       # Mergeable data items are identified by their data names
#       # and returned together with the associated data values
#           'mergeable' => {
#               'mergeable_item_a' => {
#                   'values' => [
#                       'value_a_1',
#                       'value_a_2',
#                       ...,
#                       'value_a_n'
#                   ]
#                },
#               ...,
#               'mergeable_item_m' => {
#                   'values' => [
#                       'value_m_1',
#                       'value_m_2',
#                       ...,
#                       'value_m_n'
#                   ]
#                }
#           },
#       # Unmergeable data items are identified by their data names
#       # and returned together with a human-readable description
#       # of the reason they were deemed unmergeable 
#           'unmergeable' => {
#               'unmergeable_item_a' => ,
#               'unmergeable_item_b' => ,
#               ...
#           }
#       }
##
sub classify_looped_items_by_mergeability
{
    my ($definition_block, $key_tag, $category_tags) = @_;

    my $analysed_items = {
        'mergeable'   => {},
        'unmergeable' => {},
    };

    if (!exists $definition_block->{'values'}{$key_tag}) {
        for my $loop_tag (@{$category_tags}) {
            next if !exists $definition_block->{'values'}{$loop_tag};
            $analysed_items->{'unmergeable'}{$loop_tag} =
                "data item '$loop_tag' appears in a data block that " .
                'does not contain the associated looped list reference ' .
                "data item '$key_tag'";
        }
        return $analysed_items;
    }

    my $key_loop_index = get_item_loop_index($definition_block, $key_tag);
    $key_loop_index = -1 if !defined $key_loop_index;

    my %unmergeable_items;
    my @mergeable_items = ( $key_tag );
    for my $loop_tag (@{$category_tags}) {
        next if !exists $definition_block->{'values'}{$loop_tag};
        my $item_loop_index = get_item_loop_index($definition_block, $loop_tag);
        $item_loop_index = -1 if !defined $item_loop_index;
        if ($key_loop_index != $item_loop_index) {
            $unmergeable_items{$loop_tag} =
                "data item '$loop_tag' and the associated looped list " .
                "reference data item '$key_tag' do not appear in the " .
                'same loop';
            next;
        }
        push @mergeable_items, $loop_tag;
    }

    $analysed_items->{'unmergeable'} = \%unmergeable_items;
    for my $mergeable_item (@mergeable_items) {
        push @{$analysed_items->{'mergeable'}{$mergeable_item}{'values'}},
             @{$definition_block->{'values'}{$mergeable_item}}; 
    }

    return $analysed_items;
}

##
# Merges two looped lists of different sizes into a single looped list.
# Missing data values are replaced by CIF unknown ('?') values.
#
# @param $base_loop_items
#       Reference to a data structure of mergeable data items as
#       returned by the classify_looped_items_by_mergeability()
#       subroutine:
#       {
#           'item_a_name' => {
#               'values' => [ 'base_value_a_1', 'base_value_a_2', ... ],
#           },
#           'item_c_name' => {
#               'values' => [ 'base_value_c_1', 'base_value_c_2', ... ],
#           },
#           ...
#       }
# @param $new_loop_items
#       Reference to a data structure that contains data names
#       and data values of items that should be merged:
#       {
#           'item_a_name' => {
#                       'values' => [
#                           'new_value_a_1',
#                           'new_value_a_2',
#                           'new_value_a_3',
#                           ...
#                       ],
#           },
#           'item_b_name' => {
#                       'values' => [
#                           'new_value_b_1',
#                           'new_value_b_2',
#                           'new_value_b_3',
#                           ...
#                       ],
#           },
#           ...
#       }
# @return \%merged_loops
#       Reference to a data structure that contains a merged looped list
#       of the following form:
#       {
#           'item_a_name' => [
#                         'old_value_a_1',
#                         'old_value_a_2',
#                         ...,
#                         'new_value_a_1',
#                         'new_value_a_2',
#                         'new_value_a_3',
#                         ...
#                       ],
#           'item_b_name' => [
#                         '?',
#                         '?',
#                         ...,
#                         'new_value_b_1',
#                         'new_value_b_2',
#                         'new_value_b_3',
#                         ...
#                       ],
#           'item_c_name' => [
#                         'old_value_c_1',
#                         'old_value_c_2',
#                         ...,
#                         '?',
#                         '?',
#                         '?',
#                         ...
#                       ],
#           ...,
#       }
##
sub merge_looped_values
{
    my ($base_loop_items, $new_loop_items) = @_;

    my %merged_loop_items;
    # In case only new data items were added
    if (!%{$base_loop_items}) {
        for my $tag (keys %{$new_loop_items}) {
            push @{$merged_loop_items{$tag}},
                 @{$new_loop_items->{$tag}{'values'}};
        }
        return \%merged_loop_items;
    }

    # In case data item values need merging
    my $base_loop_length = scalar @{$base_loop_items->{
                            (sort keys %{$base_loop_items})[0]
                           }{'values'}};
    my $new_loop_length = scalar @{$new_loop_items->{
                            (sort keys %{$new_loop_items})[0]
                          }{'values'}};
    for my $tag (keys %{$base_loop_items}) {
        push @{$merged_loop_items{$tag}},
             @{$base_loop_items->{$tag}{'values'}};
        if (exists $new_loop_items->{$tag}) {
            push @{$merged_loop_items{$tag}},
                 @{$new_loop_items->{$tag}{'values'}};
        } else {
            push @{$merged_loop_items{$tag}},
                 ( '?' x $new_loop_length );
        }
    }
    for my $tag (keys %{$new_loop_items}) {
        next if exists $merged_loop_items{$tag};
        if (exists $base_loop_items->{$tag} ) {
            push @{$merged_loop_items{$tag}},
                 @{$base_loop_items->{$tag}{'values'}};
        } else {
            push @{$merged_loop_items{$tag}},
                 ( '?' x $base_loop_length );
        }
        push @{$merged_loop_items{$tag}},
             @{$new_loop_items->{$tag}{'values'}};
    }

    return \%merged_loop_items;
}

##
# Constructs a hash that maps data item names to the names of their
# alternate data items.
#
# @param $dics
#       Reference to a hash of dictionary data structures as returned by
#       the get_ddl1_dic() subroutine.
# @return
#       Reference to a hash of the following form:
#       {
#         'item_1' => [ 'alternate_item_1', ],
#         'item_2' => [ 'alternate_item_2_a', ..., 'alternate_item_2_z', ],
#         ...,
#         'item_n' => [ 'alternate_item_n', ],
#       }
##
sub get_item_to_alternate_items_mapping
{
    my ($dics) = @_;

    my %item_to_alternate_items;
    for my $dic_file_name (sort keys %{$dics}) {
        my $dic = $dics->{$dic_file_name};
        for my $tag (sort keys %{$dic}) {
            my $alternate_item_names = get_alternate_item_names($dic->{$tag});
            next if !@{$alternate_item_names};
            $item_to_alternate_items{$tag} = $alternate_item_names;
        }
    };

    return \%item_to_alternate_items
}

##
# Extracts the data names of items that are marked as alternates of
# the given item.
#
# @param $data_item
#       Data item definition block as returned by the COD::CIF::Parser.
# @return
#       Reference to an array of data names.
##
sub get_alternate_item_names
{
    my ($dic_item) = @_;

    return [] if !exists $dic_item->{'values'}{'_related_item'};
    return [] if !exists $dic_item->{'values'}{'_related_function'};
    # check if items reside in the same loop (or are both unlooped)
    my $related_item_loop = get_item_loop_index($dic_item, '_related_item');
    $related_item_loop = -1 if !defined $related_item_loop;

    my $related_function_loop = get_item_loop_index($dic_item, '_related_function');
    $related_function_loop = -1 if !defined $related_function_loop;
    return [] if $related_item_loop != $related_function_loop;

    my @alternate_item_names;
    for (my $i = 0; $i < @{$dic_item->{'values'}{'_related_item'}}; $i++) {
        next if $dic_item->{'values'}{'_related_function'}[$i] ne 'alternate';
        push @alternate_item_names, $dic_item->{'values'}{'_related_item'}[$i];
    };

    return \@alternate_item_names;
}

# NOTE: the subroutine was copied from the COD::CIF::DDL::DDLm module.
##
# Groups validation issues with identical messages together and replaces
# each group with a single validation issue that contains a summarized
# version of the message.
#
# @param $issues
#       Array reference to a list of validation message data structures
#       of the following form:
#       {
#       # Code of the data block that contains the offending entry
#           'data_block_code' => 'offending_block_code',
#       # Code of the save frame that contains the offending entry.
#       # Might be undefined
#           'save_frame_code' => 'offending_frame_code',
#       # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#       # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#       # Human-readable description of the issue
#           'message' => 'issue description'
#       }
#
# @return $summarised_issues
#       Reference to an array of unique summarised issues.
##
sub summarise_validation_issues
{
    my ($issues) = @_;

    my %message_count;
    for my $issue (@{$issues}) {
        $message_count{$issue->{'message'}}{'count'}++;
        $message_count{$issue->{'message'}}{'representative_issue'} = $issue;
    }

    my @summarised_issues;
    for my $message ( sort keys %message_count ) {
        my $count = $message_count{$message}->{'count'};
        my $issue = $message_count{$message}->{'representative_issue'};
        if( $count > 1 ) {
            $issue->{'message'} = $message . " ($count times)";
        }
        push @summarised_issues, $issue;
    }

    return \@summarised_issues;
}

##
# Validates a CIF data frame against a DDL2 dictionary.
#
# @param $data_block
#       Reference to data block or a save frame as returned by the
#       COD::CIF::Parser.
# @param $dic
#       Reference to a dictionary data structure as returned by
#       the get_ddl2_dict() subroutine.
# @param $options
#       Reference to a hash of options.
# @return
#       Array reference to a list of validation messages. Currently no
#       options are recognised.
##
sub ddl2_validate_data_block
{
    my ( $data_block, $dic, $options ) = @_;

    my @notes;
    for my $tag ( @{$data_block->{'tags'}} ) {
         my $lc_tag = lc $tag;
         my $dic_item = $dic->{$lc_tag};
         push @notes,
              @{ ddl2_validate_data_type( $data_block, $tag, $dic_item ) };
     }

    return \@notes;
}

##
# Returns an array of tags of data items that have superseded the data item.
#
# @param $dic
#       Reference to a dictionary data structure as returned by
#       the get_ddl1_dic() subroutine.
# @param $tag
#       Lowercased name of the data item.
# @return
#       Array of tags of data items that have superseded the data item.
##
sub get_replacement_tags
{
    my ( $dic, $tag ) = @_;

    return [] if !exists $dic->{$tag};
    my $dic_item = $dic->{$tag}{'values'};
    return [] if !exists $dic_item->{'_related_item'};

    my @replace_with;
    # check if data items are deprecated (replaced with other data items)
    for( my $i = 0; $i < @{$dic_item->{'_related_item'}}; $i++ ) {
        if( $dic_item->{'_related_function'}[$i] eq 'replace' ) {
            push @replace_with, $dic_item->{'_related_item'}[$i];
        }
    }

    return \@replace_with;
}

##
# Returns an array of tags of the data items that are required to be present
# in the loop containing the analysed data item.
#
# @param $dic
#       Reference to a dictionary data structure as returned by
#       the get_ddl1_dic() subroutine.
# @param $tag
#       Lowercased name of the data item to analyse.
# @return $list_reference_tags
#       A reference to an array of tags of data items that are required to
#       be present in the loop containing the analysed data items.
##
sub get_list_reference_tags
{
    my ( $dic, $tag ) = @_;

    return [] if !exists $dic->{$tag};
    my $dic_item = $dic->{$tag}{values};
    return [] if !exists $dic_item->{'_list_reference'};

    my @list_reference_tags;
    # _list_reference identifies data items that must collectively be
    # in a loop. They are referenced by the names of their data blocks
    for my $ref_dataname (@{$dic_item->{'_list_reference'}}) {
      for my $dic_tag ( sort keys %{$dic} ) {
          if ( '_' . $dic->{$dic_tag}{values}{'_dataname'} eq $ref_dataname ) {
              push @list_reference_tags, $dic_tag;
          }
      }
    }

    return \@list_reference_tags;
}

##
# Checks the existence of parent (foreign) keys as specified by a DDL1 dictionary.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub check_list_link_parent($$$)
{
    my ( $block, $tag, $dic_item ) = @_;

    return [] if !exists $dic_item->{'values'}{'_list_link_parent'};
    my $parents = $dic_item->{'values'}{'_list_link_parent'};

    # TODO: not handled yet, unsure how to do that
    return [] if @{$parents} > 1;
    my $parent = $parents->[0];

    my @validation_issues;
    if ( !exists $block->{values}{$parent} ) {
        push @validation_issues,
             {
                'test_type'  => 'PRESENCE_OF_PARENT_DATA_ITEM',
                'data_items' => [ $tag, $parent ],
                'message'    =>
                    'missing parent data item -- the ' .
                    q{'} . ( canonicalise_tag($parent) ) . q{'} .
                    ' data item is required by the ' .
                    q{'} . ( canonicalise_tag($tag) ) . q{'} . ' data item',
             };
        return \@validation_issues;
    }

    my %parent_values = map { $_ => 1 } @{$block->{values}{$parent}};

    my @unmatched = uniq sort grep { !exists $parent_values{$_} }
                         @{$block->{values}{$tag}};

    for my $value (@unmatched) {
        # FIXME: these special CIF values should be handled properly
        # by taking their quotation into account
        next if ( $value eq '.' || $value eq '?' );
        push @validation_issues,
             {
                'test_type'  => 'PRESENCE_OF_PARENT_DATA_ITEM_VALUE',
                'data_items' => [ $tag, $parent ],
                'message'    =>
                    'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                    ' contains value ' . q{'} . $value . q{'} . ' that was ' .
                    'not found among the values of the parent data item ' .
                    q{'} . ( canonicalise_tag($parent) ) . q{'},
             };
    }

    return \@validation_issues;
}

##
# Checks enumeration values against a DDL1 dictionary.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl1_dic() subroutine.
# @param $options
#       Reference to a hash of options. The following options are recognised:
#       {
#       # Ignore the case while matching enumerators
#           'ignore_case'  => 0
#       # Treat data values as potentially consisting of a
#       # combination of several enumeration values
#           'treat_as_set' => 0
#       }
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_enumeration_set
{
    my ($data_block, $tag, $dic_item, $options) = @_;

    return [] if !exists $dic_item->{'values'}{'_enumeration'};
    my $enum_set = $dic_item->{'values'}{'_enumeration'};

    my @values;
    for ( my $i = 0; $i < @{$data_block->{'values'}{$tag}}; $i++ ) {
        next if has_special_value($data_block, $tag, $i);
        push @values, $data_block->{'values'}{$tag}[$i];
    }

    my @issues;
    my $is_proper_enum = check_enumeration_set( \@values, $enum_set, $options );
    for ( my $i = 0; $i < @{ $is_proper_enum }; $i++ ) {
        next if !$is_proper_enum->[$i];
        push @issues,
             {
               'test_type'  => 'ENUMERATION_SET',
               'data_items' => [ $tag ],
               'message'    =>
                    'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                    ' value ' . q{'} . $values[$i] . q{'} . ' must be one ' .
                    'of the enumeration values [' .
                        ( join ', ', @{$enum_set} ) .
                    ']'
             };
    };

    return \@issues;
}

##
# Checks values with standard uncertainties against a DDL1 dictionary.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_su
{
    my ( $data_block, $tag, $dic_item ) = @_;

    return [] if is_su_permitted($dic_item);

    my @validation_issues;
    for (my $i = 0; $i < @{$data_block->{'values'}{$tag}}; $i++) {
        next if  has_special_value($data_block, $tag, $i);
        next if !has_numeric_value($data_block, $tag, $i);

        my $value = $data_block->{'values'}{$tag}[$i];
        if ( $value =~ /([(][0-9]+[)])$/ ) {
            push @validation_issues,
                 {
                    'test_type'  => 'SU_ELIGIBILITY',
                    'data_items' => [ $tag ],
                    'message'    =>
                        'data item ' .
                        q{'} . ( canonicalise_tag($tag) ) . q{'} . ' value ' .
                        q{'} . $value . q{'} . ' is not permitted ' .
                        'to contain the appended standard uncertainty value ' .
                        "'$1'",
                }
        }
    }

    return \@validation_issues;
}

##
# Evaluates if the DDL1 dictionary definition permits data item values
# to contain standard uncertainty values.
#
# @param $dic_item
#       Dictionary definition of the data item as returned by get_ddl1_dic()
#       subroutine.
# @return
#       '1' if the s.u. value is permitted,
#       '0' otherwise.
##
sub is_su_permitted
{
    my ( $dic_item ) = @_;

    return 1 if !exists $dic_item->{'values'}{'_type'};
    return 1 if $dic_item->{'values'}{'_type'}[0] ne 'numb';

    my $is_su_permitted = any { $_ eq 'esd' || $_ eq 'su' }
                            @{$dic_item->{'values'}{'_type_conditions'}};

    return $is_su_permitted;
}

##
# Checks if values are within the range specified by a DDL1 dictionary.
#
# In case the value has an associated standard uncertainty (s.u.) value
# the range is extended from [x; y] to [x-3s; y+3s] where 's' is the s.u.
# value. Standard uncertainty values are considered in range comparison
# even if the data item is not formally eligible to have an associated
# s.u. value at all.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_range
{
    my ( $data_block, $tag, $dic_item ) = @_;

    return [] if !exists $dic_item->{'values'}{'_enumeration_range'};

    my $range = parse_range($dic_item->{'values'}{'_enumeration_range'}[0]);
    my $range_type = $dic_item->{'values'}{'_type'}[0];

    my @validation_issues;
    for (my $i = 0; $i < @{$data_block->{'values'}{$tag}}; $i++) {
        next if has_special_value($data_block, $tag, $i);
        next if !has_numeric_value($data_block, $tag, $i) &&
                $range_type eq 'numb';

        my $value = $data_block->{'values'}{$tag}[$i];
        if ( $range_type eq 'char' && length $value > 1 ) {
            push @validation_issues,
                 {
                   'test_type'  => 'ENUM_RANGE.CHAR_STRING_LENGTH',
                   'data_items' => [ $tag ],
                   'message'    =>
                        'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                        ' value ' . q{'} . $value . q{'} . ' violates range ' .
                        'constraints -- the values should consist of a ' .
                        'single character from the range ' .
                        range_to_string( $range, { 'type' => $range_type } )
                 };
            next;
        }

        my $su = $data_block->{'precisions'}{$tag}[$i];
        if ( $range_type eq 'numb' ) {
            $value =~ s/[(][0-9]+[)]$//;
        }

        if( is_in_range( $value,
                { 'type'  => $range_type,
                  'range' => $range,
                  'sigma' => $su, } ) <= 0 ) {
            push @validation_issues,
                 {
                   'test_type' => 'ENUM_RANGE.IN_RANGE',
                   'data_items' => [ $tag ],
                   'message'    =>
                        'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                        ' value ' . q{'} . $data_block->{'values'}{$tag}[$i] . q{'} .
                        ' should be in range ' .
                        range_to_string( $range, { 'type' => $range_type } )
                 };
        }
    }

    return \@validation_issues;
}

##
# Checks if values satisfy the DDL1 data type constraints.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl1_dic() subroutine.
# @param $options
#       Reference to a hash of options. The following options are recognised:
#       {
#       # Treat numbers expressed using the double precision notation
#       # (i.e. 0.42D+7) as proper numbers
#           'allow_double_precision_notation'  => 0
#       }
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_data_type
{
    my ( $data_block, $tag, $dic_item, $options ) = @_;

    my $data_type = get_data_type( $dic_item );
    return [] if !defined $data_type;
    return [] if $data_type ne 'numb';

    my $allow_d_notation = $options->{'allow_double_precision_notation'};

    my @validation_issues;
    for ( my $i = 0; $i < @{$data_block->{'values'}{$tag}}; $i++ ) {
        next if has_special_value($data_block, $tag, $i);
        next if has_numeric_value($data_block, $tag, $i);
        my $value = $data_block->{'values'}{$tag}[$i];

        my $message =
            'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
            ' value ' . q{'} . $value . q{'} . ' violates type constraints -- ';

        my $is_quoted_number = 0;
        if ( $allow_d_notation ) {
            $is_quoted_number = is_ddl1_number( $value );
            if ( $is_quoted_number ) {
                next if has_uqstring_value( $data_block, $tag, $i );
            }
        } else {
            $is_quoted_number = is_cif_1_number( $value );
        }

        my $test_type;
        if ( $is_quoted_number ) {
            $test_type = 'TYPE_CONSTRAINT.QUOTED_NUMERIC_VALUES';
            $message .=
                 'numeric values should be written without the use ' .
                 'of quotes or multiline value designators'
        } else {
             $test_type = 'TYPE_CONSTRAINT.PROPER_NUMERIC_VALUES';
             $message .=
                 'the value should be a numerically interpretable string, ' .
                 "e.g. '42', '42.00', '4200E-2'"
        };

        push @validation_issues,
             {
               'test_type' =>  $test_type,
               'data_items' => [ $tag ],
               'message'    => $message
             };
    }

    return \@validation_issues;
}

##
# Evaluates if the given value is a numeric one according to the CIF_1.1 syntax.
#
# @param $value
#       Value to be evaluated.
# @return
#       '1' if the value is numeric,
#       '0' otherwise.
##
sub is_cif_1_number
{
    my ($value) = @_;
    my $u_int   = '[0-9]+';
    my $int     = "[+-]?${u_int}";
    my $exp     = "[eE][+-]?${u_int}";
    my $u_float = "(?:${u_int}${exp})|(?:[0-9]*[.]${u_int}|${u_int}+[.])(?:${exp})?";
    my $float   = "[+-]?(?:${u_float})";

    return ( $value =~ m/^(?:${int}|${float})$/ ) ? 1 : 0;
}

##
# Evaluates if the given value is a numeric one according to the DDL1 core
# dictionary.
#
# @param $value
#       Value to be evaluated.
# @return
#       '1' if the value is numeric,
#       '0' otherwise.
##
sub is_ddl1_number
{
    my ($value) = @_;
    my $u_int   = '[0-9]+';
    my $int     = "[+-]?${u_int}";
    my $exp     = "[eEdD][+-]?${u_int}";
    my $u_float = "(?:${u_int}${exp})|(?:[0-9]*[.]${u_int}|${u_int}+[.])(?:${exp})?";
    my $float   = "[+-]?(?:${u_float})";

    return ( $value =~ m/^(?:${int}|${float})$/ ) ? 1 : 0;
}

##
# Checks if values satisfy the DDL2 data type constraints.
#
# @param $data_frame
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl2_dict() subroutine.
# @return
#       Array reference to a list of validation messages.
##
sub ddl2_validate_data_type
{
    my ( $data_frame, $tag, $dic_item ) = @_;

    # FIXME: the DDL2 data type validation is much more complex than
    # assumed in the current implementation. For example, the basic
    # data type are described in the DDL2 dictionary using regular
    # expressions, but these data types can be extended or even overridden
    # in any other DDL2 dict
    return [] if !$dic_item->{'values'}{'_item_type.code'};
    return [] if  $dic_item->{'values'}{'_item_type.code'}[0] ne 'float' &&
                  $dic_item->{'values'}{'_item_type.code'}[0] ne 'int';

    my @validation_messages;
    for ( my $i = 0; $i < @{$data_frame->{'values'}{$tag}}; $i++ ) {
        next if has_special_value($data_frame, $tag, $i);
        next if has_numeric_value($data_frame, $tag, $i);
        push @validation_messages,
            'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
            " value '$data_frame->{'values'}{$tag}[$i]' is of type " .
            q{'} . $data_frame->{'types'}{$tag}[$i] . q{'} .
            ' while it should be numeric, i.e. \'FLOAT\' or \'INT\'';
    }

    return \@validation_messages;
}

##
# Checks if data names are defined in at least one of the given dictionaries.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $dics
#       Reference to a hash of dictionaries as returned by the
#       get_ddl1_dic() or get_ddl2_dict() subroutines.
# @param $report_local_tags
#       Boolean denoting if the local data item names should be reported as
#       unrecognised data names.
# @return
#       Array reference to a list of validation messages.
##
sub report_unrecognised_data_names
{
    my ($data_block, $dics, $report_local_tags) = @_;

    my @validation_messages;

    my @tags = sort @{$data_block->{'tags'}};
    if ( !$report_local_tags ) {
        @tags = grep { !is_general_local_data_name($_) } @tags;
    }

    for my $dic ( values %{$dics} ) {
        @tags = grep { !exists $dic->{lc $_} } @tags;
    }

    @validation_messages = map {
              'definition of the ' . q{'} . ( canonicalise_tag($_) ) . q{'} .
              ' data item was not found in the provided dictionaries';
          } @tags;

    return \@validation_messages;
}

sub validate_block_loops
{
    my ($data_block, $dic) = @_;

    my $list_references = get_all_list_references($dic);

    my @validation_issues;
    for my $loop_tags ( @{$data_block->{'loops'}} ) {
        push @validation_issues,
             @{ validate_loop_reference_items( $loop_tags, $dic ) };

        my $covered_sets = select_covered_reference_sets($list_references, $loop_tags);
        for my $reference_tags ( @{$covered_sets} ) {
            next if !@{$reference_tags};
            $reference_tags = [ map { canonical_tag_name($_) } @{$reference_tags} ];
            if ( @{$reference_tags} == 1 ) {
                push @validation_issues,
                        @{ check_simple_key_uniqueness(
                           $data_block,
                           $reference_tags->[0],
                           get_data_type( $dic->{$reference_tags->[0]} )
                        ) }
            } else {
                my %ref_types;
                for my $data_name ( @{$reference_tags} ) {
                    $ref_types{$data_name} =
                            get_data_type( $dic->{$data_name} );
                }

                push @validation_issues,
                        @{ check_composite_key_uniqueness(
                           $data_block,
                           $reference_tags,
                           \%ref_types
                        ) }
            }
        }
    }

    for my $group (@{$list_references}) {
        my @tags = sort map { canonical_tag_name( $_ ) }
                            @{$group->{'key_data_items'}},
                            @{$group->{'sub_data_items'}};
        my %loops;
        for my $tag (@tags) {
            next if !$data_block->{'inloop'}{$tag};
            $loops{$data_block->{'inloop'}{$tag}} = 1;
        }
        next if keys %loops <= 1;
        my $message =
            'data items [' .
                ( join ', ', map { q{'} . ( canonicalise_tag($_) ) . q{'} } @tags ) .
            '] must all appear in the same loop';
        push @validation_issues,
             {
                'test_type'  => 'LOOP.INTEGRITY',
                'data_items' => \@tags,
                'message'    => $message
             }
    }

    return \@validation_issues;
}

##
# Checks the uniqueness constraint of a simple loop key that consists
# of a single data item.
#
# @param $data_frame
#       Data frame in which the data item resides as returned
#       by the COD::CIF::Parser.
# @param $data_name
#       Data name of the data item which acts as the unique loop key.
# @param $key_type
#       Data type of the key as defined in the DDL1 dictionary.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub check_simple_key_uniqueness
{
    my ($data_frame, $data_name, $key_type) = @_;

    my $unique_key_violations =
            get_simple_unique_key_violations($data_frame, $data_name, $key_type);

    my @messages;
    for my $key ( sort keys %{$unique_key_violations} ) {
        push @messages,
             'data item ' . q{'} . ( canonicalise_tag($data_name) ) . q{'} .
             ' acts as a loop key, but the associated data values are not ' .
             "unique -- value '$key' appears " .
                ( scalar @{$unique_key_violations->{$key}} ) . ' times as [' .
                ( join ', ', map { "'$_'" } @{$unique_key_violations->{$key}} ) .
             ']';
    }

    my @validation_issues;
    for my $message ( @messages ) {
        push @validation_issues,
             {
                'test_type'  => 'SIMPLE_KEY_UNIQUENESS',
                'data_items' => [ $data_name ],
                'message'    => $message
             }
    }

    return \@validation_issues;
}

##
# Identifies values that violate a simple unique key constraint in
# the given data block.
#
# @param $data_frame
#       Data frame in which the data items reside as returned
#       by the COD::CIF::Parser.
# @param $data_name
#       Data name of the data item which acts as the unique loop key.
# @param $key_type
#       Data type of the key data item as defined in the DDL1 dictionary.
# @return $unique_key_violations
#       Reference to a data structure that details the violations
#       of the unique key constraint. The data structure takes the
#       following form:
#       {
#       # canonicalised values serve as hash keys and point to arrays
#       # that contain the duplicate values in their original form, i.e.:
#           '10'   => [ '10', '1E+1', '1000E-2', ... ],
#           'text' => [ 'text', 'text', 'text' ],
#            ...,
#       }
##
sub get_simple_unique_key_violations
{
    my ($data_frame, $data_name, $key_type) = @_;

    my %grouped_values;
    for ( my $i = 0; $i < @{$data_frame->{'values'}{$data_name}}; $i++ ) {
        # TODO: special values are silently skipped, but maybe they should
        # still be reported somehow since having special value in a key
        # might not be desirable...
        next if has_special_value($data_frame, $data_name, $i);
        my $value = $data_frame->{'values'}{$data_name}[$i];
        my $canon_value = canonicalise_value( $value, $key_type );
        push @{$grouped_values{$canon_value}}, $value;
    };

    my %unique_key_violations;
    for my $key ( keys %grouped_values ) {
        next if @{$grouped_values{$key}} < 2;
        $unique_key_violations{$key} = $grouped_values{$key};
    }

    return \%unique_key_violations;
}

##
# Checks the uniqueness constraint of a composite loop key that consists
# of multiple data items.
#
# @param $data_frame
#       Data frame in which the data items reside as returned
#       by the COD::CIF::Parser.
# @param $data_names
#       Reference to an array of data item names that act as
#       the composite unique loop key.
# @param $data_types
#       Reference to a hash containing the data types of
#       the key data items as defined in a DDL1 dictionary.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub check_composite_key_uniqueness
{
    my ($data_frame, $data_names, $data_types) = @_;

    if ( !@{$data_names} ) {
        return [];
    }

    my $violations = get_composite_unique_key_violations(
                        $data_frame,
                        $data_names,
                        $data_types
                     );

    my @messages;
    for my $violation ( @{$violations} ) {
        my @duplicates;
        for my $values ( @{$violation->{'duplicate_values'}} ) {
            push @duplicates,
                 '[' . ( join ', ', map { "'$_'" } @{$values} ) . ']';
        }

        push @messages,
             'data items [' .
                ( join ', ',
                    map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                        @{$data_names} ) .
             '] act as a composite loop key, but the associated data values ' .
             'are not collectively unique -- values [' .
                ( join ', ', map { "'$_'" } @{$violation->{'canonical_values'}} ) .
             '] appear ' .
             ( scalar @{$violation->{'duplicate_values'}} ) . ' times as ' .
             ( join ', ', @duplicates );
    }

    my @validation_issues;
    for my $message ( @messages ) {
        push @validation_issues,
             {
                'test_type'  => 'COMPOSITE_KEY_UNIQUENESS',
                'data_items' => $data_names,
                'message'    => $message
             }
    }

    return \@validation_issues;
}

##
# Identifies values that violate a composite unique key constraint in
# the given data block.
#
# @param $data_frame
#       Data frame in which the data items reside as returned
#       by the COD::CIF::Parser.
# @param $data_names
#       Reference to an array of data item names that act as
#       the composite unique loop key.
# @param $data_types
#       Reference to a hash containing the data types of
#       the key data items as defined in a DDL1 dictionary.
# @return $unique_key_violations
#       Reference to an array of data structures that detail
#       the violations of the composite unique key constraint.
#       Each data structure takes the following form:
#       {
#       # Values in the canonical form
#           'canonical_values' => [ 'text_1', '10', 'text_2' ],
#       # Duplicate values in their original form
#           'duplicate_values' => [
#                                   [ 'text_1', '10',     'text_2' ],
#                                   [ 'text_1', '1E+1',   'text_2' ],
#                                   ...,
#                                   [ 'text_1', '1000E-1', 'text_2' ]
#                                 ]
#       }
##
sub get_composite_unique_key_violations
{
    my ($data_frame, $data_names, $data_types) = @_;

    my $join_char = "\x{001E}";
    my %grouped_values;
    my $loop_size = @{$data_frame->{'values'}{$data_names->[0]}};
    for ( my $i = 0; $i < $loop_size; $i++ ) {
        my $composite_key = '';
        my @composite_key_values;
        my $has_special_value = 0;
        for my $data_name ( @{$data_names } ) {
            # TODO: composite keys containing special values are silently
            # skipped, but maybe they should still be reported somehow since
            # having special value in a key might render it unusable
            if ( has_special_value($data_frame, $data_name, $i) ) {
                $has_special_value = 1;
                last;
            };

            my $value = $data_frame->{'values'}{$data_name}[$i];
            my $data_type = $data_types->{lc $data_name};
            push @composite_key_values, $value;
            $composite_key .= canonicalise_value( $value, $data_type ) .
                              "$join_char";
        }
        if (!$has_special_value) {
            push @{$grouped_values{$composite_key}}, \@composite_key_values;
        }
    }

    my @unique_key_violations;
    for my $key (sort keys %grouped_values) {
        next if @{$grouped_values{$key}} < 2;
        my %violation;
        $violation{'canonical_values'} = [ split /$join_char/, $key ];
        $violation{'duplicate_values'} = $grouped_values{$key};
        push @unique_key_violations, \%violation;
    }

    return \@unique_key_violations;
}

##
# Groups data items based on their list references declared in the DDL1
# dictionary.
#
# @param $data_names
#       Reference to an array of data item names that should be grouped.
# @param $dic
#       Reference to a DDL1 dictionary structure as returned by the
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of data structures of the following form:
#       {
#       # names of the data items comprising the list reference
#           'key_data_items' => [ '_key_data_name_1', '_key_data_name_2' ],
#       # names of the data items that share the same list reference
#           'sub_data_items' => [ '_item_1', _item_2', '_item_3' ]
#       },
##
sub group_items_by_list_references
{
    my ( $data_names, $dic ) = @_;

    my %grouped_items;
    my $join_char = "\x{001E}";
    for my $tag ( map { lc } @{$data_names} ) {
        next if !exists $dic->{$tag};
        my $key_data_names = get_list_reference_tags($dic, $tag);
        next if !@{$key_data_names};

        my $key = join $join_char, map { lc } @{$key_data_names};
        if ( !defined $grouped_items{$key} ) {
            $grouped_items{$key}{'key_data_items'} = $key_data_names
        }
        push @{$grouped_items{$key}{'sub_data_items'}}, lc $tag;
    }
    my @item_groups = map { $grouped_items{$_} } sort keys %grouped_items;

    return \@item_groups;
}

##
# Selects those reference sets that can be constructed from the given data
# items.
#
# @param $list_references
#       Reference to an array of list reference groups as returned by
#       the group_items_by_list_references() subroutine.
# @param $data_items
#       Reference to an array of data items names that can be used to
#       construct the set.
# @return $covered_list_references
#       Reference to an array of list reference sets.
##
sub select_covered_reference_sets
{
    my ( $list_references, $data_items ) = @_;

    my @covered_list_references;
    for my $group ( @{$list_references} ) {
        my $key_data_items = $group->{'key_data_items'};
        next if !@{$key_data_items};

        my $is_eligible_ref_set = 1;
        for my $key_data_item ( @{$key_data_items} ) {
            $is_eligible_ref_set &=
                    any { lc $key_data_item eq lc $_ } @{$data_items};
        }
        if ( $is_eligible_ref_set ) {
            push @covered_list_references, $key_data_items;
        }
    }

    return \@covered_list_references;
}

##
# Gets all list reference sets that are described in the given DDL1 dictionary.
#
# @param $dic
#       Reference to a DDL1 dictionary structure as returned by the
#       get_ddl1_dic() subroutine.
# @return $list_ref_groups
#       Reference to an array of list reference groups as returned by
#       the group_items_by_list_references() subroutine.
##
sub get_all_list_references
{
    my ( $dic ) = @_;

    my $list_ref_groups =
             group_items_by_list_references( [ keys %{$dic} ], $dic );

    return $list_ref_groups;
}

##
# Checks if a loop contains reference data items that together act as a
# primary loop key as specified by a DDL1 dictionary.
#
# @param $loop_tags
#       Reference to an array of data names residing in a loop.
# @param $dic
#       Reference to a DDL1 dictionary structure as returned by the
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_loop_reference_items
{
    my ( $loop_tags, $dic ) = @_;

    my $item_ref_groups = group_items_by_list_references( $loop_tags, $dic );

    my @reported_key;
    my @validation_issues;
    for my $group ( @{$item_ref_groups} ) {
        for my $key_tag ( @{$group->{'key_data_items'}} ) {
            next if any { $_ eq $key_tag } @reported_key;
            next if any { lc $_ eq $key_tag } @{$loop_tags};

            push @reported_key, $key_tag;
            my $message =
                'missing looped list reference data item -- the ' .
                q{'} . ( canonicalise_tag($key_tag) ) . q{'} .
                ' data item must be provided in the loop containing the [' .
                    ( join ', ',
                        map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                            @{$group->{'sub_data_items'}} ) .
                '] data items';

            push @validation_issues,
                 {
                    'test_type' => 'KEY_ITEM_PRESENCE',
                    'data_items' => [ $key_tag ],
                    'message'    => $message
                 }
        }
    }

    return \@validation_issues;
}

##
# Checks if data items in a looped list all belong to the same category
# as specified by the defining DDL1 dictionaries.
#
# @param $loop_tags
#       Reference to an array of data names residing in a loop.
# @param $item_to_category
#       Reference to a hash that maps each data item to
#       the category that the it belongs to as returned
#       by the get_item_to_category_mapping() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub check_loop_category_homogeneity
{
    my ( $loop_tags, $item_to_category ) = @_;

    my %category_items;
    for my $tag ( @{$loop_tags} ) {
        next if !exists $item_to_category->{$tag};
        push @{$category_items{$item_to_category->{$tag}}}, $tag;
    }

    my @validation_issues;
    my @categories = sort keys %category_items;
    if (@categories > 1) {
        push @validation_issues,
             {
                'test_type'  => 'LOOP.CATEGORY_HOMOGENEITY',
                'data_items' => [ @{$loop_tags} ],
                'message'    =>
                    'data items in a looped list must all belong ' .
                    'to the same category -- ' .
                    ( join ', ',
                        map {
                            'data items [' .
                            ( join ', ',
                                map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                                    @{$category_items{$_}} ) .
                            "] belong to the '$_' category"
                        } @categories
                    ),
             }
    }

    return \@validation_issues;
}

##
# Checks if mandatory data items are present in a looped list as specified
# by the defining DDL1 dictionaries.
#
# @param $loop_tags
#       Reference to an array of data names residing in a loop.
# @param $merged_properties
#       Reference to a data structured of merged validation dictionary
#       properties as returned by the get_merged_dictionary_properties()
#       subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub check_loop_mandatory_item_presence
{
    my ($loop_tags, $merged_properties) = @_;

    my $item_to_category = $merged_properties->{'item_to_category'};
    my $mandatory_items  = $merged_properties->{'category_to_mandatory_items'};
    my $subkey_to_key    = $merged_properties->{'subcategory_key_to_category_key'};
    my $item_to_alternate_items = $merged_properties->{'item_to_alternate_items'};

    my %key_to_subkeys;
    for my $subkey (keys %{$subkey_to_key}) {
        push @{$key_to_subkeys{$subkey_to_key->{$subkey}}}, $subkey;
    };

    my %category_to_items;
    for my $tag ( @{$loop_tags} ) {
        next if !exists $item_to_category->{lc $tag};
        push @{$category_to_items{$item_to_category->{lc $tag}}}, $tag;
    }

    my @validation_issues;
    for my $category_name (sort keys %category_to_items) {
        next if !exists $mandatory_items->{$category_name};
        my $category_loop_tags = $category_to_items{$category_name};
        for my $mandatory_item (@{$mandatory_items->{$category_name}}) {
            $mandatory_item = canonical_tag_name($mandatory_item);
            next if any { $_ eq $mandatory_item } @{$category_loop_tags};

            my $subkey_alternative_found = 0;
            for my $subkey_alternative (@{$key_to_subkeys{$mandatory_item}}) {
                if ( any { $_ eq $subkey_alternative } @{$category_loop_tags} ) {
                    $subkey_alternative_found = 1;
                    last;
                }
            }
            next if $subkey_alternative_found;

            my $alternate_item_found = 0;
            for my $alt_item (@{$item_to_alternate_items->{$mandatory_item}}) {
                if ( any { $_ eq $alt_item } @{$loop_tags} ) {
                    $alternate_item_found = 1;
                    last;
                }
            }
            next if $alternate_item_found;

            push @validation_issues,
                 {
                   'test_type'  => 'LOOP.MANDATORY_ITEM_PRESENCE',
                   'data_items' => [ @{$category_loop_tags} ],
                   'message'    =>
                        'missing mandatory looped list data item -- the ' .
                        q{'} . ( canonicalise_tag($mandatory_item) ) . q{'} .
                        ' data item must be provided in the loop containing ' .
                        'the [' .
                            ( join ', ',
                                map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                                    @{$category_loop_tags} ) .
                        '] data items',
                 }
        }
    }

    return \@validation_issues;
}

##
# Checks if a data items have a collectively unique value as specified
# by a DDL1 dictionary.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       Data name of the item that potentially identifies the
#       collectively unique data items.
# @param $dic
#       Reference to a DDL1 dictionary structure as returned by the
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_list_unique_key
{
    my ( $data_block, $tag, $dic ) = @_;

    return [] if !defined $dic->{$tag};
    my $dic_item = $dic->{$tag};

    return [] if !defined $data_block->{'inloop'}{$tag};
    my $loop_index = $data_block->{'inloop'}{$tag};

    return [] if !exists $dic_item->{'values'}{'_list_uniqueness'};
    my $unique_key_items = [ map { lc } @{$dic_item->{'values'}{'_list_uniqueness'}} ];

    my %loop_tags = map { lc $_ => 1 } @{$data_block->{'loops'}[$loop_index]};
    my @key_loop_tags = grep { exists $loop_tags{$_} } @{$unique_key_items};
    return [] if !@key_loop_tags;

    my @validation_issues;
    @key_loop_tags = map { canonical_tag_name($_) } @key_loop_tags;
    if ( @key_loop_tags == 1 ) {
        my $key_type = get_data_type( $dic->{$key_loop_tags[0]} );
        my $violations = get_simple_unique_key_violations(
                                $data_block,
                                $key_loop_tags[0],
                                $key_type
                         );

        my @messages;
        for my $key ( sort keys %{$violations} ) {
            push @validation_issues,
                 {
                    'test_type'  => 'LOOP.INDIVIDUAL_UNIQUE_VALUES',
                    'data_items' => [ $key_loop_tags[0] ],
                    'message'    =>
                        'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                        ' requires data item ' .
                        q{'} . ( canonicalise_tag($key_loop_tags[0]) ) . q{'} .
                        ' to have unique values, but the associated values ' .
                        "are not unique -- value '$key' appears " .
                        ( scalar @{$violations->{$key}} ) . ' times as [' .
                        ( join ', ', map { "'$_'" } @{$violations->{$key}} ) .
                        ']',
                }
        }
    } else {
        my %key_item_types;
        for my $data_name ( @key_loop_tags ) {
            $key_item_types{$data_name} = get_data_type( $dic->{$data_name} );
        }
        my $violations = get_composite_unique_key_violations(
                            $data_block,
                            \@key_loop_tags,
                            \%key_item_types
                         );

        my @messages;
        for my $violation ( @{$violations} ) {
            my @duplicates;
            for my $values ( @{$violation->{'duplicate_values'}} ) {
                push @duplicates,
                     '[' . ( join ', ', map { "'$_'" } @{$values} ) . ']';
            }

            push @validation_issues,
                 {
                    'test_type'  => 'LOOP.COLLECTIVELY_UNIQUE_VALUES',
                    'data_items' => [ @key_loop_tags ],
                    'message'    =>
                         'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                         ' requires data items [' .
                            ( join ', ',
                                map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                                    @key_loop_tags ) .
                         '] to have collectively unique values, but ' .
                         'the associated values are not collectively ' .
                         'unique -- values [' .
                            ( join ', ', map { "'$_'" }
                                @{$violation->{'canonical_values'}} ) .
                        '] appear ' .
                            ( scalar @{$violation->{'duplicate_values'}} ) .
                            ' times as ' . ( join ', ', @duplicates ),
                }
        }
    }

    return \@validation_issues;
}

##
# Checks if a data item reside in a correct loop context as specified
# by a DDL1 dictionary.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic_item
#       Dictionary definition of the validated data item as returned by
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub validate_list_mandatory
{
    my ( $data_block, $tag, $dic_item ) = @_;

    my $must_be_looped = get_list_constraint_type( $dic_item );
    return [] if !defined $must_be_looped;

    my @validation_issues;
    if ( !exists $data_block->{'inloop'}{$tag} ) {
        if ( $must_be_looped eq 'yes' ) {
            push @validation_issues,
                 {
                    'test_type'  => 'LOOP_CONTEXT.MUST_APPEAR_IN_LOOP',
                    'data_items' => [ $tag ],
                    'message'    =>
                        'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                        ' must appear in a loop',
                 }
        }
    } elsif ( $must_be_looped eq 'no' ) {
        push @validation_issues,
             {
                'test_type'  => 'LOOP_CONTEXT.MUST_NOT_APPEAR_IN_LOOP',
                'data_items' => [ $tag ],
                'message'    =>
                    'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                    ' must not appear in a loop',
             }
    }

    return \@validation_issues;
}

##
# Checks if a data item is deprecated as specified by a DDL1 dictionary.
# Cases when both the replaced and the replacing data item reside in the
# same data block are also reported.
#
# @param $data_block
#       Data frame that should be validated as returned by the COD::CIF::Parser.
# @param $tag
#       The data name of the item that should be validated.
# @param $dic
#       Reference to a DDL1 dictionary structure as returned by the
#       get_ddl1_dic() subroutine.
# @return
#       Reference to an array of validation issue data structures of
#       the following form:
#       {
#           # Code of the validation test that generated the issue
#           'test_type' => 'TEST_TYPE_CODE',
#           # Names of the data items examined by the the validation test
#           'data_items' => [ 'data_name_1', 'data_name_2', ... ],
#           # Validation message that should be displayed to the user
#           'message'    => 'a detailed validation message'
#       }
##
sub report_deprecated
{
    my ($data_block, $tag, $dic) = @_;

    my $replacement_tags = get_replacement_tags($dic, lc $tag);
    return [] if !@{$replacement_tags};

    my @validation_issues;

    push @validation_issues,
         {
           'test_type'  => 'ITEM_REPLACEMENT.PRESENCE_OF_REPLACED',
           'data_items' => [ $tag ],
           'message'    =>
                'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
                ' has been replaced by the [' .
                    join(', ', map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                                                        @{$replacement_tags}) .
                '] data items'
         };

    my @existing_replacement_tags =
        grep { exists $data_block->{values}{$_} } @{$replacement_tags};
    if( @existing_replacement_tags ) {
        my $message =
            'data item ' . q{'} . ( canonicalise_tag($tag) ) . q{'} .
            ' appears in the same data block as its replacement data items [' .
                join( ', ', map { q{'} . ( canonicalise_tag($_) ) . q{'} }
                                                        @{$replacement_tags} ) .
            ']';
        push @validation_issues,
             {
                'test_type'  => 'ITEM_REPLACEMENT.SIMULTANEOUS_PRESENCE',
                'data_items' => [ $tag ],
                'message'    => $message
             }
    }

    return \@validation_issues;
}

##
# Determines the DDL generation of the provided dictionary using ad hock
# criteria.
#
# @param $data
#       Reference to parsed CIF dictionary file as returned by the
#       COD::CIF::Parser.
# @return
#       A string that represents the DDL generation or an undefined
#       value if the generation could not be determined. The following
#       string may be returned:
#           '1' for DDL1
#           '2' for DDL2
#           'm' for DDLm
##
sub determine_ddl_generation
{
    my ( $data ) = @_;

    if ( any { $_->{'name'} eq 'on_this_dictionary' } @{$data} ) {
        return '1';
    }

    my $block = $data->[0];
    if ( exists $block->{'values'}{'_dictionary.datablock_id'}) {
        return '2';
    }

    if ( exists $block->{'values'}{'_dictionary.ddl_conformance'} &&
         $block->{'values'}{'_dictionary.ddl_conformance'}[0] =~ /^3[.]/ ) {
        return 'm';
    }

    return;
}

##
# Evaluates if the data item contains an unquoted string value as specified by
# the CIF working specification.
#
# @param $frame
#       Data frame that contains the data item as returned by the COD::CIF::Parser.
# @param $data_name
#       Name of the data item.
# @param $index
#       The index of the data item value to be evaluated.
# @return
#       Boolean value denoting if the data item contains an unquoted string
#       value.
##
sub has_uqstring_value
{
    my ( $data_frame, $data_name, $index ) = @_;

    my $type = defined $data_frame->{'types'}{$data_name}[$index] ?
               $data_frame->{'types'}{$data_name}[$index] : 'UQSTRING' ;

    return $type eq 'UQSTRING';
};
