import os
import json
import copy
from dataclasses import dataclass, field, asdict
from typing import Union, Mapping, Dict, List, Optional, Tuple
from graphxplore.MetaDataHandling import MetaData, VariableType
from graphxplore.Basis import BaseUtils
from .Conditionals import LogicOperator, LogicOperatorParser, AlwaysTrueOperator
from .Conclusions import Conclusion
from .meta_lattice import MetaLattice
from .variable_mapping import VariableMapping
from .data_structure_transformer import TableMappingType
[docs]
@dataclass
class TableMapping:
"""
Each target table x must have some relationship to one or multiple source tables.
Using this relationship, single units of source data are formed. Variable mappings are applied to these
units to form a single output row of *x*. Variables of the related source tables and their foreign
tables (and their foreign tables, and so on...) will have a single value (might be a missing value) in
this unit of source data. These variables are called singular variables. Variables of inverted
foreign tables (*a* is an inverted foreign table of *b*, if *b* is a foreign table of *a*), might have
multiple values in a unit of source data (e.g. timeseries, or multiple blood measurements for a single
patient). They are called aggregate variables. For a table mapping you have the following options:
- *x* has a one-to-one relationship with a single source table *y*. Primary key values are copied
from *y* to *x*. A unit of source data is formed by a single row of *y* and rows from
foreign tables and/or inverted foreign tables of *y*. (Most common option)
- *x* has a one-to-many relationship with multiple source tables. The data of the source tables (and
foreign tables or inverted foreign tables) will be combined to form a single unit of source data.
This can be done in two ways:
- The data of the source tables can be merged. Here, data rows from different source tables are
combined to a single unit, if the row's primary key values are identical. If a primary key value
of a source table has no analog in another source table, its row is taken independently.
- The data of the source tables can be concatenated. Here, the source tables are processed
independently one after the other to form units of source data together with their foreign tables or
inverted foreign tables. The primary key values of *x* will be 0-indexed integers.
- If *x* is a foreign table of another target table *x'*, the relationship to source tables can be
inherited from *x'*. If *x'* itself inherits the relationship of another target table *x''*, this
inheritance is propagated to *x*. The primary key values of *x* will be 0-indexed integers and all its rows
will be de-duplicated. The primary key values of *x* will be used as foreign key values in *x'*.
Optionally, you can define a condition to filter out units of source data that should not be considered in
the mapping. If the condition evaluates to ``False`` for a unit of source data, it is fully removed from the
transformation process of this target table. By default, the
:class:`~graphxplore.DataMapping.Conditionals.AlwaysTrueOperator` is used and all source data is taken into the
transformation
"""
type : Optional[TableMappingType] = None
source_tables : List[str] = field(default_factory=list)
to_inherit : Optional[str] = None
condition : LogicOperator = AlwaysTrueOperator()
[docs]
def to_dict(self) -> Dict[str, Union[str, List[str], None]]:
result = asdict(self)
if self.type is not None:
result['type'] = result['type'].value
result['condition'] = str(result['condition'])
return result
[docs]
@staticmethod
def from_dict(input_dict: Dict[str, Union[str, List[str], None]]) -> 'TableMapping':
if 'type' not in input_dict:
raise AttributeError('You must specify the type of table mapping at the key "type" in your dict')
if input_dict['type'] is None:
mapping_type = None
else:
if input_dict['type'] not in TableMappingType._value2member_map_.keys():
raise AttributeError('Mapping type "' + input_dict['type'] + '" not recognized')
mapping_type = TableMappingType._value2member_map_[input_dict['type']]
if 'condition' not in input_dict:
raise AttributeError('You must specify a condition in your table mapping dict at the key "condition"')
condition = LogicOperatorParser.from_string(input_dict['condition'])
if 'source_tables' not in input_dict:
raise AttributeError('You must specify the source tables of the table mapping at the key "source_tables"')
if 'to_inherit' not in input_dict:
raise AttributeError('You must specify the table to inherit from at the key "to_inherit"')
return TableMapping(type=mapping_type, source_tables=input_dict['source_tables'],
to_inherit=input_dict['to_inherit'],condition=condition)
[docs]
class DataMapping:
"""This class summarizes all individual :class:`VariableMapping` objects for a whole dataset via a dictionary of
table -> variable -> :class:`VariableMapping`
:param source: The :class:`~graphxplore.MetaDataHandling.MetaData` of the source dataset
:param target: The :class:`~graphxplore.MetaDataHandling.MetaData` of the source structure
:param table_mappings: The table mapping for each table. Can be filled later, defaults to ``None``.
:param variable_mappings: The dictionary of all variable mappings for all tables. Can be filled later, defaults
to ``None``.
"""
def __init__(self, source : MetaData, target : MetaData,
table_mappings: Optional[Mapping[str, TableMapping]] = None,
variable_mappings : Optional[Mapping[str, Mapping[str, VariableMapping]]] = None):
"""Constructor method
"""
for table in source.get_table_names():
if not source.has_primary_key(table):
raise AttributeError('Source table "' + table + '" has no assigned primary key')
for table in target.get_table_names():
if not target.has_primary_key(table):
raise AttributeError('Target table "' + table + '" has no assigned primary key')
self.source = source
self.target = target
self.source_lattice = MetaLattice.from_meta_data(self.source)
self.target_lattice = MetaLattice.from_meta_data(self.target)
if table_mappings is not None:
for table in self.target.get_table_names():
if table not in table_mappings:
raise AttributeError('Target table "' + table + '" does not exist in table mapping dict')
self.table_mappings = table_mappings
else:
self.table_mappings = {table : TableMapping() for table in self.target.get_table_names()}
for table, table_mapping in self.table_mappings.items():
self._check_table_mapping(table, table_mapping)
if variable_mappings is not None:
for table in self.target.get_table_names():
if table not in variable_mappings:
raise AttributeError('Target table "' + table + '" does not exist in variable mapping dict')
pk = self.target.get_primary_key(table)
if pk in variable_mappings[table]:
raise AttributeError('Primary key of target table "' + table
+ '" should not exist in variable mapping dict')
table_mapping_assigned = self.table_mappings[table].type is not None
singular_source_tables, aggregation_source_tables = None, None
if table_mapping_assigned:
singular_source_tables, aggregation_source_tables = self.get_source_tables_for_var_mappings(table)
for variable in self.target.get_variable_names(table):
if not self.variable_should_get_mapped(table, variable):
if variable in variable_mappings[table]:
if variable == pk:
raise AttributeError('Primary key of target table "' + table
+ '" should not exist in variable mapping dict')
else:
raise AttributeError(
'Variable "' + variable
+ '" is a foreign key that is used for inheritance of table mapping of "'
+ table + '". It should not have a variable mapping')
else:
if variable not in variable_mappings[table]:
raise AttributeError('Target variable "' + variable + '" of target table "' + table
+ '" missing in variable mappings')
var_mapping = variable_mappings[table][variable]
if var_mapping.target_table != table:
raise AttributeError('Mismatch in target table "' + table
+ '" in variable mapping dict and "' + var_mapping.target_table
+ '" in variable mapping')
if var_mapping.target_variable != variable:
raise AttributeError('Mismatch in target variable "' + variable + '" of target table "'
+ table + '" in variable mapping dict and "'
+ var_mapping.target_variable + '" in variable mapping')
if not table_mapping_assigned:
if len(var_mapping.cases) > 0:
raise AttributeError('Target variable "' + variable + '" of table "' + table
+ '" already mapped in variable mapping dict, but its table is '
'still unmapped')
else:
self._check_var_mapping(
var_mapping, singular_source_tables, aggregation_source_tables)
self.variable_mappings = variable_mappings
else:
self.variable_mappings = {table : {variable : VariableMapping(table, variable, [])
for variable in self.target.get_variable_names(table)
if variable != self.target.get_primary_key(table)}
for table in self.target.get_table_names()}
[docs]
def assign_variable_mapping(self, var_mapping : VariableMapping) -> None:
"""Adds a :class:`VariableMapping` object to the collection. If a mapping exists already for the target table
and variable, it will be overwritten
:param var_mapping: The variable mapping to add
"""
self.target.get_variable(var_mapping.target_table, var_mapping.target_variable)
if self.table_mappings[var_mapping.target_table].type is None:
raise AttributeError('You have to specify the table mapping for target table "'
+ var_mapping.target_table + '" before adding variable mappings')
if not self.variable_should_get_mapped(var_mapping.target_table, var_mapping.target_variable):
if var_mapping.target_variable == self.target.get_primary_key(var_mapping.target_table):
raise AttributeError('"' + var_mapping.target_variable + '" is the primary key of target table "'
+ var_mapping.target_table
+ '". Primary keys have no own variable mapping. Their mapping behaviour is '
'defined by the table mapping')
else:
raise AttributeError('"' + var_mapping.target_variable + '" is a foreign key of target table "'
+ var_mapping.target_table
+ '" which is used for inheritance of its table mapping. These foreign keys have '
'no own variable mapping. Their mapping behaviour is defined by the table '
'mapping')
singular_source_tables, aggregation_source_tables = self.get_source_tables_for_var_mappings(var_mapping.target_table)
self._check_var_mapping(var_mapping, singular_source_tables, aggregation_source_tables)
if var_mapping.target_table in self.variable_mappings:
self.variable_mappings[var_mapping.target_table][var_mapping.target_variable] = var_mapping
else:
self.variable_mappings[var_mapping.target_table] = {var_mapping.target_variable : var_mapping}
[docs]
def get_variable_mapping(self, table : str, variable : str) -> VariableMapping:
"""Retrieves the :class:`VariableMapping` for the given table and variable. Raises an exception if the table
or variable does not exist in the collection
:param table: The target table of the variable to map
:param variable: The name of the variable
:return: Returns the retrieved variable mapping
"""
if table not in self.variable_mappings:
raise AttributeError('Table "' + table + '" does not exist in data mapping')
if not self.variable_should_get_mapped(table, variable):
if variable == self.target.get_primary_key(table):
raise AttributeError('"' + variable + '" is the primary key of target table "' + table
+ '". Primary keys have no own variable mapping. Their mapping behaviour is defined by '
'the table mapping')
else:
raise AttributeError('"' + variable + '" is a foreign key of target table "' + table
+ '" which is used for inheritance of its table mapping. These foreign keys have '
'no own variable mapping. Their mapping behaviour is defined by the table '
'mapping')
if variable not in self.variable_mappings[table]:
raise AttributeError('Variable "' + variable + ' does not exist in data mapping for table "' + table + '"')
return self.variable_mappings[table][variable]
[docs]
def assign_table_mapping(self, table: str, table_mapping: TableMapping):
"""Assign the table mapping for ``table``. This overwrites any existing table mapping
:param table: The table the mapping gets assigned to
:param table_mapping: The table mapping that gets assigned
"""
self._check_table_mapping(table, table_mapping)
old_mapping = copy.deepcopy(self.table_mappings[table])
self.table_mappings[table] = table_mapping
# remove variable mapping of foreign key in inherited table, if it was defined before
if table_mapping.type == TableMappingType.Inherited:
foreign_key = self.target.get_primary_key(table)
if foreign_key in self.variable_mappings[table_mapping.to_inherit]:
del self.variable_mappings[table_mapping.to_inherit][foreign_key]
# add empty variable mapping for foreign key
elif old_mapping.type is not None and old_mapping.type == TableMappingType.Inherited:
foreign_key = self.target.get_primary_key(table)
foreign_table = old_mapping.to_inherit
self.variable_mappings[foreign_table][foreign_key] = VariableMapping(foreign_table, foreign_key, [])
[docs]
def get_table_mapping(self, table: str) -> TableMapping:
"""Returns the table mapping for ``table`` if it exists
:param table: The table to retrieve the mapping for
:return: Returns the retrieved mapping or raises an exception if it does not exist
"""
if table not in self.table_mappings:
raise AttributeError('Table "' + table + '" does not exist in data mapping')
return self.table_mappings[table]
[docs]
def foreign_key_is_for_inheritance(self, table: str, foreign_key: str) -> bool:
"""Checks if ``foreign_key`` is marked for inheritance, i.e its foreign table inherits the table mapping
from ``table``
:param table: The target table to check the foreign key for
:param foreign_key: The foreign key, an exception will be raised if this is not a foreign key of table ``table``
:return: Returns ``True`` if the foreign table of ``foreign_key`` is inheriting from ``table``
"""
if foreign_key not in self.target.get_foreign_keys(table):
raise AttributeError('"' + foreign_key + '" is not a foreign key of target table "' + table + '"')
foreign_table = self.target.get_foreign_keys(table)[foreign_key]
foreign_table_mapping = self.table_mappings[foreign_table]
return foreign_table_mapping.type == TableMappingType.Inherited and foreign_table_mapping.to_inherit == table
[docs]
def to_dict(self) -> Dict[str, Dict[str, Dict[str, Union[str, List[Dict[str, str]]]]]]:
"""Converts the object to a dictionary containing only strings
:return: Returns a dictionary containing all mappings
"""
result = {}
for table in self.target.get_table_names():
var_mappings = {}
for variable in self.target.get_variable_names(table):
if not self.variable_should_get_mapped(table, variable):
continue
var_mappings[variable] = self.variable_mappings[table][variable].to_dict()
result[table] = {'table_mapping' : self.table_mappings[table].to_dict(), 'variable_mappings' : var_mappings}
return result
[docs]
@staticmethod
def from_dict(input_dict : dict, source: MetaData, target: MetaData) -> 'DataMapping':
"""Reads :class:`~graphxplore.DataMapping.VariableMapping` and :class:`TableMapping` objects from a dictionary
and combines them with the specified source and target :class:`~graphxplore.MetaDataHandling.MetaData`
:param input_dict: The input dictionary
:param source: The metadata of the source dataset
:param target: The metadata of the target dataset
:return: Returns a dictionary containing all mappings
"""
var_mappings = {}
table_mappings = {}
for table, table_dict in input_dict.items():
if 'table_mapping' not in table_dict:
raise AttributeError('"table_mapping" entry with table mappings for table "' + table + '" missing')
table_mappings[table] = TableMapping.from_dict(table_dict['table_mapping'])
if 'variable_mappings' not in table_dict:
raise AttributeError('"variable_mappings" entry with variable mappings for table "' + table
+ '" missing')
var_mappings[table] = {variable : VariableMapping.from_dict(entry)
for variable, entry in table_dict['variable_mappings'].items()}
return DataMapping(source, target, table_mappings, var_mappings)
[docs]
def to_json(self, json_path : str, file_encoding : Optional[str] = None) -> None:
"""Stores all variable mappings in a JSON
:param json_path: Path to the JSON
:param file_encoding: file encoding that should be used for writing the JSON
"""
dir_path = os.path.dirname(os.path.realpath(json_path))
if not os.path.exists(dir_path) or not os.path.isdir(dir_path):
raise AttributeError('File path "' + json_path
+ '" is invalid, since the containing directory does not exist')
output = self.to_dict()
with open(json_path, "w", encoding=file_encoding) as f:
json.dump(output, f, indent=6, ensure_ascii=False)
[docs]
@staticmethod
def from_json(json_path : str, source: MetaData, target: MetaData,
file_encoding : Optional[str] = None) -> 'DataMapping':
"""Reads :class:`~graphxplore.DataMapping.VariableMapping` and :class:`TableMapping` objects from a JSON
and combines them with the specified source and target :class:`~graphxplore.MetaDataHandling.MetaData`
:param json_path: Path to the JSON
:param source: The metadata of the source dataset
:param target: The metadata of the target dataset
:param file_encoding: file encoding of the JSON
:param file_encoding: file encoding of the JSON
:return: Returns a dictionary with all mapping data
"""
if not os.path.isfile(json_path):
raise AttributeError('Path "' + json_path +'" is not a valid file path')
encoding = file_encoding if file_encoding is not None else BaseUtils.detect_file_encoding(json_path)
with open(json_path, encoding=encoding) as f:
data = json.load(f)
return DataMapping.from_dict(data, source, target)
[docs]
def variable_mapped(self, table: str, variable: str) -> bool:
"""Checks, if at least one :class:`MappingCase` is defined for the table and variable. Raises an exception, if
the variable and/or table is not present in the mapping
:param table: The table of the variable to check for
:param variable: The variable name to check for
:return: Returns ``True``, if the table and variable exist in the mapping and at least one :class:`MappingCase`
was defined
"""
var_mapping = self.get_variable_mapping(table, variable)
return len(var_mapping.cases) > 0
[docs]
def table_fully_mapped(self, table: str) -> bool:
"""Checks, if all variables of a table are mapped, meaning they have at least one :class:`MappingCase`
:param table: The table to check all variables for
:return: Returns ``True``, if all variables are mapped
"""
if table not in self.target.get_table_names():
raise AttributeError('Table "' + table + '" does not exist in target metadata')
for variable in self.target.get_variable_names(table):
if self.variable_should_get_mapped(table, variable) and not self.variable_mapped(table, variable):
return False
return True
[docs]
def complete(self) -> bool:
"""Checks if all variables of all tables are mapped, meaning they have at least one :class:`MappingCase`
:return: Returns ``True``, if all variables of all tables are mapped
"""
for table in self.target.get_table_names():
if not self.table_fully_mapped(table):
return False
return True
[docs]
def get_source_tables_for_var_mappings(self, target_table: str,
mapping_to_set : Optional[TableMapping] = None) -> Tuple[List[str], List[str]]:
"""Based on the table mapping of ``target_table``, find all source tables that can be used for variable
mappings. To cases are possible: Single value conditionals/conclusion (related source tables and foreign
tables, foreign tables of foreign tables, etc.), and source tables that can be used for aggregation (inverted
foreign tables of the related source tables, inverted foreign tables of inverted foreign tables, etc.)
:param target_table: The target table for which available source tables should be retrieved
:param mapping_to_set: If the table mapping of ``target_table`` is not yet set, you can specify the future
mapping here. If this parameter is None, the assigned table mapping will be used. Defaults to None
:return: Returns two lists of source tables, one for single value and one for aggregation conditionals/conclusions
"""
if target_table not in self.table_mappings:
raise AttributeError('Target table "' + target_table + '" does not exist in data mapping')
if mapping_to_set is None:
table_mapping = self.table_mappings[target_table]
else:
table_mapping = mapping_to_set
if table_mapping.type is None:
raise AttributeError('Table mapping not yet assigned for target table "' + target_table + '"')
elif table_mapping.type == TableMappingType.Inherited:
return self.get_source_tables_for_var_mappings(table_mapping.to_inherit)
elif table_mapping.type == TableMappingType.OneToOne:
source_table = table_mapping.source_tables[0]
upward = set(self.source_lattice.get_relatives(source_table))
downward = set(self.source_lattice.get_relatives(source_table, False))
upward_ordered = [table for table in self.source.get_table_names() if table in upward]
downward_ordered = [table for table in self.source.get_table_names() if table in downward]
return [source_table] + upward_ordered, downward_ordered
elif table_mapping.type in [TableMappingType.Merge, TableMappingType.Concatenate]:
upward = set()
downward = set()
for source_table in table_mapping.source_tables:
upward.update(self.source_lattice.get_relatives(source_table))
downward.update(self.source_lattice.get_relatives(source_table, False))
upward_ordered = [table for table in self.source.get_table_names() if
table in upward and table not in table_mapping.source_tables]
downward_ordered = [table for table in self.source.get_table_names() if
table in downward and table not in table_mapping.source_tables]
return table_mapping.source_tables + upward_ordered, downward_ordered
else:
raise NotImplementedError('Table mapping type not implemented')
def _check_table_mapping(self, target_table, table_mapping: TableMapping):
"""Checks the relationship of the given ``target_table`` to the source dataset. It can have a one-to-one
relation to a single source table, or a one-to-many relation to multiple source tables by merging or
concatenating their data. Alternatively the type of relation can be inherited from another source table
which must be an ancestor of ``target_table``.
:param target_table: The name of the target table
:param table_mapping: The table mapping to check
:return: Returns a dictionary with the type of mapping and the relevant source tables
"""
if table_mapping.type == TableMappingType.OneToOne:
if len(table_mapping.source_tables) != 1:
raise AttributeError('One-to-one table mapping need a single source table specified')
elif table_mapping.type == TableMappingType.Inherited:
if table_mapping.to_inherit is None:
raise AttributeError('For inheriting table mapping you need to specify a target table to inherit from')
if not isinstance(table_mapping.condition, AlwaysTrueOperator):
raise AttributeError('When inheriting table relation, no condition other than the tautology should be '
'specified')
if table_mapping.to_inherit not in self.target_lattice.parents[target_table]:
raise AttributeError('Target table "' + target_table + '" was marked for inheriting from table "'
+ table_mapping.to_inherit + '", but "' + target_table
+ '" is not a foreign table of "' + table_mapping.to_inherit + '"')
if self.table_mappings[table_mapping.to_inherit].type is None:
raise AttributeError('Target table "' + target_table + '" was marked for inheriting from table "'
+ table_mapping.to_inherit + '", but this table has no assigned table mapping yet')
elif table_mapping.type in [TableMappingType.Merge, TableMappingType.Concatenate]:
if len(table_mapping.source_tables) < 2:
raise AttributeError('For one-to-many table mappings at least two source tables must be specified')
for source_table in table_mapping.source_tables:
if source_table not in self.source.get_table_names():
raise AttributeError('Source table "' + source_table
+ '" was specified in table mapping of target table "' + target_table
+ '", but does not exist in source metadata')
if not isinstance(table_mapping.condition, AlwaysTrueOperator):
singular_source_tables, aggregation_source_tables = self.get_source_tables_for_var_mappings(
target_table, table_mapping)
self._check_operator_sources(
target_table, None, table_mapping.condition, singular_source_tables, aggregation_source_tables)
[docs]
def variable_should_get_mapped(self, table: str, variable: str) -> bool:
"""Checks if a variable mapping should be defined for the variable. All variables should be mapped except
primary keys and foreign keys of foreign tables which inherit the table mapping of ``table``
:param table: The table of the variable to check
:param variable: The name of the variable to check
:return: Returns ``True`` if the variable should have a variable mapping
"""
if variable == self.target.get_primary_key(table):
return False
if variable in self.target.get_foreign_keys(table):
foreign_table = self.target.get_foreign_keys(table)[variable]
foreign_table_mapping = self.table_mappings[foreign_table]
if foreign_table_mapping.type == TableMappingType.Inherited and foreign_table_mapping.to_inherit == table:
return False
return True
@staticmethod
def _check_operator_sources(table: str, variable: Optional[str], operator: Union[LogicOperator, Conclusion],
singular_source_tables: List[str], aggregation_source_tables: List[str]):
var_string = 'variable mapping of target variable "' + variable + '"' if variable is not None else 'table mapping'
for source_table, var_data in operator.get_required_data().items():
for source_var, agg_info in var_data:
if agg_info is not None:
if source_table not in aggregation_source_tables:
raise AttributeError('Source table "' + source_table
+ '" used for aggregation in ' + var_string + ' in target table "' + table
+ '", but cannot be aggregated with specified table mapping')
else:
if source_table not in singular_source_tables:
raise AttributeError('Source table "' + source_table
+ '" used for singular value comparison in ' + var_string
+ ' in target table "' + table
+ '", but source table data cannot be used for singular value comparison '
'with specified table mapping')
def _check_var_mapping(self, var_mapping: VariableMapping, singular_source_tables : List[str],
aggregation_source_tables : List[str]):
table = var_mapping.target_table
var = var_mapping.target_variable
var_info = self.target.get_variable(table, var)
if var_info.variable_type == VariableType.PrimaryKey:
raise AttributeError('Primary key "' + var + '" of target table "' + table
+ '" should not have a variable mapping. The mapping behaviour of primary keys is '
'handled with the table mapping')
elif var_info.variable_type == VariableType.ForeignKey:
foreign_table = self.target.get_foreign_keys(table)[var]
if self.foreign_key_is_for_inheritance(table, var) and len(var_mapping.cases) > 0:
raise AttributeError('Foreign table "' + foreign_table + '" inherits the table mapping of "'
+ table + '", but the foreign key "' + var + '" has mapping cases defined')
for case in var_mapping.cases:
for operator in [case.conditional, case.conclusion]:
self._check_operator_sources(table, var, operator, singular_source_tables, aggregation_source_tables)