Source code for graphxplore.MetaDataHandling.meta_data

import os
import re
import json
import copy
from typing import List, Iterable, Dict, Optional
from .variable_info import VariableInfo, VariableType, DataType
from graphxplore.Basis import BaseUtils

class MetaData:
    """This class is the core of all ETL processes in graphxplore. It stores the metadata of a relational dataset.
    It contains information about its CSV tables, variables, primary/foreign keys, and much more information on the
    variable-level. For more information check out :class:`VariableInfo`

    Internally, ``self.data`` maps each table name to a dictionary with the entries ``'label'`` (str),
    ``'primary_key'`` (str, empty if unassigned), ``'foreign_keys'`` (foreign key name -> foreign table name) and
    ``'variables'`` (variable name -> :class:`VariableInfo`).

    :param tables: The names of the CSV tables of the relational data set (without .csv)
    """

    def __init__(self, tables: Iterable[str]):
        """Constructor method
        """
        self.data = {table: self._empty_table_entry(table) for table in tables}

    @staticmethod
    def _empty_table_entry(table: str) -> dict:
        """Build the entry of a table without key or variable; its label defaults to the table name."""
        return {'label': table, 'primary_key': '', 'foreign_keys': {}, 'variables': {}}

    def _require_table(self, table: str, kind: str = 'Table') -> None:
        """Raise an ``AttributeError`` if ``table`` is unknown; ``kind`` is the prefix of the error message."""
        if table not in self.data:
            raise AttributeError(f'{kind} "{table}" not in meta data')

    @staticmethod
    def load_from_json(filepath: str, file_encoding: Optional[str] = None) -> 'MetaData':
        """Reads a :class:`MetaData` object from a JSON.

        :param filepath: Path to the JSON
        :param file_encoding: file encoding of the JSON, detected via ``BaseUtils.detect_file_encoding`` if omitted
        :raises AttributeError: if ``filepath`` is not an existing file or the content is invalid
        :return: Returns a MetaData object
        """
        if not os.path.isfile(filepath):
            raise AttributeError(f'Path "{filepath}" is not a valid file path')
        encoding = file_encoding if file_encoding is not None else BaseUtils.detect_file_encoding(filepath)
        with open(filepath, encoding=encoding) as f:
            data = json.load(f)
        return MetaData.from_dict(data)

    @staticmethod
    def from_dict(data: dict) -> 'MetaData':
        """Parses a :class:`MetaData` object from a dictionary.

        :param data: The input dictionary, structured like the output of :meth:`to_dict`
        :raises AttributeError: if a table entry is missing, has the wrong type, or keys are inconsistent
        :return: Returns the parsed object
        """
        tables = data.keys()
        meta_data = MetaData(tables)
        for table in tables:
            table_data = data[table]
            if 'label' not in table_data or not isinstance(table_data['label'], str):
                raise AttributeError(f'Data for table "{table}" does not contain a string entry "label"')
            label = table_data['label']

            if 'variables' not in table_data or not isinstance(table_data['variables'], dict) \
                    or len(table_data['variables']) == 0:
                raise AttributeError(f'Data for table "{table}" does not contain a dictionary entry "variables" '
                                     'or any variable data')
            variables = {var_name: VariableInfo.from_dict(var_name, table, var_dict)
                         for var_name, var_dict in table_data['variables'].items()}

            if 'primary_key' not in table_data or not isinstance(table_data['primary_key'], str):
                raise AttributeError(f'Data for table "{table}" does not contain a string entry "primary_key"')
            primary_key = table_data['primary_key']
            if primary_key == '':
                # a missing primary key is allowed, the user is only notified
                print('Table "' + table + '" has no primary key assigned')
            elif primary_key not in variables \
                    or variables[primary_key].variable_type != VariableType.PrimaryKey:
                raise AttributeError(f'Primary key "{primary_key}" for table "{table}" is not specified as '
                                     'variable of type "PrimaryKey" in table data')

            if 'foreign_keys' not in table_data or not isinstance(table_data['foreign_keys'], dict):
                raise AttributeError(f'Data for table "{table}" does not contain a dictionary entry "foreign_keys"')
            for foreign_key, foreign_table in table_data['foreign_keys'].items():
                if not isinstance(foreign_key, str) or not isinstance(foreign_table, str):
                    raise AttributeError('Foreign keys and foreign tables must be string entries')
                if foreign_table not in tables:
                    raise AttributeError(f'Foreign table "{foreign_table}" which was declared in data for table "'
                                         f'{table}", does not exist')
                if foreign_key != data[foreign_table]['primary_key']:
                    raise AttributeError(f'Foreign key "{foreign_key}" is not the primary key in foreign table "'
                                         f'{foreign_table}"')
                # check the parsed variable (as done for the primary key) instead of the raw dictionary entry
                if foreign_key not in variables \
                        or variables[foreign_key].variable_type != VariableType.ForeignKey:
                    raise AttributeError(f'Foreign key "{foreign_key}" for table "{table}" is not specified as '
                                         'variable of type "ForeignKey" in table data')

            meta_data.data[table] = {
                'label': label,
                'primary_key': primary_key,
                # copied, so later edits of the metadata do not write into the caller's input dictionary
                'foreign_keys': dict(table_data['foreign_keys']),
                'variables': variables,
            }
        return meta_data

    def to_dict(self) -> dict:
        """Converts the object to a dictionary.

        :return: Returns the generated dictionary
        """
        return {
            table: {
                'label': table_data['label'],
                'primary_key': table_data['primary_key'],
                'foreign_keys': copy.deepcopy(table_data['foreign_keys']),
                'variables': {var_name: var_info.to_dict()
                              for var_name, var_info in table_data['variables'].items()},
            }
            for table, table_data in self.data.items()
        }

    def store_in_json(self, file_path: str, file_encoding: Optional[str] = None) -> None:
        """Stores the object as a JSON file.

        :param file_path: Path to the JSON
        :param file_encoding: file encoding that should be used for writing the JSON
        :raises AttributeError: if the directory that should contain the file does not exist
        """
        dir_path = os.path.dirname(os.path.realpath(file_path))
        if not os.path.isdir(dir_path):
            raise AttributeError(f'File path "{file_path}" is invalid, since the containing directory does not exist')
        output_dict = self.to_dict()
        with open(file_path, "w", encoding=file_encoding) as f:
            json.dump(output_dict, f, indent=6, ensure_ascii=False)

    def __deepcopy__(self, memo: Optional[Dict] = None) -> 'MetaData':
        """Creates an independent copy; variables are cloned via their dictionary representation.

        :param memo: memo dictionary of the ``copy`` protocol, not used by this implementation
        :return: Returns the copied object
        """
        result = MetaData(self.get_table_names())
        for table, table_data in self.data.items():
            target = result.data[table]
            target['label'] = table_data['label']
            target['primary_key'] = table_data['primary_key']
            target['foreign_keys'] = copy.deepcopy(table_data['foreign_keys'])
            target['variables'] = {var_name: VariableInfo.from_dict(var_name, table, var_info.to_dict())
                                   for var_name, var_info in table_data['variables'].items()}
        return result

    def add_table(self, table: str) -> None:
        """Add a table to the metadata

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table already exists
        """
        if table in self.data:
            raise AttributeError(f'Table "{table}" already exists in meta data')
        self.data[table] = self._empty_table_entry(table)

    def remove_table(self, table: str) -> None:
        """Remove a table from the metadata. All foreign keys pointing to this table are changed to categorical
        variables

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table does not exist
        """
        self._require_table(table)
        del self.data[table]
        for other in self.get_table_names():
            # collect first, since the foreign key dictionary is altered below
            dangling = [fk for fk, ft in self.get_foreign_keys(other).items() if ft == table]
            for fk in dangling:
                del self.data[other]['foreign_keys'][fk]
                self.data[other]['variables'][fk].variable_type = VariableType.Categorical

    def assign_label(self, table: str, label: str) -> None:
        """Assigns a label to a table, e.g. describing the contained data. Existing labels will be overwritten

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param label: The label that should be assigned, should not contain whitespace or line breaks
        :raises AttributeError: if the table does not exist or the label contains invalid characters
        """
        self._require_table(table)
        # fullmatch instead of "^...$": "$" would also accept a label that ends with a line break
        if not re.fullmatch(r"[A-Za-z0-9_-]+", label):
            raise AttributeError(f'Label "{label}" should only contain letters, numbers, hyphens and underscores')
        self.data[table]['label'] = label

    def add_variable(self, table: str, variable: str) -> 'VariableInfo':
        """Adds a variable for a specified table to the metadata. It starts as categorical string variable
        without labels.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param variable: The name of the variable, i.e. the column name
        :raises AttributeError: if the table does not exist or already contains the variable
        :return: Returns the generated variable info that can be filled
        """
        self._require_table(table)
        variables = self.data[table]['variables']
        if variable in variables:
            raise AttributeError(f'Variable "{variable}" already exists in table "{table}"')
        variables[variable] = VariableInfo(name=variable, table=table, labels=[],
                                           variable_type=VariableType.Categorical, data_type=DataType.String,
                                           data_type_distribution=None)
        return variables[variable]

    def assign_primary_key(self, table: str, primary_key: str) -> None:
        """Assigns a primary key for the specified table. Raises an exception if ``table`` already has a primary
        key, or ``primary_key`` is not a variable of ``table``

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param primary_key: The name of the primary key, i.e. the column name
        :raises AttributeError: if the table or variable does not exist, or a primary key is already set
        """
        self._require_table(table)
        table_data = self.data[table]
        if primary_key not in table_data['variables']:
            raise AttributeError(f'Primary key "{primary_key}" is not a variable of table "{table}" in meta data')
        if table_data['primary_key'] != '':
            raise AttributeError(f'Primary key already set for table "{table}"')
        table_data['variables'][primary_key].variable_type = VariableType.PrimaryKey
        table_data['primary_key'] = primary_key

    def change_primary_key(self, table: str, primary_key: str) -> None:
        """Changes the primary key for the specified table. Raises an exception if ``primary_key`` is not a
        variable of ``table``. The previous primary key becomes a categorical variable and foreign keys of other
        tables referencing it are removed.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param primary_key: The name of the primary key, i.e. the column name
        :raises AttributeError: if the table or variable does not exist
        """
        self._require_table(table)
        table_data = self.data[table]
        if primary_key not in table_data['variables']:
            raise AttributeError(f'Primary key "{primary_key}" is not a variable of table "{table}" in meta data')
        old_key = table_data['primary_key']
        if primary_key == old_key:
            return
        if old_key != '':
            for other in self.get_table_names():
                if other == table:
                    continue
                # only drop references to this table, a same-named key may point to a different table
                if self.get_foreign_keys(other).get(old_key) == table:
                    self.remove_foreign_key(other, old_key)
            table_data['variables'][old_key].variable_type = VariableType.Categorical
        table_data['variables'][primary_key].variable_type = VariableType.PrimaryKey
        table_data['primary_key'] = primary_key

    def add_foreign_key(self, table: str, foreign_table: str, foreign_key: str) -> None:
        """Adds a foreign key and its foreign origin table to a specified table. ``foreign_key`` must be a
        variable of ``table`` and a primary key of ``foreign_table``.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param foreign_table: The name of the foreign table, i.e. its file name with '.csv' omitted
        :param foreign_key: The name of the foreign key, i.e. the column name
        :raises AttributeError: if a table does not exist, the key is already assigned, is not the primary key of
            ``foreign_table`` or is not a variable of ``table``
        """
        self._require_table(table)
        self._require_table(foreign_table, kind='Foreign table')
        table_data = self.data[table]
        if foreign_key in table_data['foreign_keys']:
            raise AttributeError(f'Multiple assignment of foreign key "{foreign_key}" for table "{table}"')
        if self.data[foreign_table]['primary_key'] != foreign_key:
            raise AttributeError(f'Foreign key "{foreign_key}" is not primary key in table "{foreign_table}"')
        if foreign_key not in table_data['variables']:
            raise AttributeError(f'Foreign key "{foreign_key}" is not a variable of table "{table}"')

        var_info = table_data['variables'][foreign_key]
        var_info.variable_type = VariableType.ForeignKey
        if var_info.binning is not None:
            var_info.binning.should_bin = False
            var_info.binning.exclude_from_binning = None
        var_info.value_distribution = None
        if var_info.artifacts is not None:
            # keep only the artifacts that cannot be cast to the data type of the variable
            var_info.artifacts = [artifact for artifact in var_info.artifacts
                                  if var_info.cast_value_to_data_type(artifact) is None]
        table_data['foreign_keys'][foreign_key] = foreign_table

    def remove_foreign_key(self, table: str, foreign_key: str) -> None:
        """Removes a foreign key for a specified table and turns it into a categorical variable.
        ``foreign_key`` must be a variable of ``table``.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param foreign_key: The name of the foreign key, i.e. the column name
        :raises AttributeError: if the table does not exist, or the key is no variable or no foreign key of it
        """
        self._require_table(table)
        table_data = self.data[table]
        if foreign_key not in table_data['variables']:
            raise AttributeError(f'Foreign key "{foreign_key}" is not a variable of table "{table}"')
        if foreign_key not in table_data['foreign_keys']:
            raise AttributeError(f'Foreign key "{foreign_key}" was not assigned for table "{table}"')
        var_info = table_data['variables'][foreign_key]
        var_info.variable_type = VariableType.Categorical
        if var_info.binning is not None:
            var_info.binning.should_bin = False
            var_info.binning.exclude_from_binning = None
        del table_data['foreign_keys'][foreign_key]

    def get_table_names(self) -> List[str]:
        """Retrieve all table names (file names with '.csv' omitted) of the metadata.

        :return: Returns the list of table names
        """
        return list(self.data)

    def get_primary_key(self, table: str) -> str:
        """Retrieve the primary key of the table. Returns the empty string if not yet assigned.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table does not exist
        :return: Returns the name of the primary key
        """
        self._require_table(table)
        return self.data[table]['primary_key']

    def has_primary_key(self, table: str) -> bool:
        """Checks if the table has a primary key assigned.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table does not exist
        :return: Returns ``True`` if a primary key was assigned
        """
        return self.get_primary_key(table) != ''

    def get_foreign_keys(self, table: str) -> Dict[str, str]:
        """Retrieve all foreign keys of a table as a dictionary with the keys being the foreign keys and the
        values the foreign tables. The internally stored dictionary is returned, not a copy.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table does not exist
        :return: Returns the foreign key/table dictionary
        """
        self._require_table(table)
        return self.data[table]['foreign_keys']

    def get_label(self, table: str) -> str:
        """Returns the label of the table.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table does not exist
        :return: Returns the table label as string
        """
        self._require_table(table)
        return self.data[table]['label']

    def get_variable_names(self, table: str) -> List[str]:
        """Retrieves all variable names for a given table.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :raises AttributeError: if the table does not exist
        :return: Returns the list of retrieved variable names
        """
        self._require_table(table)
        return list(self.data[table]['variables'])

    def get_variable(self, table: str, variable: str) -> 'VariableInfo':
        """Retrieves the information about a given variable for inspection or altering.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param variable: The name of the variable, i.e. the column name
        :raises AttributeError: if the table or the variable does not exist
        :return: Returns the variable information object
        """
        self._require_table(table)
        variables = self.data[table]['variables']
        if variable not in variables:
            raise AttributeError(f'Variable "{variable}" is not a variable of table "{table}" in meta data')
        return variables[variable]

    def remove_variable(self, table: str, variable: str) -> None:
        """Delete the variable for the specified table from the metadata. If it is a primary key, foreign key
        references from other tables are deleted as well and the referencing variables become categorical. If it
        is a foreign key of ``table``, that foreign key assignment is deleted, too.

        :param table: The name of the table, i.e. its file name with '.csv' omitted
        :param variable: The name of the variable, i.e. the column name
        :raises AttributeError: if the table or the variable does not exist
        """
        self._require_table(table)
        table_data = self.data[table]
        if variable not in table_data['variables']:
            raise AttributeError(f'Variable "{variable}" is not a variable of table "{table}" in meta data')
        if self.get_primary_key(table) == variable:
            for other in self.get_table_names():
                if other == table:
                    continue
                other_data = self.data[other]
                if other_data['foreign_keys'].get(variable) == table:
                    del other_data['foreign_keys'][variable]
                    # same treatment as in remove_table: the column stays, but is no longer a foreign key
                    referencing = other_data['variables'].get(variable)
                    if referencing is not None:
                        referencing.variable_type = VariableType.Categorical
            table_data['primary_key'] = ''
        # a deleted variable must not remain listed as foreign key of its own table
        table_data['foreign_keys'].pop(variable, None)
        del table_data['variables'][variable]

    def has_artifacts(self) -> bool:
        """Check, if at least one variable has annotated artifacts

        :return: Returns ``True``, if at least one annotated artifact was found
        """
        return any(var_info.artifacts
                   for table_data in self.data.values()
                   for var_info in table_data['variables'].values())

    def get_total_nof_variables(self) -> int:
        """Counts all variables in the metadata across all tables

        :return: Returns the count as an integer
        """
        return sum(len(table_data['variables']) for table_data in self.data.values())