Source code for y0.struct

# -*- coding: utf-8 -*-

"""Data structures."""

from __future__ import annotations

from dataclasses import dataclass
from functools import lru_cache
from typing import Callable, Iterable, Literal, NamedTuple, Optional, Tuple, Union, cast

import pandas as pd

from .dsl import Expression, Variable

__all__ = [
    "VermaConstraint",
    "DSeparationJudgement",
]

DEFAULT_SIGNIFICANCE = 0.01


class VermaConstraint(NamedTuple):
    """A Verma constraint, stored as a five-element named tuple.

    The fields are the left- and right-hand side c-factors and expressions,
    followed by the tuple of variables the constraint refers to.
    """

    lhs_cfactor: Expression
    lhs_expr: Expression
    rhs_cfactor: Expression
    rhs_expr: Expression
    variables: Tuple[Variable, ...]

    @classmethod
    def from_element(cls, element) -> VermaConstraint:
        """Build a constraint from one element of the vector returned by `verma.constraint`.

        :param element: An element in the vector returned by `verma.constraint`
        :returns: A Verma constraint tuple for the given element

        .. seealso:: Extracting from R objects https://rpy2.github.io/doc/v3.4.x/html/vector.html#extracting-items
        """
        # Imported lazily, inside the method, exactly as the original does.
        from .parser import parse_causaleffect
        from .r_utils import _extract, _parse_vars

        def _parse_item(key):
            # Pull the named item out of the R element and parse it.
            return parse_causaleffect(_extract(element, key))

        # The right-hand side is read first, then the left-hand side,
        # and the variables last.
        right_cfactor = _parse_item("rhs.cfactor")
        right_expr = _parse_item("rhs.expr")
        left_cfactor = _parse_item("lhs.cfactor")
        left_expr = _parse_item("lhs.expr")
        parsed_variables = _parse_vars(element)

        return cls(
            lhs_cfactor=left_cfactor,
            lhs_expr=left_expr,
            rhs_cfactor=right_cfactor,
            rhs_expr=right_expr,
            variables=parsed_variables,
        )
# Names of the conditional independence tests that can be requested.
CITest = Literal[
    "pearson",
    "chi-square",
    "cressie_read",
    "freeman_tuckey",
    "g_sq",
    "log_likelihood",
    "modified_log_likelihood",
    "power_divergence",
    "neyman",
]

DEFAULT_CONTINUOUS_CI_TEST: CITest = "pearson"
DEFAULT_DISCRETE_CI_TEST: CITest = "cressie_read"

CITestFunc = Callable


@lru_cache
def get_conditional_independence_tests() -> dict[CITest, CITestFunc]:
    """Map each test name to its function in :mod:`pgmpy.estimators.CITests`.

    :returns: A dictionary from test name to the corresponding callable
    :raises ImportError: if :mod:`pgmpy` is not installed
    """
    try:
        from pgmpy.estimators import CITests
    except ImportError as e:
        raise ImportError("Calculating falsifications requires `pip install pgmpy`.") from e

    # (test name, attribute on CITests); only "pearson" differs from its attribute.
    name_to_attribute = (
        ("pearson", "pearsonr"),
        ("chi-square", "chi_square"),
        ("cressie_read", "cressie_read"),
        ("freeman_tuckey", "freeman_tuckey"),
        ("g_sq", "g_sq"),
        ("log_likelihood", "log_likelihood"),
        ("modified_log_likelihood", "modified_log_likelihood"),
        ("power_divergence", "power_divergence"),
        ("neyman", "neyman"),
    )
    return {name: getattr(CITests, attribute) for name, attribute in name_to_attribute}


class CITestTuple(NamedTuple):
    """The outcome of a PGMPy conditional independency test.

    Continuous tests such as :func:`pgmpy.estimators.CITests.pearsonr` have no
    associated *degrees of freedom* (dof), so that field is left as none for them.
    """

    statistic: float
    p_value: float
    dof: Optional[float] = None


CITestResult = Union[CITestTuple, bool]
@dataclass(frozen=True)
class DSeparationJudgement:
    """Record if a left/right pair are d-separated given the conditions.

    By default, acts like a boolean (its truth value is ``separated``), but
    also carries the pair of variables and the conditions it refers to.
    """

    separated: bool
    left: Variable
    right: Variable
    conditions: Tuple[Variable, ...]

    @classmethod
    def create(
        cls,
        left: Variable,
        right: Variable,
        conditions: Optional[Iterable[Variable]] = None,
        *,
        separated: bool = True,
    ) -> DSeparationJudgement:
        """Create a d-separation judgement in canonical form.

        Canonical form means that ``left`` and ``right`` are ordered by their
        string representation and that ``conditions`` is a de-duplicated tuple
        sorted by string representation.

        :param left: One variable of the pair
        :param right: The other variable of the pair
        :param conditions: The variables conditioned on. If none, uses an empty tuple.
        :param separated: Whether the pair is judged to be d-separated
        :returns: A judgement in canonical form
        """
        left, right = sorted([left, right], key=str)
        if conditions is None:
            conditions = tuple()
        conditions = tuple(sorted(set(conditions), key=str))
        return cls(separated, left, right, conditions)

    def __bool__(self) -> bool:
        """Return the value of ``separated``."""
        return self.separated

    @property
    def is_canonical(self) -> bool:
        """Return if the conditional independency is in canonical form."""
        # Compare by string representation, which is the ordering that
        # :meth:`create` uses to define the canonical form. Comparing the
        # variables directly could disagree with it (or fail for objects
        # that do not define an ordering).
        return (
            str(self.left) < str(self.right)
            and isinstance(self.conditions, tuple)
            and tuple(sorted(self.conditions, key=str)) == self.conditions
        )

    def test(
        self,
        df: pd.DataFrame,
        *,
        boolean: bool = False,
        method: Optional[CITest] = None,
        significance_level: Optional[float] = None,
        _method_checked: bool = False,
    ) -> Union[bool, CITestTuple]:
        """Test for conditional independence, given some data.

        :param df: A dataframe whose columns include the names of the left,
            right, and all conditional variables.
        :param boolean: Should results be returned as a pre-cutoff boolean?
        :param method: Conditional independence test from :mod:`pgmpy` to use.
            If none, ``_ensure_method`` picks one from the data: the default
            discrete test when every relevant column is binary, otherwise the
            default continuous test.
        :param significance_level: The statistical tests employ this value for
            comparison with the p-value of the test to determine the independence
            of the tested variables. If none, defaults to ``DEFAULT_SIGNIFICANCE``.
            Only applied if ``boolean=True``.
        :param _method_checked: If true, the compatibility check between ``method``
            and the data is skipped; ``method`` must then be given explicitly.
        :returns:
            Tests the null hypothesis that X is independent of Y given Zs.

            If ``boolean=False``, returns a :class:`CITestTuple` of statistic,
            p-value, and degrees of freedom (the latter is none for the
            ``"pearson"`` method).

            If ``boolean=True``, returns the boolean produced by the underlying
            test function for the given ``significance_level``.
            NOTE(review): per pgmpy's documentation this is true when independence
            is not rejected - confirm against the installed version.
        :raises ValueError: if any parts of the judgement aren't in the dataframe's
            columns, if a conditional repeats the left or right variable, or if
            ``_ensure_method`` rejects the method for the given data
        """
        if self.left.name not in df.columns:
            raise ValueError(
                f"left variable {self.left.name} ({type(self.left.name)}) not in columns {list(df.columns)}"
            )
        if self.right.name not in df.columns:
            raise ValueError(
                f"right variable {self.right.name} ({type(self.right.name)}) not in columns {df.columns}"
            )
        for c in self.conditions:
            if c.name in {self.left.name, self.right.name}:
                raise ValueError(f"conditional {c.name} repeats one of the primary arguments")
            if c.name not in df.columns:
                raise ValueError(
                    f"conditional {c.name} ({type(c.name)}) not in columns {df.columns}"
                )

        if significance_level is None:
            significance_level = DEFAULT_SIGNIFICANCE

        # Only the columns involved in this judgement are used to decide
        # whether the data should be treated as binary or continuous.
        method = _ensure_method(
            method,
            df[[self.left.name, self.right.name, *(c.name for c in self.conditions)]],
            skip=_method_checked,
        )

        tests: dict[CITest, CITestFunc] = get_conditional_independence_tests()
        func: CITestFunc = tests[method]
        result = func(
            X=self.left.name,
            Y=self.right.name,
            Z={condition.name for condition in self.conditions},
            data=df,
            boolean=boolean,
            significance_level=significance_level,
        )
        if boolean:
            return cast(bool, result)

        # Pearson's correlation returns a pair with the first element being the Pearson's correlation
        # and the second being the p-value. The other methods return a triple with the first element
        # being the Chi^2 statistic, the second being the p-value, and the third being the degrees of
        # freedom.
        if method == "pearson":
            statistic, p_value = result
            dof = None
        else:
            statistic, p_value, dof = result
        return CITestTuple(statistic=statistic, p_value=p_value, dof=dof)
def _ensure_method(method: Optional[CITest], df: pd.DataFrame, skip: bool = False) -> CITest:
    """Pick or validate the conditional independence test for the given data.

    :param method: The requested test, or none to pick a default from the data
    :param df: The data the test will be applied to
    :param skip: If true, return ``method`` without looking at the data
    :returns: The name of the test to use
    :raises RuntimeError: if ``skip`` is true but no method was given
    :raises ValueError: if ``"pearson"`` is requested for all-binary data, or
        any other test is requested for data that is not all binary
    """
    if skip:
        if method is not None:
            return method
        raise RuntimeError

    # TODO extend to discrete but more than 2.
    #  see https://stats.stackexchange.com/questions/12273/how-to-test-if-my-data-is-discrete-or-continuous
    # TODO what happens when some variables are binary but others are continous?
    all_binary = _is_binary(df)

    if method is None:
        return DEFAULT_DISCRETE_CI_TEST if all_binary else DEFAULT_CONTINUOUS_CI_TEST

    requested_pearson = method == "pearson"
    if all_binary and requested_pearson:
        raise ValueError(
            f"using continuous data test ({method}) on binary data: {_summarize_df(df)}"
        )
    if not all_binary and not requested_pearson:
        raise ValueError(f"using binary data test ({method}) on continuous data")
    return method


def _summarize_df(df: pd.DataFrame):
    """Return a dictionary from each column to the set of its unique values."""
    summary = {}
    for name in df.columns:
        summary[name] = set(df[name].unique())
    return summary


def _is_binary(df: pd.DataFrame) -> bool:
    """Return if every column of the dataframe holds exactly two binary values."""
    # Every column is evaluated before the results are combined.
    per_column = [_is_two_values(df[name]) for name in df.columns]
    return all(per_column)


def _is_two_values(series):
    """Return if the unique values are exactly true/false, 1/0, or 1/-1."""
    observed = set(series.unique())
    accepted = ({True, False}, {1, 0}, {1, -1})
    return any(observed == candidate for candidate in accepted)