CalistaTable

class calista.table.CalistaTable(engine: LazyEngine)

Bases: object

analyze(rule_name: str, rule: Condition) → Metrics

Compute Metrics based on a condition.

Args:

rule_name (str): The name of the rule.
rule (Condition): The Condition to evaluate.

Returns:

Metrics: The metrics resulting from the analysis.

Raises:

Any exceptions raised by the analyze_rules method.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your rule
>>> my_rule = func.is_not_null(col_name="PLANETE")
>>>
>>> # Generate and print your metrics
>>> metrics = calista_table.analyze(rule_name="PLANETE is not null", rule=my_rule)
>>> print(metrics)

>>> rule_name : PLANETE is not null
>>> total_row_count : 4
>>> valid_row_count : 3
>>> valid_row_count_pct : 75.0
>>> timestamp : 2024-01-01 00:00:00.000000

analyze_rules(rules: Dict[str, Condition]) → List[Metrics]

Compute List[Metrics] based on rules.

Args: rules (dict[RuleName, Condition]): The names of the rules and the conditions to execute.
Returns: List[Metrics]: The metrics resulting from the analysis.
Raises: Any exceptions raised by the engine’s execute_conditions method.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your rules
>>> my_rule = func.is_not_null(col_name="PLANETE")
>>> my_rule_2 = func.is_alphabetic(col_name="PLANETE") & func.length_lt(col_name="PLANETE", length=20)
>>>
>>> # Generate and print your metrics
>>> metrics = calista_table.analyze_rules({"PLANETE is not null": my_rule,
>>>                                        "PLANETE is alphabetic and length < 20": my_rule_2})
>>> for metric in metrics:
>>>     print(metric)
>>>     print("-----------------")

>>> rule_name : PLANETE is not null
>>> total_row_count : 4
>>> valid_row_count : 3
>>> valid_row_count_pct : 75.0
>>> timestamp : 2024-01-01 00:00:00.000000
>>> -----------------
>>> rule_name : PLANETE is alphabetic and length < 20
>>> total_row_count : 4
>>> valid_row_count : 3
>>> valid_row_count_pct : 75.0
>>> timestamp : 2024-01-01 00:00:00.000000
>>> -----------------

apply_rule(rule: Condition, rule_name: str | None = None) → DataFrameType

Returns the dataset with a new column of booleans for the given rule.

Args:

rule (Condition): The Condition to execute.
rule_name (str | None): Name of the rule (Default: None)

Returns:

DataFrameType: The dataset with the new column resulting from the analysis.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your rule
>>> my_rule = func.is_not_null(col_name="PLANETE")
>>>
>>> # Generate and print the resulting dataframe
>>> df_result = calista_table.apply_rule(rule_name="PLANETE is not null", rule=my_rule)
>>> print(df_result)

>>>    PLANETE  PLANETE is not null
>>> 0     mars                 True
>>> 1     None                False
>>> 2  jupiter                 True
>>> 3    terre                 True

apply_rules(rules: Dict[str, Condition]) → DataFrameType

Returns the dataset with a new column of booleans for each of the given rules.

Args: rules (Dict[RuleName, Condition]): The names of the rules and the conditions to execute.
Returns: DataFrameType: The dataset with new columns resulting from the analysis.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your rules
>>> my_rule = func.is_not_null(col_name="PLANETE")
>>> my_rule_2 = func.is_alphabetic(col_name="PLANETE") & func.length_lt(col_name="PLANETE", length=20)
>>>
>>> # Generate and print the resulting dataframe
>>> df_result = calista_table.apply_rules({"PLANETE is not null": my_rule,
>>>                                        "PLANETE is alphabetic and length < 20": my_rule_2})
>>> print(df_result)

>>>        PLANETE  PLANETE is not null  PLANETE is alphabetic and length < 20
>>>     0     mars                 True                                   True
>>>     1     None                False                                  False
>>>     2  jupiter                 True                                   True
>>>     3    terre                 True                                   True

filter(condition: Condition) → CalistaTable

Filters rows using the given condition.

filter() is an alias for where().

Args: condition (Condition): The condition used to filter the rows.
Returns: CalistaTable: Filtered CalistaTable.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your filter and your rule
>>> my_filter = func.is_not_null(col_name="PLANETE")
>>> my_rule = func.is_alphabetic(col_name="PLANETE")
>>>
>>> # Generate and print your metrics
>>> metrics = calista_table.filter(my_filter).analyze(rule_name="PLANETE is alphabetic on non null values", rule=my_rule)
>>> print(metrics)

>>> rule_name : PLANETE is alphabetic on non null values
>>> total_row_count : 3
>>> valid_row_count : 3
>>> valid_row_count_pct : 100.0
>>> timestamp : 2024-01-01 00:00:00.000000

get_invalid_rows(rule: Condition) → DataFrameType

Returns the dataset filtered to the rows that do not satisfy the rule.

Args: rule (Condition): The Condition to evaluate.
Returns: DataFrameType: The dataset filtered with the rows where the rule is not satisfied.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your rule
>>> my_rule = func.is_not_null(col_name="PLANETE")
>>>
>>> # Generate and print the resulting dataframe
>>> df_result = calista_table.get_invalid_rows(my_rule)
>>> print(df_result)

>>>   PLANETE
>>> 1    None

get_valid_rows(rule: Condition) → DataFrameType

Returns the dataset filtered to the rows that satisfy the rule.

Args: rule (Condition): The Condition to evaluate.
Returns: DataFrameType: The dataset filtered with the rows where the rule is satisfied.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your rule
>>> my_rule = func.is_not_null(col_name="PLANETE")
>>>
>>> # Generate and print the resulting dataframe
>>> df_result = calista_table.get_valid_rows(my_rule)
>>> print(df_result)

>>>    PLANETE
>>> 0     mars
>>> 2  jupiter
>>> 3    terre

group_by(*cols: str) → GroupedTable

Groups the CalistaTable using the specified columns, so we can execute aggregation conditions on them. See GroupedTable for all the available functions after calling group_by.

Args: cols (str): Columns to group by. Each element should be a column name (string).

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"TEAM": ["red", "red", "red", "blue", "blue", "blue"],
>>>                                                                "POINTS": [10, 20, 30, 40, 20, 10]})
>>>
>>> # Define your rule
>>> my_rule = func.sum_gt_value(col_name="POINTS", value=65)
>>>
>>> # Generate and print your metrics
>>> metrics = calista_table.group_by("TEAM").analyze(rule_name="Total points higher than 65", rule=my_rule)
>>> print(metrics)

>>> rule_name : Total points higher than 65
>>> total_row_count : 2
>>> valid_row_count : 1
>>> valid_row_count_pct : 50.0
>>> timestamp : 2024-01-01 00:00:00.000000

property schema: dict[str, str]

Returns the schema of the underlying dataset.

Returns: Dict[ColumnName, PythonType]: Dict representing the schema of the underlying dataset.

show(n: int = 10) → None

Prints the first n rows to the console.

Args: n (int, optional): Number of rows to show (Default: 10)

Example

>>> from calista import CalistaEngine
>>>
>>> # Create your CalistaTable and show it
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>> calista_table.show()

>>>    PLANETE
>>> 0     mars
>>> 1     None
>>> 2  jupiter
>>> 3    terre

where(condition: Condition) → CalistaTable

Filters rows using the given condition.

filter() is an alias for where().

Args: condition (Condition): The condition used to filter the rows.
Returns: CalistaTable: Filtered CalistaTable.

Example

>>> from calista import CalistaEngine
>>> from calista import functions as func
>>>
>>> # Create your CalistaTable
>>> calista_table = CalistaEngine(engine="pandas").load_from_dict({"PLANETE": ["mars", None, "jupiter", "terre"]})
>>>
>>> # Define your filter and your rule
>>> my_filter = func.is_not_null(col_name="PLANETE")
>>> my_rule = func.is_alphabetic(col_name="PLANETE")
>>>
>>> # Generate and print your metrics
>>> metrics = calista_table.where(my_filter).analyze(rule_name="PLANETE is alphabetic on non null values", rule=my_rule)
>>> print(metrics)

>>> rule_name : PLANETE is alphabetic on non null values
>>> total_row_count : 3
>>> valid_row_count : 3
>>> valid_row_count_pct : 100.0
>>> timestamp : 2024-01-01 00:00:00.000000