Skip to content

API Reference

Auto-generated code documentation.

datakit_lite

datakit_lite: helpful utilities for Python and analytics education.

log_duration

log_duration(label: str)

Context manager that prints how long a block takes.

Source code in src/datakit_lite/timer.py
34
35
36
37
38
39
40
41
@contextmanager
def log_duration(label: str):
    """Context manager that prints how long a block takes.

    The elapsed time is printed when the block exits, whether it finishes
    normally or raises. An exception raised inside the block propagates
    unchanged after the message has been printed.

    Parameters:
    ----------
    label : str
        Text placed at the start of the printed message.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        # Without try/finally the code after ``yield`` is skipped whenever the
        # managed block raises, so failing blocks would never be reported.
        duration = round(time.perf_counter() - start, 4)
        print(f"{label} took {duration} seconds")

project_paths

project_paths(root: str | Path = '.') -> ProjectPaths

Return a set of standard project directories, creating them if needed.

Directories

data/raw, data/clean, reports, models

Source code in src/datakit_lite/paths.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def project_paths(root: str | Path = ".") -> ProjectPaths:
    """Build the standard project layout under ``root`` and return its paths.

    ``root`` is resolved to an absolute path first. Each directory listed
    below is then created, together with any missing parents; directories
    that already exist are left alone.

    Directories:
        data/raw
        data/clean
        reports
        models
    """
    base = Path(root).resolve()

    # Keyed by the ProjectPaths field each directory is stored under.
    layout = {
        "data_raw": base / "data" / "raw",
        "data_clean": base / "data" / "clean",
        "reports": base / "reports",
        "models": base / "models",
    }

    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)

    return ProjectPaths(root=base, **layout)

summarize_table

summarize_table(df: DataFrame) -> pd.DataFrame

Return a simple summary of a pandas DataFrame.

Columns

name: column name; dtype: pandas dtype; non_null: count of non-null values; total: total rows; missing_pct: percent missing (0-100); unique: number of unique values

Source code in src/datakit_lite/summary.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def summarize_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return a simple summary of a pandas DataFrame.

    One output row is produced per input column, in column order. The
    result always carries the columns listed below, even when ``df`` has
    no columns at all.

    Columns:
        name: column name
        dtype: pandas dtype
        non_null: count of non-null values
        total: total rows
        missing_pct: percent missing (0-100); 0.0 when the table has no rows
        unique: number of unique values
    """
    summary_columns = ["name", "dtype", "non_null", "total", "missing_pct", "unique"]
    rows = []

    total = len(df)

    # items() hands back every column as a Series, one at a time, so
    # duplicated column labels are summarized instead of raising.
    for col, series in df.items():
        non_null = series.notna().sum()
        if total:
            missing_pct = round((1 - non_null / total) * 100, 2)
        else:
            # A table without rows has nothing missing; this also avoids the
            # 0/0 division that would otherwise yield NaN plus a warning.
            missing_pct = 0.0
        unique = series.nunique(dropna=True)

        rows.append(
            {
                "name": col,
                "dtype": str(series.dtype),
                "non_null": int(non_null),
                "total": int(total),
                "missing_pct": missing_pct,
                "unique": int(unique),
            }
        )

    # Passing the column list keeps the documented schema for an empty result.
    return pd.DataFrame(rows, columns=summary_columns)

timeit

timeit(fn)

Print how long a function takes.

Parameters:

fn (callable): The function to be timed.

Source code in src/datakit_lite/timer.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def timeit(fn):
    """Print how long a function takes.

    Decorator: each call to the wrapped function prints
    ``"<function name> took <seconds> seconds"`` and passes the function's
    return value through unchanged. The line is printed even when the
    function raises; the exception then propagates to the caller.

    Parameters:
    ----------
    fn : callable
        The function to be timed.

    Returns:
    -------
    callable
        Wrapper that forwards all positional and keyword arguments to ``fn``.
    """

    @wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return fn(*args, **kwargs)
        finally:
            # finally: a call that raises is still timed and reported.
            duration = round(time.perf_counter() - start, 4)
            print(f"{fn.__name__} took {duration} seconds")

    return wrapper

paths

Project path management utilities.

This module provides:

- ProjectPaths: dataclass for organizing standard project directories
- project_paths: function that creates and returns the project directory structure

ProjectPaths dataclass

Standard project directory paths.

Attributes:

- root (Path): The root directory of the project.
- data_raw (Path): Directory for raw data files.
- data_clean (Path): Directory for cleaned/processed data files.
- reports (Path): Directory for reports and output files.
- models (Path): Directory for model files and artifacts.

Source code in src/datakit_lite/paths.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
@dataclass
class ProjectPaths:
    """Standard project directory paths.

    Plain container for the directory locations of one project. The
    ``project_paths`` function builds an instance whose directories all sit
    under ``root`` and have already been created on disk; constructing this
    class directly performs no filesystem access or validation.

    Attributes:
    ----------
    root : Path
        The root directory of the project.
    data_raw : Path
        Directory for raw data files (``<root>/data/raw`` when built by
        ``project_paths``).
    data_clean : Path
        Directory for cleaned/processed data files (``<root>/data/clean``
        when built by ``project_paths``).
    reports : Path
        Directory for reports and output files (``<root>/reports`` when
        built by ``project_paths``).
    models : Path
        Directory for model files and artifacts (``<root>/models`` when
        built by ``project_paths``).
    """

    # Field order is part of the interface: it fixes the positional order of
    # the generated __init__.
    root: Path
    data_raw: Path
    data_clean: Path
    reports: Path
    models: Path

project_paths

project_paths(root: str | Path = '.') -> ProjectPaths

Return a set of standard project directories, creating them if needed.

Directories

data/raw, data/clean, reports, models

Source code in src/datakit_lite/paths.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def project_paths(root: str | Path = ".") -> ProjectPaths:
    """Build the standard project layout under ``root`` and return its paths.

    ``root`` is resolved to an absolute path first. Each directory listed
    below is then created, together with any missing parents; directories
    that already exist are left alone.

    Directories:
        data/raw
        data/clean
        reports
        models
    """
    base = Path(root).resolve()

    # Keyed by the ProjectPaths field each directory is stored under.
    layout = {
        "data_raw": base / "data" / "raw",
        "data_clean": base / "data" / "clean",
        "reports": base / "reports",
        "models": base / "models",
    }

    for directory in layout.values():
        directory.mkdir(parents=True, exist_ok=True)

    return ProjectPaths(root=base, **layout)

summary

Summary utilities for pandas DataFrames.

This module provides functions to generate summary statistics and metadata for pandas DataFrames, including information about column types, missing values, and unique value counts.

summarize_table

summarize_table(df: DataFrame) -> pd.DataFrame

Return a simple summary of a pandas DataFrame.

Columns

name: column name; dtype: pandas dtype; non_null: count of non-null values; total: total rows; missing_pct: percent missing (0-100); unique: number of unique values

Source code in src/datakit_lite/summary.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def summarize_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return a simple summary of a pandas DataFrame.

    One output row is produced per input column, in column order. The
    result always carries the columns listed below, even when ``df`` has
    no columns at all.

    Columns:
        name: column name
        dtype: pandas dtype
        non_null: count of non-null values
        total: total rows
        missing_pct: percent missing (0-100); 0.0 when the table has no rows
        unique: number of unique values
    """
    summary_columns = ["name", "dtype", "non_null", "total", "missing_pct", "unique"]
    rows = []

    total = len(df)

    # items() hands back every column as a Series, one at a time, so
    # duplicated column labels are summarized instead of raising.
    for col, series in df.items():
        non_null = series.notna().sum()
        if total:
            missing_pct = round((1 - non_null / total) * 100, 2)
        else:
            # A table without rows has nothing missing; this also avoids the
            # 0/0 division that would otherwise yield NaN plus a warning.
            missing_pct = 0.0
        unique = series.nunique(dropna=True)

        rows.append(
            {
                "name": col,
                "dtype": str(series.dtype),
                "non_null": int(non_null),
                "total": int(total),
                "missing_pct": missing_pct,
                "unique": int(unique),
            }
        )

    # Passing the column list keeps the documented schema for an empty result.
    return pd.DataFrame(rows, columns=summary_columns)

timer

Timer utilities for measuring function and code block execution time.

This module provides:

- timeit: decorator for timing function execution
- log_duration: context manager for timing code blocks

log_duration

log_duration(label: str)

Context manager that prints how long a block takes.

Source code in src/datakit_lite/timer.py
34
35
36
37
38
39
40
41
@contextmanager
def log_duration(label: str):
    """Context manager that prints how long a block takes.

    The elapsed time is printed when the block exits, whether it finishes
    normally or raises. An exception raised inside the block propagates
    unchanged after the message has been printed.

    Parameters:
    ----------
    label : str
        Text placed at the start of the printed message.
    """
    start = time.perf_counter()
    try:
        yield
    finally:
        # Without try/finally the code after ``yield`` is skipped whenever the
        # managed block raises, so failing blocks would never be reported.
        duration = round(time.perf_counter() - start, 4)
        print(f"{label} took {duration} seconds")

timeit

timeit(fn)

Print how long a function takes.

Parameters:

fn (callable): The function to be timed.

Source code in src/datakit_lite/timer.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def timeit(fn):
    """Print how long a function takes.

    Decorator: each call to the wrapped function prints
    ``"<function name> took <seconds> seconds"`` and passes the function's
    return value through unchanged. The line is printed even when the
    function raises; the exception then propagates to the caller.

    Parameters:
    ----------
    fn : callable
        The function to be timed.

    Returns:
    -------
    callable
        Wrapper that forwards all positional and keyword arguments to ``fn``.
    """

    @wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return fn(*args, **kwargs)
        finally:
            # finally: a call that raises is still timed and reported.
            duration = round(time.perf_counter() - start, 4)
            print(f"{fn.__name__} took {duration} seconds")

    return wrapper