# Projects/3BIT/winter-semester/IZV/2/analysis.py

#!/usr/bin/env python3
# coding=utf-8
"""
Author: Roman Nečas (xnecasr00)
Date: 8.11.2025

Traffic Accident Data Analysis Module.

This module provides functions for loading, parsing, and visualizing
Czech traffic accident data from 2023-2025.

Functions:
    load_data: Load data from ZIP archive
    parse_data: Parse and clean accident data
    plot_state: Visualize accidents by driver state
    plot_alcohol: Visualize alcohol-related accidents by consequences
    plot_conditions: Visualize accidents over time by weather conditions
"""

from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import zipfile

# muzete pridat libovolnou zakladni knihovnu ci knihovnu predstavenou na prednaskach
# dalsi knihovny pak na dotaz

# Ukol 1: nacteni dat ze ZIP souboru


def load_data(filename: str, ds: str,
              years=('2023', '2024', '2025')) -> pd.DataFrame:
    """
    Load one traffic accident dataset from a ZIP archive and stack all years.

    For every requested year the archive member ``<year>/I<ds>.xls`` is read
    and the first table found in it is taken. The per-year tables are
    concatenated in the order given by ``years``.

    Args:
        filename: Path to ZIP file containing data
        ds: Dataset name without 'I' prefix (e.g., 'nehody', 'Vozidla', 'nasledky')
        years: Year folders inside the archive to read, in output order.
            Defaults to 2023, 2024 and 2025.

    Returns:
        DataFrame containing concatenated data from all requested years,
        with columns whose label starts with 'Unnamed' removed

    Raises:
        KeyError: If the archive has no member for a requested year/dataset
        ValueError: If ``years`` is empty (there is nothing to concatenate)
    """
    member_name = f"I{ds}.xls"
    frames = []

    with zipfile.ZipFile(filename, 'r') as archive:
        for year in years:
            with archive.open(f"{year}/{member_name}") as member:
                # Despite the .xls extension the members are parsed as HTML
                # tables encoded in cp1250; only the first table is used.
                frames.append(pd.read_html(member, encoding='cp1250')[0])

    result = pd.concat(frames, ignore_index=True)

    # Cast labels to str first so the check also works when a table yields
    # non-string column labels (the .str accessor rejects those).
    unnamed = result.columns.astype(str).str.startswith('Unnamed')
    return result.loc[:, ~unnamed]

# Ukol 2: zpracovani dat


def parse_data(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    Derive date and region columns and drop repeated accident records.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw DataFrame from load_data()
        verbose: If True, print the deep memory usage of the result in MB

    Returns:
        DataFrame with added 'date' (parsed from 'p2a') and 'region'
        (mapped from 'p4a') columns, keeping the first row per 'p1'
    """
    # Numeric region code (p4a) -> region abbreviation
    region_codes = {
        0: "PHA", 1: "STC", 2: "JHC", 3: "PLK", 4: "ULK", 5: "HKK",
        6: "JHM", 7: "MSK", 14: "OLK", 15: "ZLK", 16: "VYS", 17: "PAK",
        18: "LBK", 19: "KVK"
    }

    # assign() works on a copy, so the caller's frame is not modified.
    # p2a is formatted as DD.MM.YYYY; region is stored as a category because
    # it holds only a handful of repeated labels.
    parsed = df.assign(
        date=pd.to_datetime(df['p2a'], format='%d.%m.%Y'),
        region=df['p4a'].map(region_codes).astype('category'),
    ).drop_duplicates(subset=['p1'])

    if verbose:
        # 1 MB is taken as 10^6 bytes
        size_mb = parsed.memory_usage(deep=True).sum() / 1_000_000
        print(f"new_size={size_mb:.1f} MB")

    return parsed


# Ukol 3: počty nehod v jednotlivých regionech podle stavu řidiče
def plot_state(df: pd.DataFrame, df_vehicles: pd.DataFrame, fig_location: str = None,
               show_figure: bool = False):
    """
    Plot accident counts by driver state across regions.

    Draws a 3 x 2 grid of bar charts, one panel per driver state. Each panel
    shows, per region, how many vehicle rows with that driver state belong to
    an accident in ``df``, sorted from the highest count to the lowest.

    Args:
        df: Parsed DataFrame from parse_data() (needs 'p1' and 'region')
        df_vehicles: DataFrame with vehicle information (needs 'p1' and 'p57')
        fig_location: Path to save figure; nothing is saved when empty/None
        show_figure: Whether to display the figure; when False the figure is
            closed before returning
    """
    # Driver states that get a panel, keyed by their p57 code. Code 9 ("Jiné")
    # is deliberately not included: the grid has exactly six panels.
    state_map = {
        3: "Nemoc, únava",
        4: "Pod vlivem léků",
        5: "Invalida",
        6: "Řidič při jiné činnosti",
        7: "Řidič se nevěnoval řízení",
        8: "Náhlá fyzická indispozice"
    }

    # Narrow both sides before merging: only the relevant vehicle rows and
    # only the two accident columns that are used below. The merge result is
    # a fresh frame, so adding a column to it raises no SettingWithCopyWarning.
    vehicles = df_vehicles.loc[
        df_vehicles['p57'].isin(list(state_map)), ['p1', 'p57']
    ]
    merged = df[['p1', 'region']].merge(vehicles, on='p1', how='inner')
    merged['driver_state'] = merged['p57'].map(state_map).astype('category')

    # One row per (region, driver state) with the number of merged rows.
    # NOTE(review): every matching vehicle row is counted, so if df_vehicles
    # holds several rows per accident with the same state, that accident
    # contributes more than once - confirm this is the intended count.
    plot_data = merged.groupby(
        ['region', 'driver_state'], observed=True
    ).size().reset_index(name='count')

    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12), constrained_layout=True)
    background_color = '#f0f0f0'  # Light gray background

    for ax, state in zip(axes.flat, state_map.values()):
        # Regions ordered by count (descending) within the current state
        state_data = plot_data[plot_data['driver_state'] == state]
        state_data = state_data.sort_values('count', ascending=False)

        ax.bar(state_data['region'], state_data['count'],
               color='#5b8db8',  # Steel blue color for bars (neutral)
               alpha=0.8)

        ax.set_facecolor(background_color)
        ax.set_title(f'Stav řidiče: {state}', fontsize=11, fontweight='bold')
        ax.set_xlabel('Kraj', fontsize=9)
        ax.set_ylabel('Počet nehod', fontsize=9)

        # Rotated region labels stay readable on the narrow panels
        ax.tick_params(axis='x', rotation=45, labelsize=8)
        ax.tick_params(axis='y', labelsize=8)

        # Horizontal grid drawn behind the bars
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Save through the figure object so the right figure is written even if
    # another figure has become pyplot's "current" one.
    if fig_location:
        fig.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(fig)


# Ukol4: alkohol a roky v krajích
def plot_alcohol(df: pd.DataFrame, df_consequences: pd.DataFrame,
                 fig_location: str = None, show_figure: bool = False):
    """
    Plot accident consequences with alcohol involvement by region and year.

    Accidents with ``p11 >= 3`` that happened in months 1-10 are joined with
    their consequence rows; the rows are then counted per region, year and
    injury type and drawn as one bar chart per injury type.

    Args:
        df: Parsed DataFrame from parse_data() (needs 'p1', 'p11', 'date',
            'region')
        df_consequences: DataFrame with consequence information (needs 'p1'
            and 'p59g')
        fig_location: Path to save figure; nothing is saved when empty/None
        show_figure: Whether to display the figure; when False the figure is
            closed before returning
    """
    # Select the accidents before merging: p11 >= 3 is treated as alcohol
    # involvement and only months 1-10 are kept (Nov/Dec excluded). Filtering
    # first keeps the merge small, and the merge result is a fresh frame, so
    # the column assignments below raise no SettingWithCopyWarning.
    selected = df[(df['p11'] >= 3) & (df['date'].dt.month <= 10)]

    # 1:n relationship - one accident can have multiple consequences.
    # Inner join keeps only accidents that have at least one consequence row.
    merged = selected.merge(df_consequences, on='p1', how='inner')
    merged['year'] = merged['date'].dt.year

    # Map p59g to injury type labels; codes outside the map become NaN
    injury_map = {
        1: "Usmrcení",
        2: "Těžké zranění",
        3: "Lehké zranění",
        4: "Bez zranění"
    }
    merged['injury_type'] = merged['p59g'].map(injury_map).astype('category')
    merged = merged.dropna(subset=['injury_type'])

    # Number of consequence rows per region, year, and injury type
    plot_data = merged.groupby(
        ['region', 'year', 'injury_type'], observed=True
    ).size().reset_index(name='count')

    # One panel per injury type, bars grouped by year
    g = sns.catplot(
        data=plot_data,
        x='region',
        y='count',
        hue='year',
        col='injury_type',
        kind='bar',
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        palette='Set2',
        sharex=True,
        sharey=False  # Independent y-axis scaling for each subplot
    )

    g.set_axis_labels("Kraj", "Počet následků")
    g.set_titles("Následky nehody: {col_name}")

    # Show rotated x tick labels on ALL subplots (not just the bottom ones)
    # and use the same background/grid styling as the other plots
    background_color = '#f0f0f0'  # Light gray background
    for ax in g.axes.flat:
        ax.tick_params(axis='x', rotation=45, labelbottom=True)
        ax.set_facecolor(background_color)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Move the legend outside to the right via the public seaborn API instead
    # of poking at the private g._legend; skip it if the grid has no legend.
    if g.legend is not None:
        sns.move_legend(g, 'center left', bbox_to_anchor=(1.05, 0.5), title='Rok')

    # Adjust layout to prevent overlaps
    g.figure.tight_layout()

    # Save through the grid's own figure rather than pyplot's current figure
    if fig_location:
        g.figure.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(g.figure)


# Ukol 5: Podmínky v čase
def plot_conditions(df: pd.DataFrame, fig_location: str = None,
                    show_figure: bool = False):
    """
    Plot accident counts over time by conditions for selected regions.

    For the regions JHM, MSK, OLK and ZLK the accidents are counted per day
    and condition, summed to monthly totals and drawn as one line chart per
    region with one line per condition.

    Args:
        df: Parsed DataFrame from parse_data() (needs 'p1', 'p11', 'date',
            'region')
        fig_location: Path to save figure; nothing is saved when empty/None
        show_figure: Whether to display the figure; when False the figure is
            closed before returning
    """
    selected_regions = ['JHM', 'MSK', 'OLK', 'ZLK']
    # .copy() so that adding the 'condition' column does not touch the caller's frame
    df_filtered = df[df['region'].isin(selected_regions)].copy()

    # Map p11 to condition labels.
    # NOTE(review): plot_alcohol in this file reads the same p11 column as an
    # alcohol indicator (p11 >= 3), while this map mixes weather and alcohol
    # labels - confirm that p11 is the right source column for "conditions".
    condition_map = {
        0: "jiné",
        1: "na počátku deště",
        2: "mlha",
        3: "alkohol 1 ‰ a více",
        4: "alkohol do 0,99 ‰",
        5: "pod vlivem drog",
        6: "náledí",
        7: "nárazový vítr",
        8: "sněžení",
        9: "déšť"
    }
    df_filtered['condition'] = df_filtered['p11'].map(condition_map).astype('category')

    # Rows whose p11 value is not in the map have no condition and are dropped
    df_filtered = df_filtered.dropna(subset=['condition'])

    # Wide table: one row per (date, region), one column per condition,
    # cells hold the number of p1 values (0 where there are none)
    pivot = df_filtered.pivot_table(
        values='p1',
        index=['date', 'region'],
        columns='condition',
        aggfunc='count',
        fill_value=0,
        observed=True
    ).reset_index()

    # Monthly totals are computed region by region, because resample()
    # needs a plain datetime index
    monthly_frames = []
    for region in selected_regions:
        per_day = pivot[pivot['region'] == region].set_index('date')
        per_day = per_day.drop(columns=['region'])

        # 'MS' = Month Start; daily counts are summed within each month
        monthly = per_day.resample('MS').sum()
        monthly['region'] = region
        monthly_frames.append(monthly.reset_index())

    monthly_data = pd.concat(monthly_frames, ignore_index=True)

    # Wide -> long: one row per (date, region, condition) for seaborn
    plot_data = monthly_data.melt(
        id_vars=['date', 'region'],
        var_name='Podmínky',
        value_name='Počet nehod'
    )

    g = sns.relplot(
        data=plot_data,
        x='date',
        y='Počet nehod',
        hue='Podmínky',
        col='region',
        kind='line',
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        facet_kws={'sharex': True, 'sharey': False},
        legend='brief'
    )

    g.set_axis_labels("Datum", "Počet nehod")
    g.set_titles("Kraj: {col_name}")

    background_color = '#f0f0f0'  # Light gray background
    for ax in g.axes.flat:
        # NOTE(review): the axis is clipped at 2025-01-01, so any later months
        # aggregated above are computed but not visible - confirm intended.
        ax.set_xlim(pd.Timestamp('2023-01-01'), pd.Timestamp('2025-01-01'))
        ax.tick_params(axis='x', rotation=45, labelbottom=True)
        ax.set_facecolor(background_color)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Move the legend outside to the right via the public seaborn API instead
    # of poking at the private g._legend; skip it if the grid has no legend.
    if g.legend is not None:
        sns.move_legend(g, 'center left', bbox_to_anchor=(1.05, 0.5), title='Podmínky')

    # Adjust layout to prevent overlaps
    g.figure.tight_layout()

    # Save through the grid's own figure rather than pyplot's current figure
    if fig_location:
        g.figure.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(g.figure)


if __name__ == "__main__":
    # Example usage; this part may be modified freely. During testing the
    # script is not run directly - the individual functions are called
    # instead.
    archive = "data_23_25.zip"

    # Load the three datasets from the same archive
    df = load_data(archive, "nehody")
    df_consequences = load_data(archive, "nasledky")
    df_vehicles = load_data(archive, "Vozidla")

    # Clean the accident table and report its memory footprint
    df2 = parse_data(df, verbose=True)

    # Render the three figures to PNG files
    plot_state(df2, df_vehicles, fig_location="01_state.png")
    plot_alcohol(df2, df_consequences, fig_location="02_alcohol.png")
    plot_conditions(df2, fig_location="03_conditions.png")

# Note:
# To avoid reloading the data over and over, you can use e.g. VS Code and
# mark sections of the file as cells (a line containing `# %%`).
# The data can then be loaded once and the individual functions debugged
# separately. Alternatively, you can save the resulting DataFrame somewhere
# on disk (for debugging purposes) and load it from there already parsed.