#!/usr/bin/env python3
# coding=utf-8
"""
Traffic Accident Data Analysis Module.

Author: Roman Nečas (xnecasr00)
Date: 8.11.2025

This module provides functions for loading, parsing, and visualizing
Czech traffic accident data from 2023-2025.

Functions:
    load_data: Load data from ZIP archive
    parse_data: Parse and clean accident data
    plot_state: Visualize accidents by driver state
    plot_alcohol: Visualize alcohol-related accidents by consequences
    plot_conditions: Visualize accidents over time by conditions
"""

import zipfile
from typing import Optional

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# You may add any standard-library module or any library introduced in the
# lectures; other libraries only on request.

# Years whose sub-directories are read from the archive.
_YEARS = ("2023", "2024", "2025")

# Light gray background shared by all subplots.
_BACKGROUND_COLOR = "#f0f0f0"

# Region code (column p4a) -> region abbreviation.
_REGION_CODES = {
    0: "PHA",
    1: "STC",
    2: "JHC",
    3: "PLK",
    4: "ULK",
    5: "HKK",
    6: "JHM",
    7: "MSK",
    14: "OLK",
    15: "ZLK",
    16: "VYS",
    17: "PAK",
    18: "LBK",
    19: "KVK",
}

# Driver state code (column p57) -> panel label.  Code 9 ("other") is left
# out on purpose: the figure has exactly six panels.
_DRIVER_STATES = {
    3: "Nemoc, únava",
    4: "Pod vlivem léků",
    5: "Invalida",
    6: "Řidič při jiné činnosti",
    7: "Řidič se nevěnoval řízení",
    8: "Náhlá fyzická indispozice",
}

# Consequence code (column p59g) -> injury label.
_INJURY_TYPES = {
    1: "Usmrcení",
    2: "Těžké zranění",
    3: "Lehké zranění",
    4: "Bez zranění",
}

# Code in column p11 -> label used by plot_conditions().
# NOTE(review): plot_alcohol() treats p11 >= 3 as "alcohol involved", while
# this table labels p11 values 6-9 as weather (ice, wind, snow, rain).  One of
# the two readings is presumably wrong, or the weather codes live in another
# column - confirm against the data codebook before trusting this figure.
_CONDITIONS = {
    0: "jiné",
    1: "na počátku deště",
    2: "mlha",
    3: "alkohol 1 ‰ a více",
    4: "alkohol do 0,99 ‰",
    5: "pod vlivem drog",
    6: "náledí",
    7: "nárazový vítr",
    8: "sněžení",
    9: "déšť",
}


def _style_axis(ax) -> None:
    """Apply the shared background, rotated x labels and dashed y grid."""
    ax.tick_params(axis="x", rotation=45, labelbottom=True)
    ax.set_facecolor(_BACKGROUND_COLOR)
    ax.grid(axis="y", alpha=0.3, linestyle="--")
    ax.set_axisbelow(True)  # keep the grid behind the bars / lines


def _finish_figure(fig, fig_location: Optional[str], show_figure: bool) -> None:
    """Save *fig* when a path is given, then either show it or close it."""
    if fig_location:
        # Save through the figure object so the result does not depend on
        # which figure happens to be pyplot's "current" one.
        fig.savefig(fig_location, dpi=100, bbox_inches="tight")
    if show_figure:
        plt.show()
    else:
        plt.close(fig)


# Task 1: load the data from the ZIP file
def load_data(filename: str, ds: str) -> pd.DataFrame:
    """
    Load traffic accident data from ZIP file for years 2023-2025.

    For every year the archive member ``<year>/I<ds>.xls`` is parsed with
    ``pandas.read_html`` (cp1250 encoding) and the first table found is used.

    Args:
        filename: Path to ZIP file containing data
        ds: Dataset name without 'I' prefix
            (e.g., 'nehody', 'Vozidla', 'nasledky')

    Returns:
        DataFrame containing concatenated data from all three years, with
        columns whose label starts with 'Unnamed' removed.

    Raises:
        KeyError: If one of the expected members is missing in the archive.
    """
    member_name = f"I{ds}.xls"
    frames = []

    with zipfile.ZipFile(filename, "r") as archive:
        for year in _YEARS:
            with archive.open(f"{year}/{member_name}") as member:
                frames.append(pd.read_html(member, encoding="cp1250")[0])

    combined = pd.concat(frames, ignore_index=True)

    # Cast labels to str first so that non-string column labels cannot break
    # the string accessor.
    unnamed = combined.columns.astype(str).str.startswith("Unnamed")
    return combined.loc[:, ~unnamed]


# Task 2: data processing
def parse_data(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    Parse and clean accident data.

    Adds a ``date`` column parsed from ``p2a`` (format DD.MM.YYYY) and a
    categorical ``region`` column mapped from ``p4a``, then drops rows with a
    duplicated accident ID ``p1`` (the first occurrence is kept).  The input
    frame is not modified.

    Args:
        df: Raw DataFrame from load_data()
        verbose: If True, print memory usage information

    Returns:
        Cleaned DataFrame with date and region columns
    """
    parsed = df.copy()  # never modify the caller's frame

    parsed["date"] = pd.to_datetime(parsed["p2a"], format="%d.%m.%Y")

    # Category dtype: only a handful of distinct, heavily repeated strings.
    parsed["region"] = parsed["p4a"].map(_REGION_CODES).astype("category")

    parsed = parsed.drop_duplicates(subset=["p1"])

    if verbose:
        memory_mb = parsed.memory_usage(deep=True).sum() / 1_000_000  # 1 MB = 10^6 B
        print(f"new_size={memory_mb:.1f} MB")

    return parsed


# Task 3: accident counts in the individual regions by driver state
def plot_state(df: pd.DataFrame, df_vehicles: pd.DataFrame,
               fig_location: Optional[str] = None,
               show_figure: bool = False) -> None:
    """
    Plot accident counts by driver state across regions.

    Draws a 3 x 2 grid of bar charts, one per driver state coded 3-8 in
    ``p57``; within each panel the regions are sorted by descending count.

    Args:
        df: Parsed DataFrame from parse_data()
        df_vehicles: DataFrame with vehicle information (columns p1, p57)
        fig_location: Path to save figure
        show_figure: Whether to display the figure
    """
    # Filter the vehicles before merging: this yields the same rows as
    # "merge everything, then filter", but the merge is smaller and the
    # result is a fresh frame that is safe to add columns to.
    wanted = df_vehicles["p57"].isin(list(_DRIVER_STATES))
    vehicles = df_vehicles.loc[wanted, ["p1", "p57"]]
    merged = df.merge(vehicles, on="p1", how="inner")
    merged["driver_state"] = merged["p57"].map(_DRIVER_STATES)

    counts = (
        merged.groupby(["region", "driver_state"], observed=True)
        .size()
        .reset_index(name="count")
    )

    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12),
                             constrained_layout=True)

    for ax, state in zip(axes.flat, _DRIVER_STATES.values()):
        panel = counts[counts["driver_state"] == state]
        panel = panel.sort_values("count", ascending=False)

        ax.bar(panel["region"].astype(str), panel["count"],
               color="#5b8db8",  # neutral steel blue
               alpha=0.8)

        ax.set_title(f"Stav řidiče: {state}", fontsize=11, fontweight="bold")
        ax.set_xlabel("Kraj", fontsize=9)
        ax.set_ylabel("Počet nehod", fontsize=9)
        ax.tick_params(axis="x", rotation=45, labelsize=8)
        ax.tick_params(axis="y", labelsize=8)
        ax.set_facecolor(_BACKGROUND_COLOR)
        ax.grid(axis="y", alpha=0.3, linestyle="--")
        ax.set_axisbelow(True)

    _finish_figure(fig, fig_location, show_figure)


# Task 4: alcohol and years in the regions
def plot_alcohol(df: pd.DataFrame, df_consequences: pd.DataFrame,
                 fig_location: Optional[str] = None,
                 show_figure: bool = False) -> None:
    """
    Plot accident consequences with alcohol involvement by region and year.

    Accidents are joined with their consequences (1:n on ``p1``), restricted
    to rows with ``p11 >= 3`` (treated as alcohol involvement) and to months
    1-10, and counted per region, year and injury type (``p59g``).  One bar
    chart per injury type is drawn, with the year as hue.

    Args:
        df: Parsed DataFrame from parse_data()
        df_consequences: DataFrame with consequence information
        fig_location: Path to save figure
        show_figure: Whether to display the figure
    """
    # Inner join: only accidents that have at least one consequence row.
    merged = df.merge(df_consequences, on="p1", how="inner")

    keep = (merged["p11"] >= 3) & (merged["date"].dt.month <= 10)
    # Explicit copy so the column assignments below never write into a
    # slice of `merged`.
    selected = merged.loc[keep].copy()
    selected["year"] = selected["date"].dt.year
    selected["injury_type"] = (
        selected["p59g"].map(_INJURY_TYPES).astype("category")
    )
    selected = selected.dropna(subset=["injury_type"])

    counts = (
        selected.groupby(["region", "year", "injury_type"], observed=True)
        .size()
        .reset_index(name="count")
    )

    g = sns.catplot(
        data=counts,
        x="region",
        y="count",
        hue="year",
        col="injury_type",
        kind="bar",
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        palette="Set2",
        sharex=True,
        sharey=False,  # independent y-axis scaling for each subplot
    )

    g.set_axis_labels("Kraj", "Počet následků")
    g.set_titles("Následky nehody: {col_name}")

    # Show the x tick labels on every subplot, not only on the bottom row.
    for ax in g.axes.flat:
        _style_axis(ax)

    # Public seaborn API instead of poking at the private g._legend.
    sns.move_legend(g, "center left", bbox_to_anchor=(1.05, 0.5), title="Rok")

    g.figure.tight_layout()
    _finish_figure(g.figure, fig_location, show_figure)


# Task 5: conditions over time
def plot_conditions(df: pd.DataFrame, fig_location: Optional[str] = None,
                    show_figure: bool = False) -> None:
    """
    Plot accident counts over time by conditions for selected regions.

    For the regions JHM, MSK, OLK and ZLK the accidents are counted per day
    and condition (mapped from ``p11``), summed to monthly totals and drawn
    as one line plot per region.

    Args:
        df: Parsed DataFrame from parse_data()
        fig_location: Path to save figure
        show_figure: Whether to display the figure
    """
    selected_regions = ["JHM", "MSK", "OLK", "ZLK"]
    regional = df[df["region"].isin(selected_regions)].copy()

    regional["condition"] = (
        regional["p11"].map(_CONDITIONS).astype("category")
    )
    regional = regional.dropna(subset=["condition"])

    # Wide table: one row per (date, region), one count column per condition.
    daily = regional.pivot_table(
        values="p1",
        index=["date", "region"],
        columns="condition",
        aggfunc="count",
        fill_value=0,
        observed=True,
    ).reset_index()

    # Resample each region on its own so that every region gets a gap-free
    # monthly index ('MS' = month start); months without accidents sum to 0.
    monthly_frames = [
        daily[daily["region"] == region]
        .drop(columns=["region"])
        .set_index("date")
        .resample("MS")
        .sum()
        .assign(region=region)
        .reset_index()
        for region in selected_regions
    ]
    monthly = pd.concat(monthly_frames, ignore_index=True)

    # Back to long format, which is what seaborn expects.
    long_data = monthly.melt(
        id_vars=["date", "region"],
        var_name="Podmínky",
        value_name="Počet nehod",
    )

    g = sns.relplot(
        data=long_data,
        x="date",
        y="Počet nehod",
        hue="Podmínky",
        col="region",
        kind="line",
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        facet_kws={"sharex": True, "sharey": False},
        legend="brief",
    )

    g.set_axis_labels("Datum", "Počet nehod")
    g.set_titles("Kraj: {col_name}")

    # NOTE(review): the upper limit hides every month of 2025 although
    # load_data() reads 2025 as well - confirm this is the intended range.
    x_min = pd.Timestamp("2023-01-01")
    x_max = pd.Timestamp("2025-01-01")
    for ax in g.axes.flat:
        ax.set_xlim(x_min, x_max)
        _style_axis(ax)

    # Public seaborn API instead of poking at the private g._legend.
    sns.move_legend(g, "center left", bbox_to_anchor=(1.05, 0.5),
                    title="Podmínky")

    g.figure.tight_layout()
    _finish_figure(g.figure, fig_location, show_figure)


def main() -> None:
    """Run the usage example: load, parse and draw all three figures."""
    archive = "data_23_25.zip"
    df = load_data(archive, "nehody")
    df_consequences = load_data(archive, "nasledky")
    df_vehicles = load_data(archive, "Vozidla")
    df2 = parse_data(df, True)

    plot_state(df2, df_vehicles, "01_state.png")
    plot_alcohol(df2, df_consequences, "02_alcohol.png")
    plot_conditions(df2, "03_conditions.png")


if __name__ == "__main__":
    # This is a usage example; this part may be modified freely.  During
    # testing the script is not run directly - the individual functions are
    # called instead.
    main()

# Note:
# To avoid reloading the data over and over, you can use e.g. VS Code and
# mark cells (line #%%%).  Then the data can be loaded once and the
# individual functions debugged afterwards.
# Alternatively, the resulting dataframe can be stored on disk (for debugging
# purposes) and loaded from there already parsed.