Projects/3BIT/winter-semester/IZV/2/analysis.py
2026-04-14 19:28:46 +02:00

440 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# coding=utf-8
"""
Author: Roman Nečas (xnecasr00)
Date: 8.11.2025
Traffic Accident Data Analysis Module.
This module provides functions for loading, parsing, and visualizing
Czech traffic accident data from 2023-2025.
Functions:
load_data: Load data from ZIP archive
parse_data: Parse and clean accident data
plot_state: Visualize accidents by driver state
plot_alcohol: Visualize alcohol-related accidents by consequences
plot_conditions: Visualize accidents over time by weather conditions
"""
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import zipfile
# You may add any standard-library module or any library introduced in the lectures;
# other libraries only on request.
# Task 1: load the data from the ZIP file
def load_data(filename: str, ds: str,
              years: tuple = ('2023', '2024', '2025')) -> pd.DataFrame:
    """
    Load one traffic accident dataset from a ZIP archive for several years.

    For every requested year the archive member ``<year>/I<ds>.xls`` is
    opened and parsed with ``pd.read_html`` (cp1250 encoding); only the first
    table found in each member is used. The yearly tables are concatenated in
    the order given by ``years`` and columns whose label starts with
    ``Unnamed`` are dropped.

    Args:
        filename: Path to ZIP file containing data
        ds: Dataset name without 'I' prefix (e.g., 'nehody', 'Vozidla', 'nasledky')
        years: Year folder names to read from the archive; defaults to
            2023-2025. Any iterable of values usable in an f-string works.
    Returns:
        DataFrame containing concatenated data from all requested years,
        with a fresh 0..n-1 index.
    Raises:
        ValueError: If ``years`` is empty.
        KeyError: If an expected member is missing from the archive
            (raised by ``zipfile``).
    """
    years = tuple(years)
    if not years:
        # pd.concat([]) would fail with a far less helpful message.
        raise ValueError("years must contain at least one year")

    # Archive members carry an 'I' prefix in front of the dataset name.
    member_name = 'I' + ds
    frames = []
    with zipfile.ZipFile(filename, 'r') as archive:
        for year in years:
            with archive.open(f"{year}/{member_name}.xls") as member:
                # The member is parsed as an HTML table, cp1250 encoded.
                frames.append(pd.read_html(member, encoding='cp1250')[0])

    result = pd.concat(frames, ignore_index=True)

    # Labels are cast to str first so the filter also works when the parser
    # produced non-string column labels (the .str accessor rejects those).
    unnamed = result.columns.astype(str).str.startswith('Unnamed')
    return result.loc[:, ~unnamed]
# Task 2: data processing
def parse_data(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    Add ``date`` and ``region`` columns and drop duplicated accidents.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Raw DataFrame from load_data(); must contain ``p1`` (accident
            ID), ``p2a`` (date text parsed as DD.MM.YYYY) and ``p4a``
            (numeric region code).
        verbose: If True, print the memory footprint of the result as
            ``new_size=<x> MB`` (1 MB = 10^6 bytes).
    Returns:
        DataFrame with the added ``date`` (datetime) and ``region``
        (categorical; NaN for codes missing from the mapping) columns,
        keeping only the first row for every ``p1`` value. The original
        index labels of the kept rows are preserved.
    """
    code_to_region = {
        0: "PHA",
        1: "STC",
        2: "JHC",
        3: "PLK",
        4: "ULK",
        5: "HKK",
        6: "JHM",
        7: "MSK",
        14: "OLK",
        15: "ZLK",
        16: "VYS",
        17: "PAK",
        18: "LBK",
        19: "KVK",
    }

    # assign() works on a copy, so the caller's frame is never modified.
    # The region labels repeat a lot, hence the category dtype.
    enriched = df.assign(
        date=pd.to_datetime(df['p2a'], format='%d.%m.%Y'),
        region=df['p4a'].map(code_to_region).astype('category'),
    )

    # One row per accident ID; the first occurrence wins.
    cleaned = enriched.drop_duplicates(subset=['p1'])

    if not verbose:
        return cleaned

    size_mb = cleaned.memory_usage(deep=True).sum() / 1_000_000
    print(f"new_size={size_mb:.1f} MB")
    return cleaned
# Task 3: accident counts in the individual regions by driver state
def plot_state(df: pd.DataFrame, df_vehicles: pd.DataFrame, fig_location: str = None,
               show_figure: bool = False):
    """
    Plot per-region counts of selected driver states as a grid of bar charts.

    The accidents are joined with the vehicle records on ``p1`` and only rows
    whose ``p57`` code is 3-9 are kept. Every remaining row counts once, so an
    accident with several matching vehicle rows contributes several times.
    Six of the seven mapped states are drawn (``"Jiné"`` is left out), one
    subplot per state in a 3 x 2 grid, with regions ordered by descending
    count.

    Args:
        df: Parsed DataFrame from parse_data(); must contain ``p1`` and
            ``region``.
        df_vehicles: DataFrame with vehicle information; must contain ``p1``
            and ``p57``.
        fig_location: Path to save the figure to; nothing is saved when the
            value is falsy.
        show_figure: If True the figure is displayed with ``plt.show()``,
            otherwise it is closed.
    """
    # p57 code -> label used in the subplot titles.
    state_map = {
        3: "Nemoc, únava",
        4: "Pod vlivem léků",
        5: "Invalida",
        6: "Řidič při jiné činnosti",
        7: "Řidič se nevěnoval řízení",
        8: "Náhlá fyzická indispozice",
        9: "Jiné"
    }
    # The grid has six panels, so code 9 ("Jiné") is mapped but not drawn.
    selected_states = [state_map[code] for code in (3, 4, 5, 6, 7, 8)]
    background_color = '#f0f0f0'  # Light gray panel background
    bar_color = '#5b8db8'  # Steel blue bars

    # Only p1 and p57 are taken from the vehicles to keep the merge small.
    merged = df.merge(df_vehicles[['p1', 'p57']], on='p1', how='left')
    # .copy() makes the filtered frame independent, so adding a column below
    # does not trigger pandas' SettingWithCopyWarning.
    merged = merged[merged['p57'].isin(list(state_map))].copy()
    merged['driver_state'] = merged['p57'].map(state_map).astype('category')

    # One row per (region, driver state) pair with the number of merged rows.
    plot_data = merged.groupby(
        ['region', 'driver_state'], observed=True
    ).size().reset_index(name='count')
    plot_data = plot_data[plot_data['driver_state'].isin(selected_states)]

    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12),
                             constrained_layout=True)

    for ax, state in zip(axes.flatten(), selected_states):
        # Regions sorted by count (descending) for easier comparison.
        state_data = plot_data[plot_data['driver_state'] == state]
        state_data = state_data.sort_values('count', ascending=False)

        ax.bar(state_data['region'], state_data['count'],
               color=bar_color, alpha=0.8)
        ax.set_facecolor(background_color)
        ax.set_title(f'Stav řidiče: {state}', fontsize=11, fontweight='bold')
        ax.set_xlabel('Kraj', fontsize=9)
        ax.set_ylabel('Počet nehod', fontsize=9)
        ax.tick_params(axis='x', rotation=45, labelsize=8)
        ax.tick_params(axis='y', labelsize=8)
        # Horizontal grid drawn behind the bars.
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    if fig_location:
        # Save through the figure object instead of pyplot's implicit
        # "current figure".
        fig.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(fig)
# Task 4: alcohol and years in the regions
def plot_alcohol(df: pd.DataFrame, df_consequences: pd.DataFrame,
                 fig_location: str = None, show_figure: bool = False):
    """
    Plot accident consequences with alcohol involvement by region and year.

    Accidents are inner-joined with their consequences on ``p1`` (one
    accident may have several consequence rows). Only rows with ``p11 >= 3``
    and a date in months 1-10 are kept. ``p59g`` is mapped to an injury
    label; rows with an unmapped value are dropped. The figure has one bar
    chart panel per injury label, regions on the x axis and one bar colour
    per year.

    Args:
        df: Parsed DataFrame from parse_data(); must provide ``p1``,
            ``date`` and ``region``.
        df_consequences: DataFrame with consequence information, joined on
            ``p1``.
        fig_location: Path to save the figure to; nothing is saved when the
            value is falsy.
        show_figure: If True the figure is displayed with ``plt.show()``,
            otherwise it is closed.
    """
    # p59g code -> injury label used in the panel titles.
    injury_map = {
        1: "Usmrcení",
        2: "Těžké zranění",
        3: "Lehké zranění",
        4: "Bez zranění"
    }
    background_color = '#f0f0f0'  # Light gray panel background

    # Inner join: accidents without any consequence row drop out.
    merged = df.merge(df_consequences, on='p1', how='inner')

    # p11 >= 3 is treated as alcohol involvement; November and December
    # are excluded. .copy() keeps the column assignments below from
    # triggering pandas' SettingWithCopyWarning.
    keep = (merged['p11'] >= 3) & (merged['date'].dt.month <= 10)
    merged = merged[keep].copy()
    merged['year'] = merged['date'].dt.year
    merged['injury_type'] = merged['p59g'].map(injury_map).astype('category')
    merged = merged.dropna(subset=['injury_type'])

    # Number of consequence rows per region, year and injury label.
    plot_data = merged.groupby(
        ['region', 'year', 'injury_type'], observed=True
    ).size().reset_index(name='count')

    g = sns.catplot(
        data=plot_data,
        x='region',
        y='count',
        hue='year',
        col='injury_type',
        kind='bar',
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        palette='Set2',
        sharex=True,
        sharey=False  # Independent y-axis scaling for each panel
    )
    g.set_axis_labels("Kraj", "Počet následků")
    g.set_titles("Následky nehody: {col_name}")

    for ax in g.axes.flat:
        # Show the region labels under every panel, not only the bottom row.
        ax.tick_params(axis='x', rotation=45, labelbottom=True)
        ax.set_facecolor(background_color)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Place the legend to the right of the panels through seaborn's public
    # API rather than the private ``_legend`` attribute.
    sns.move_legend(g, 'center left', bbox_to_anchor=(1.05, 0.5), title='Rok')

    g.figure.tight_layout()

    if fig_location:
        # bbox_inches='tight' makes the saved image include the legend that
        # sits outside the axes area.
        g.figure.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(g.figure)
# Task 5: conditions over time
def plot_conditions(df: pd.DataFrame, fig_location: str = None,
                    show_figure: bool = False):
    """
    Plot monthly accident counts by condition for four selected regions.

    Rows of the regions JHM, MSK, OLK and ZLK are kept, ``p11`` is mapped to
    a condition label (unmapped rows are dropped), accidents are counted per
    day, region and condition, and the daily counts are summed to calendar
    months separately for each region. The result is drawn as one line-plot
    panel per region with one line per condition.

    Args:
        df: Parsed DataFrame from parse_data(); must contain ``p1``, ``p11``,
            ``date`` and ``region``.
        fig_location: Path to save the figure to; nothing is saved when the
            value is falsy.
        show_figure: If True the figure is displayed with ``plt.show()``,
            otherwise it is closed.
    """
    selected_regions = ['JHM', 'MSK', 'OLK', 'ZLK']
    # NOTE(review): plot_alcohol() in this file treats every p11 >= 3 as
    # alcohol-related, while the labels below give codes 6-9 weather
    # meanings - confirm the column and labels against the data codebook.
    condition_map = {
        0: "jiné",
        1: "na počátku deště",
        2: "mlha",
        3: "alkohol 1 ‰ a více",
        4: "alkohol do 0,99 ‰",
        5: "pod vlivem drog",
        6: "náledí",
        7: "nárazový vítr",
        8: "sněžení",
        9: "déšť"
    }
    background_color = '#f0f0f0'  # Light gray panel background

    df_filtered = df[df['region'].isin(selected_regions)].copy()
    df_filtered['condition'] = df_filtered['p11'].map(condition_map).astype('category')
    df_filtered = df_filtered.dropna(subset=['condition'])

    # Wide table: one row per (date, region), one column per condition,
    # values are the number of accidents (non-null p1 entries).
    pivot = df_filtered.pivot_table(
        values='p1',
        index=['date', 'region'],
        columns='condition',
        aggfunc='count',
        fill_value=0,
        observed=True
    ).reset_index()

    # Resample each region on its own so every region gets a continuous
    # monthly series ('MS' = month start) of summed counts.
    resampled_dfs = []
    for region in selected_regions:
        region_data = pivot[pivot['region'] == region].set_index('date')
        region_data = region_data.drop(columns=['region'])
        monthly = region_data.resample('MS').sum()
        monthly['region'] = region
        resampled_dfs.append(monthly.reset_index())
    monthly_data = pd.concat(resampled_dfs, ignore_index=True)

    # Wide -> long: one row per (date, region, condition) for seaborn.
    plot_data = monthly_data.melt(
        id_vars=['date', 'region'],
        var_name='Podmínky',
        value_name='Počet nehod'
    )

    g = sns.relplot(
        data=plot_data,
        x='date',
        y='Počet nehod',
        hue='Podmínky',
        col='region',
        kind='line',
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        facet_kws={'sharex': True, 'sharey': False},
        legend='brief'
    )
    g.set_axis_labels("Datum", "Počet nehod")
    g.set_titles("Kraj: {col_name}")

    for ax in g.axes.flat:
        # NOTE(review): the x range stops at 2025-01-01 although load_data()
        # also reads 2025 - confirm this window is intended.
        ax.set_xlim(pd.Timestamp('2023-01-01'), pd.Timestamp('2025-01-01'))
        # Show the date labels under every panel, not only the bottom row.
        ax.tick_params(axis='x', rotation=45, labelbottom=True)
        ax.set_facecolor(background_color)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Place the legend to the right of the panels through seaborn's public
    # API rather than the private ``_legend`` attribute.
    sns.move_legend(g, 'center left', bbox_to_anchor=(1.05, 0.5),
                    title='Podmínky')

    g.figure.tight_layout()

    if fig_location:
        # bbox_inches='tight' makes the saved image include the legend that
        # sits outside the axes area.
        g.figure.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(g.figure)
if __name__ == "__main__":
    # Usage example; this part may be modified freely. During testing the
    # script is not run directly - the individual functions are called
    # instead.
    archive = "data_23_25.zip"

    df = load_data(archive, "nehody")
    df_consequences = load_data(archive, "nasledky")
    df_vehicles = load_data(archive, "Vozidla")

    df2 = parse_data(df, verbose=True)

    plot_state(df2, df_vehicles, fig_location="01_state.png")
    plot_alcohol(df2, df_consequences, fig_location="02_alcohol.png")
    plot_conditions(df2, fig_location="03_conditions.png")

# Note:
# To avoid reloading the data over and over you can, for example, use
# VS Code and mark the code as cells (a line starting with #%%).
# The data can then be loaded once while the individual functions are
# being debugged. Alternatively, store the resulting DataFrame somewhere
# on disk (for debugging purposes) and load it already parsed from there.