#!/usr/bin/env python3
# coding=utf-8
"""
Author: Roman Nečas (xnecasr00)
Date: 8.11.2025

Traffic Accident Data Analysis Module.

This module provides functions for loading, parsing, and visualizing
Czech traffic accident data from 2023-2025.

Functions:
    load_data: Load data from ZIP archive
    parse_data: Parse and clean accident data
    plot_state: Visualize accidents by driver state
    plot_alcohol: Visualize alcohol-related accidents by consequences
    plot_conditions: Visualize accidents over time by weather conditions
"""
|
||
from matplotlib import pyplot as plt
|
||
import pandas as pd
|
||
import seaborn as sns
|
||
import numpy as np
|
||
import zipfile
|
||
|
||
# muzete pridat libovolnou zakladni knihovnu ci knihovnu predstavenou na prednaskach
|
||
# dalsi knihovny pak na dotaz
|
||
|
||
# Ukol 1: nacteni dat ze ZIP souboru
|
||
|
||
|
||
def load_data(filename: str, ds: str, *,
              years: tuple = ("2023", "2024", "2025")) -> pd.DataFrame:
    """
    Load one traffic accident dataset from a ZIP archive and stack all years.

    Every requested year is read from the archive member
    ``<year>/I<ds>.xls``. Despite the extension, each member is parsed as an
    HTML document encoded in cp1250, and only its first table is used.

    Args:
        filename: Path to ZIP file containing data
        ds: Dataset name without 'I' prefix (e.g., 'nehody', 'Vozidla', 'nasledky')
        years: Year folders inside the archive, in the order in which their
            tables are concatenated. Defaults to 2023, 2024 and 2025.

    Returns:
        DataFrame containing the concatenated data from all requested years,
        with a fresh 0..n-1 index and without columns whose label starts
        with "Unnamed".

    Raises:
        KeyError: If a ``<year>/I<ds>.xls`` member is missing in the archive.
        ValueError: If ``years`` is empty (nothing to concatenate).
    """
    member_name = "I" + ds
    yearly_frames = []

    with zipfile.ZipFile(filename, "r") as archive:
        for year in years:
            with archive.open(f"{year}/{member_name}.xls") as handle:
                # read_html returns a list of tables; the data is the first one
                yearly_frames.append(pd.read_html(handle, encoding="cp1250")[0])

    result = pd.concat(yearly_frames, ignore_index=True)

    # Compare via str() so that non-string column labels (e.g. integers)
    # cannot break the filter the way the `.str` accessor would.
    keep = [not str(label).startswith("Unnamed") for label in result.columns]
    return result.loc[:, keep]
|
||
|
||
# Ukol 2: zpracovani dat
|
||
|
||
|
||
def parse_data(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """
    Derive the date and region columns and drop repeated accident IDs.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Raw DataFrame from load_data(); must contain the columns
            'p1' (accident ID), 'p2a' (date text, DD.MM.YYYY) and
            'p4a' (numeric region code)
        verbose: If True, print the deep memory footprint of the result
            as ``new_size=<X> MB`` (1 MB = 10^6 bytes)

    Returns:
        Copy of ``df`` with a datetime 'date' column and a categorical
        'region' column added, keeping only the first row for each 'p1'.
        Region codes missing from the mapping become NaN.
    """
    code_to_region = {
        0: "PHA",
        1: "STC",
        2: "JHC",
        3: "PLK",
        4: "ULK",
        5: "HKK",
        6: "JHM",
        7: "MSK",
        14: "OLK",
        15: "ZLK",
        16: "VYS",
        17: "PAK",
        18: "LBK",
        19: "KVK",
    }

    # assign() works on a copy, so the caller's frame is never modified.
    # The region column holds few distinct strings, hence the category dtype.
    parsed = df.assign(
        date=pd.to_datetime(df['p2a'], format='%d.%m.%Y'),
        region=df['p4a'].map(code_to_region).astype('category'),
    ).drop_duplicates(subset=['p1'])

    if not verbose:
        return parsed

    size_mb = parsed.memory_usage(deep=True).sum() / 1_000_000
    print(f"new_size={size_mb:.1f} MB")
    return parsed
|
||
|
||
|
||
# Ukol 3: počty nehod v jednotlivých regionech podle stavu řidiče
|
||
def plot_state(df: pd.DataFrame, df_vehicles: pd.DataFrame, fig_location: str = None,
               show_figure: bool = False):
    """
    Plot accident counts by driver state across regions.

    The accidents are joined with the vehicle records on 'p1', restricted to
    driver-state codes 3-9 ('p57'), and counted per region and state. Six of
    the seven states are drawn as bar charts in a 3 x 2 grid; "Jiné" (code 9)
    is left out. Counts are rows of the joined table, so if df_vehicles holds
    several matching rows for one accident, each of them is counted.

    Args:
        df: Parsed DataFrame from parse_data() (needs 'p1' and 'region')
        df_vehicles: DataFrame with vehicle information (needs 'p1' and 'p57')
        fig_location: Path to save figure; nothing is saved when empty/None
        show_figure: Whether to display the figure; when False the figure
            is closed before returning
    """
    state_labels = {
        3: "Nemoc, únava",
        4: "Pod vlivem léků",
        5: "Invalida",
        6: "Řidič při jiné činnosti",
        7: "Řidič se nevěnoval řízení",
        8: "Náhlá fyzická indispozice",
        9: "Jiné",
    }
    # One subplot per state, in code order; code 9 is dropped so that the
    # remaining six states fill the 3 x 2 grid exactly.
    plotted_states = [label for code, label in state_labels.items() if code != 9]

    # Only p1 and p57 are taken from the vehicle table to keep the join small.
    merged = df.merge(df_vehicles[['p1', 'p57']], on='p1', how='left')

    # .copy() detaches the filtered rows, so adding a column below does not
    # write into a slice of `merged` (avoids pandas' SettingWithCopyWarning).
    merged = merged[merged['p57'].isin(list(state_labels))].copy()
    merged['driver_state'] = merged['p57'].map(state_labels).astype('category')

    counts = (
        merged.groupby(['region', 'driver_state'], observed=True)
        .size()
        .reset_index(name='count')
    )
    counts = counts[counts['driver_state'].isin(plotted_states)]

    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 12), constrained_layout=True)
    background_color = '#f0f0f0'  # Light gray background

    for ax, state in zip(axes.flatten(), plotted_states):
        # Regions ordered from most to fewest records for this state
        state_data = counts[counts['driver_state'] == state]
        state_data = state_data.sort_values('count', ascending=False)

        ax.bar(state_data['region'], state_data['count'],
               color='#5b8db8',  # Steel blue, neutral bar colour
               alpha=0.8)

        ax.set_facecolor(background_color)
        ax.set_title(f'Stav řidiče: {state}', fontsize=11, fontweight='bold')
        ax.set_xlabel('Kraj', fontsize=9)
        ax.set_ylabel('Počet nehod', fontsize=9)

        ax.tick_params(axis='x', rotation=45, labelsize=8)
        ax.tick_params(axis='y', labelsize=8)

        # Horizontal grid lines drawn behind the bars
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    if fig_location:
        # Save through the figure object itself rather than pyplot's
        # "current figure", which could be a different figure.
        fig.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(fig)
|
||
|
||
|
||
# Ukol 4: alkohol a roky v krajích
|
||
def plot_alcohol(df: pd.DataFrame, df_consequences: pd.DataFrame,
                 fig_location: str = None, show_figure: bool = False):
    """
    Plot accident consequences with alcohol involvement by region and year.

    Accidents are inner-joined with their consequences on 'p1' (one accident
    may have several consequence rows). Rows with p11 >= 3 (treated here as
    alcohol involvement) from months 1-10 are kept. The consequence code
    'p59g' is mapped to an injury label and the rows are counted per region,
    year and injury type. One bar chart is drawn per injury type, with
    regions on the x-axis and one bar colour per year.

    Args:
        df: Parsed DataFrame from parse_data() (needs 'p1', 'p11', 'date',
            'region')
        df_consequences: DataFrame with consequence information (needs 'p1'
            and 'p59g')
        fig_location: Path to save figure; nothing is saved when empty/None
        show_figure: Whether to display the figure; when False the figure
            is closed before returning
    """
    injury_map = {
        1: "Usmrcení",
        2: "Těžké zranění",
        3: "Lehké zranění",
        4: "Bez zranění",
    }

    # Inner join: accidents without any consequence row are dropped.
    merged = df.merge(df_consequences, on='p1', how='inner')

    # Keep alcohol-related rows from January-October only.
    # NOTE(review): presumably November/December are excluded because the
    # last year is incomplete - confirm against the assignment.
    wanted = (merged['p11'] >= 3) & (merged['date'].dt.month <= 10)

    # .copy() detaches the filtered rows, so adding columns below does not
    # write into a slice of `merged` (avoids pandas' SettingWithCopyWarning).
    merged = merged[wanted].copy()
    merged['year'] = merged['date'].dt.year
    merged['injury_type'] = merged['p59g'].map(injury_map).astype('category')

    # Codes outside injury_map became NaN above and are discarded here.
    merged = merged.dropna(subset=['injury_type'])

    plot_data = (
        merged.groupby(['region', 'year', 'injury_type'], observed=True)
        .size()
        .reset_index(name='count')
    )

    g = sns.catplot(
        data=plot_data,
        x='region',
        y='count',
        hue='year',
        col='injury_type',
        kind='bar',
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        palette='Set2',
        sharex=True,
        sharey=False  # Independent y-axis scaling for each subplot
    )

    g.set_axis_labels("Kraj", "Počet následků")
    g.set_titles("Následky nehody: {col_name}")

    background_color = '#f0f0f0'  # Light gray background
    for ax in g.axes.flat:
        # labelbottom=True shows region labels on every subplot, not only
        # on the bottom row of the shared x-axis
        ax.tick_params(axis='x', rotation=45, labelbottom=True)
        ax.set_facecolor(background_color)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Move the legend to the right of the plots through seaborn's public
    # helper instead of the private `_legend` attribute; skipped when the
    # grid has no legend at all.
    if g.legend is not None:
        sns.move_legend(g, 'center left', bbox_to_anchor=(1.05, 0.5), title='Rok')

    g.figure.tight_layout()

    if fig_location:
        # Save the grid's own figure rather than pyplot's "current figure"
        g.figure.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(g.figure)
|
||
|
||
|
||
# Ukol 5: Podmínky v čase
|
||
def plot_conditions(df: pd.DataFrame, fig_location: str = None,
                    show_figure: bool = False):
    """
    Plot monthly accident counts by condition for four selected regions.

    Accidents from the regions JHM, MSK, OLK and ZLK are labelled by their
    'p11' code, counted per day, region and condition, summed to monthly
    totals per region, and drawn as one line chart per region with one line
    per condition. The x-axis is limited to 2023-01-01 .. 2025-01-01.

    Args:
        df: Parsed DataFrame from parse_data() (needs 'p1', 'p11', 'date',
            'region')
        fig_location: Path to save figure; nothing is saved when empty/None
        show_figure: Whether to display the figure; when False the figure
            is closed before returning
    """
    selected_regions = ['JHM', 'MSK', 'OLK', 'ZLK']

    # NOTE(review): plot_alcohol in this file treats p11 >= 3 as alcohol
    # involvement, while this map labels p11 codes 6-9 as weather - confirm
    # that p11 is the right column and that these labels match the data
    # documentation.
    condition_map = {
        0: "jiné",
        1: "na počátku deště",
        2: "mlha",
        3: "alkohol 1 ‰ a více",
        4: "alkohol do 0,99 ‰",
        5: "pod vlivem drog",
        6: "náledí",
        7: "nárazový vítr",
        8: "sněžení",
        9: "déšť",
    }

    df_filtered = df[df['region'].isin(selected_regions)].copy()
    df_filtered['condition'] = df_filtered['p11'].map(condition_map).astype('category')

    # Codes outside condition_map became NaN above and are discarded here.
    df_filtered = df_filtered.dropna(subset=['condition'])

    # Wide table: one row per (date, region), one column per condition,
    # each cell holding the number of accidents ('p1' values) that day.
    pivot = df_filtered.pivot_table(
        values='p1',
        index=['date', 'region'],
        columns='condition',
        aggfunc='count',
        fill_value=0,
        observed=True
    )
    # Plain string labels: older pandas releases can refuse to add the
    # 'date'/'region' labels to a categorical column index in reset_index().
    pivot.columns = pivot.columns.astype(str)
    pivot = pivot.reset_index()

    # Each region is resampled on its own so that months without any
    # accident inside its date range appear as explicit zero rows.
    monthly_frames = []
    for region in selected_regions:
        per_day = (
            pivot.loc[pivot['region'] == region]
            .drop(columns=['region'])
            .set_index('date')
        )
        per_month = per_day.resample('MS').sum()  # 'MS' = month start
        per_month['region'] = region
        monthly_frames.append(per_month.reset_index())

    monthly_data = pd.concat(monthly_frames, ignore_index=True)

    # Long format for seaborn: one row per (date, region, condition)
    plot_data = monthly_data.melt(
        id_vars=['date', 'region'],
        var_name='Podmínky',
        value_name='Počet nehod'
    )

    g = sns.relplot(
        data=plot_data,
        x='date',
        y='Počet nehod',
        hue='Podmínky',
        col='region',
        kind='line',
        col_wrap=2,
        height=4.5,
        aspect=1.5,
        facet_kws={'sharex': True, 'sharey': False},
        legend='brief'
    )

    g.set_axis_labels("Datum", "Počet nehod")
    g.set_titles("Kraj: {col_name}")

    background_color = '#f0f0f0'  # Light gray background
    for ax in g.axes.flat:
        # NOTE(review): this upper limit hides everything after 2025-01-01
        # although the module loads 2025 data - confirm this is intended.
        ax.set_xlim(pd.Timestamp('2023-01-01'), pd.Timestamp('2025-01-01'))
        ax.tick_params(axis='x', rotation=45, labelbottom=True)
        ax.set_facecolor(background_color)
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        ax.set_axisbelow(True)

    # Move the legend to the right of the plots through seaborn's public
    # helper instead of the private `_legend` attribute; skipped when the
    # grid has no legend at all.
    if g.legend is not None:
        sns.move_legend(g, 'center left', bbox_to_anchor=(1.05, 0.5), title='Podmínky')

    g.figure.tight_layout()

    if fig_location:
        # Save the grid's own figure rather than pyplot's "current figure"
        g.figure.savefig(fig_location, dpi=100, bbox_inches='tight')

    if show_figure:
        plt.show()
    else:
        plt.close(g.figure)
|
||
|
||
|
||
if __name__ == "__main__":
    # Usage example; feel free to modify this part. During testing the
    # script is not run directly - the individual functions are called.
    archive = "data_23_25.zip"

    df = load_data(archive, "nehody")
    df_consequences = load_data(archive, "nasledky")
    df_vehicles = load_data(archive, "Vozidla")
    df2 = parse_data(df, True)

    plot_state(df2, df_vehicles, "01_state.png")
    plot_alcohol(df2, df_consequences, "02_alcohol.png")
    plot_conditions(df2, "03_conditions.png")

    # Note:
    # To avoid reloading the data over and over, you can use e.g. VS Code
    # and mark parts of the file as cells (a "#%%" line). The data is then
    # loaded once and the individual functions can be debugged separately.
    # Alternatively, store the resulting dataframe somewhere on disk (for
    # debugging purposes) and load the already parsed version from there.
|