--- title: "Hausaufgabe 3 zur Vorlesung 'Programmierung für Data Science'" author: "VORNAME NACHNAME (MATRIKELNUMMER)" format: html execute: warning: false message: false --- ## Daten Der folgende Startcode ist bereitgestellt und muss nicht selbst entwickelt werden. Er lädt die Saison-Datei und erzeugt das Long-Format. ```{r setup} #| echo: true #| code-fold: true #| code-summary: "Startcode anzeigen" library(ggplot2) data_candidates <- c( "bundesliga_2024_25.csv", "static/bundesliga_2024_25.csv" ) data_file <- data_candidates[file.exists(data_candidates)][1] if (length(data_file) == 0 || is.na(data_file)) { stop("Die Datei 'bundesliga_2024_25.csv' wurde nicht gefunden.") } matches <- read.csv(data_file, stringsAsFactors = FALSE) matches$date <- as.Date(matches$date) matches$match_id <- seq_len(nrow(matches)) matches_long <- rbind( data.frame( match_id = matches$match_id, matchday = matches$matchday, team = matches$home_team, opponent = matches$away_team, home = 1L, goals = matches$home_goals, stringsAsFactors = FALSE ), data.frame( match_id = matches$match_id, matchday = matches$matchday, team = matches$away_team, opponent = matches$home_team, home = 0L, goals = matches$away_goals, stringsAsFactors = FALSE ) ) matches_long$team <- factor(matches_long$team) matches_long$opponent <- factor(matches_long$opponent) ``` ## Teil 1: Modell Beschreiben Sie hier kurz, wie Sie das Poisson-Modell schätzen und was es grundsätzlich modelliert. ```{r fit-model} #| echo: true fit_goals <- glm( goals ~ home + team + opponent, family = poisson(link = "log"), data = matches_long ) summary(fit_goals) ``` ## Teil 2: Erwartete Tore Berechnen Sie hier `lambda_home` und `lambda_away` für alle Saisonspiele und interpretieren Sie 2 bis 3 konkrete Begegnungen kurz. ```{r expected-goals} #| echo: true #| code-fold: true #| code-summary: "Code zur Berechnung der erwarteten Tore anzeigen" make_fixtures_with_lambda <- function(fit_goals, matches) { fixtures_with_lambda <- matches[ , c("matchday", "date", "home_team", "away_team") ] team_levels <- fit_goals$xlevels$team opponent_levels <- fit_goals$xlevels$opponent newdata_home <- data.frame( home = 1L, team = factor(matches$home_team, levels = team_levels), opponent = factor(matches$away_team, levels = opponent_levels) ) newdata_away <- data.frame( home = 0L, team = factor(matches$away_team, levels = team_levels), opponent = factor(matches$home_team, levels = opponent_levels) ) fixtures_with_lambda$lambda_home <- predict( fit_goals, newdata = newdata_home, type = "response" ) fixtures_with_lambda$lambda_away <- predict( fit_goals, newdata = newdata_away, type = "response" ) return(fixtures_with_lambda) } fixtures_with_lambda <- make_fixtures_with_lambda(fit_goals, matches) head(fixtures_with_lambda, 10) ``` ## Teil 3: Saison-Simulation Erstellen Sie hier eine Funktion, die eine komplette Saison einmal simuliert. Wiederholen Sie die Saison danach mindestens 1000 Mal. ```{r simulate-season} #| echo: true #| code-fold: true #| code-summary: "Code zur Saison-Simulation anzeigen" build_league_table <- function(results_df) { teams <- sort(unique(c(results_df$home_team, results_df$away_team))) table_df <- data.frame( team = teams, played = 0L, wins = 0L, draws = 0L, losses = 0L, goals_for = 0L, goals_against = 0L, goal_diff = 0L, points = 0L, stringsAsFactors = FALSE ) for (i in seq_len(nrow(results_df))) { home_team <- results_df$home_team[i] away_team <- results_df$away_team[i] home_goals <- results_df$sim_home_goals[i] away_goals <- results_df$sim_away_goals[i] home_idx <- match(home_team, table_df$team) away_idx <- match(away_team, table_df$team) table_df$played[home_idx] <- table_df$played[home_idx] + 1L table_df$played[away_idx] <- table_df$played[away_idx] + 1L table_df$goals_for[home_idx] <- table_df$goals_for[home_idx] + home_goals table_df$goals_against[home_idx] <- table_df$goals_against[home_idx] + away_goals table_df$goals_for[away_idx] <- table_df$goals_for[away_idx] + away_goals table_df$goals_against[away_idx] <- table_df$goals_against[away_idx] + home_goals if (home_goals > away_goals) { table_df$wins[home_idx] <- table_df$wins[home_idx] + 1L table_df$losses[away_idx] <- table_df$losses[away_idx] + 1L table_df$points[home_idx] <- table_df$points[home_idx] + 3L } else if (home_goals < away_goals) { table_df$wins[away_idx] <- table_df$wins[away_idx] + 1L table_df$losses[home_idx] <- table_df$losses[home_idx] + 1L table_df$points[away_idx] <- table_df$points[away_idx] + 3L } else { table_df$draws[home_idx] <- table_df$draws[home_idx] + 1L table_df$draws[away_idx] <- table_df$draws[away_idx] + 1L table_df$points[home_idx] <- table_df$points[home_idx] + 1L table_df$points[away_idx] <- table_df$points[away_idx] + 1L } } table_df$goal_diff <- table_df$goals_for - table_df$goals_against table_df <- table_df[ order( -table_df$points, -table_df$goal_diff, -table_df$goals_for, table_df$team ), ] table_df$position <- seq_len(nrow(table_df)) rownames(table_df) <- NULL table_df <- table_df[ , c("position", "team", "points", "goal_diff") ] return(table_df) } simulate_season <- function(fixtures_with_lambda) { results_df <- fixtures_with_lambda results_df$sim_home_goals <- rpois( n = nrow(results_df), lambda = results_df$lambda_home ) results_df$sim_away_goals <- rpois( n = nrow(results_df), lambda = results_df$lambda_away ) return(results_df) } set.seed(123) n_sim <- 1000L # Idee: # - eine Matrix fuer die Punkte anlegen # - eine zweite Matrix fuer die Tabellenpositionen anlegen # - pro Simulation die Abschlusstabelle berechnen # - Punkte und Positionen teamweise in die Matrizen eintragen teams <- sort(unique(fixtures_with_lambda$home_team)) n_teams <- length(teams) points_mat <- matrix(NA_real_, nrow = n_sim, ncol = n_teams) position_mat <- matrix(NA_integer_, nrow = n_sim, ncol = n_teams) colnames(points_mat) <- teams colnames(position_mat) <- teams for (sim in seq_len(n_sim)) { simulated_results <- simulate_season(fixtures_with_lambda) season_table <- build_league_table(simulated_results) points_mat[sim, season_table$team] <- season_table$points position_mat[sim, season_table$team] <- season_table$position } points_mat[1:6, 1:5] ``` ## Teil 4: Auswertung und Visualisierung Werten Sie die Simulationsergebnisse aus, erstellen Sie die beiden geforderten Visualisierungen und interpretieren Sie die wichtigsten Resultate. ```{r analysis} #| echo: true #| code-fold: true #| code-summary: "Code zur Auswertung anzeigen" champion_probability <- colMeans(position_mat == 1) mean_points <- colMeans(points_mat) summary_table <- data.frame( team = names(champion_probability), champion_probability = as.numeric(champion_probability), mean_points = as.numeric(mean_points), row.names = NULL, check.names = FALSE ) summary_table <- summary_table[order(-summary_table$champion_probability), ] summary_table ``` ```{r plot-champion-prob} #| echo: true ggplot(summary_table, aes( x = reorder(team, champion_probability), y = champion_probability )) + geom_col(fill = "#1b6ca8") + coord_flip() + labs( x = "Team", y = "Meisterwahrscheinlichkeit", title = "Geschätzte Meisterwahrscheinlichkeit aus 1000 Simulationen" ) ``` ```{r plot-team-points} #| echo: true team_order <- summary_table$team points_df <- data.frame(points_mat, check.names = FALSE) points_long <- stack(points_df) names(points_long) <- c("points", "team") ggplot( transform(points_long, team = factor(as.character(team), levels = rev(team_order))), aes(x = team, y = points) ) + geom_boxplot(fill = "#c16622", outlier.alpha = 0.25) + coord_flip() + labs( x = "Team", y = "Simulierte Punkte", title = "Boxplots der simulierten Punkteverteilungen je Team" ) ``` Beschreiben Sie hier kurz: - die wichtigsten Ergebnisse, - was die beiden Visualisierungen zeigen, - welche Grenzen das verwendete Modell hat.