Will follow up soon, this is wrong

2025-02-03 23:54:12 +01:00 · 2025-02-03 23:54:12 +01:00 · 8e60517251
commit 8e60517251
parent 3c399244ef
6 changed files with 469 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/README.md
+++ b/README.md
@ -1,3 +1,5 @@
 # statistical_analyses

-Random statistical analyses
+Random statistical analyses
+
+Basically an excuse to learn Grafana and R.
--- a/probable_teams/.gitignore
+++ b/probable_teams/.gitignore
@ -0,0 +1,2 @@
+*.csv
+*.json
--- a/probable_teams/GSA.py
+++ b/probable_teams/GSA.py
@ -0,0 +1,94 @@
+import csv
+import json
+from collections import defaultdict, deque
+from tqdm import tqdm
+
+def parse_students(file_path: str) -> dict:
+    """
+    Parse the CSV file and return a dictionary with student preferences.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        reader = csv.DictReader(file)
+        data = {}
+        for row in reader:
+            data[row['Name'].strip()] = [int(row[f'P{i}']) for i in range(1, 6) if row[f'P{i}'].isdigit()]
+    return data
+
+def parse_projects(file_path: str, team_size: int) -> dict:
+    """
+    Parse the projects file and initialize project capacities.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        reader = csv.DictReader(file)
+        data = {}
+        for row in reader:
+            proj_id = int(row['proj ID'])
+            data[proj_id] = {
+                'name': row['Project name'].replace(".pdf", "").strip(),
+                'capacity': team_size,
+                'students': []  # Will store assigned students
+            }
+    return data
+
+def stable_marriage(students: dict, projects: dict) -> dict:
+    """
+    Gale-Shapley algorithm to assign students to projects fairly, ensuring all students are included.
+    """
+    free_students = deque(students.keys())  # Students who haven't been assigned
+    student_next_choice = {student: 0 for student in students}  # Track which project each student is trying next
+    student_assigned = {student: None for student in students}  # Track which project each student is assigned to
+
+    while free_students:
+        student = free_students.popleft()
+        preferences = students[student]
+        next_choice_index = student_next_choice[student]
+        
+        if next_choice_index < len(preferences):
+            proj_id = preferences[next_choice_index]
+            student_next_choice[student] += 1  # Move to the next project in the list
+
+            if proj_id in projects:
+                project = projects[proj_id]
+
+                # If the project has space, add the student
+                if len(project['students']) < project['capacity']:
+                    project['students'].append(student)
+                    student_assigned[student] = proj_id  # Mark the student as assigned
+                else:
+                    # If the project is full, find the least preferred student in the project list
+                    worst_student = min(project['students'], key=lambda s: students[s].index(proj_id))
+                    worst_student_index = project['students'].index(worst_student)
+                    project['students'][worst_student_index] = student
+                    student_assigned[worst_student] = None  # The worst student is now free
+                    free_students.append(worst_student)  # The removed student is free again
+                    student_assigned[student] = proj_id  # Mark the current student as assigned
+
+        # Ensure students are only re-added if they are unassigned
+        if student_assigned[student] is None and student_next_choice[student] < len(preferences):
+            free_students.append(student)
+
+    return {proj_id: proj['students'] for proj_id, proj in projects.items()}
+
+def save_teams_to_json(teams: dict, projects: dict, output_file: str):
+    """
+    Save the team assignments to a JSON file.
+    """
+    teams_with_names = {f"{proj_id}: {projects[proj_id]['name']}": students for proj_id, students in teams.items()}
+    with open(output_file, 'w', encoding='utf-8') as file:
+        json.dump(teams_with_names, file, indent=4, ensure_ascii=False)
+
+def main():
+    student_file = 'data.csv'
+    project_file = 'projects.csv'
+    
+    students = parse_students(student_file)
+    
+    for team_size in range(3, 6):
+        projects = parse_projects(project_file, team_size)
+        print(f"Assigning teams with size {team_size}...")
+        teams = stable_marriage(students, projects)
+        save_teams_to_json(teams, projects, f'transformed/gsa_assigned_teams_{team_size}.json')
+    
+
+if __name__ == '__main__':
+    main()
--- a/probable_teams/README.Rmd
+++ b/probable_teams/README.Rmd
@ -0,0 +1,243 @@
+## Probable Teams
+I have a course called **Software Engineering**, which involves students picking a project proposed by external companies. We have access to a Google Spreadsheet, which lists all of the students and their preferences in the format:
+
+```csv
+Student,Preference 1,Preference 2,Preference 3, Preference 4, Preference 5
+Alice,1,2,3,4,5
+```
+
+Where the numbers represent the project IDs. The projects are listed in a separate Google Spreadsheet, which lists the project ID, the company name, and the project name.
+
+I will **not** be sharing the Google Spreadsheet with you(for obvious reasons), so take my word for it.
+
+
+
+### Normalized weights
+
+While thinking about how to figure out which students will be in which team, I realized that there's exists a concept which deals with this exact problem: [The Stable Marriage Problem](https://en.wikipedia.org/wiki/Stable_marriage_problem).
+
+The Stable Marriage Problem is a problem where there are `n` men and `n` women, and each person has a preference list of the opposite. The goal is to find a stable matching.
+
+Unfortunately, it is unknown how many students will be in a team, so we can't use the Stable Marriage Problem directly. However, we can use the concept of the Stable Marriage Problem to create a heuristic to solve this problem.
+
+### Heuristic
+
+1. For each student, assign them to their first preference.
+2. For each student, if their first preference is full, assign them to their second preference.
+3. Repeat step 2 until all students are assigned to a team.
+4. If a student has no more preferences left, assign them to a random team.
+5. If a team is full, assign the student to a random team.
+6. If a student is assigned to a team, remove them from the list of students.
+7. Repeat steps 3 to 7 until all students are assigned to a team.
+
+
+This is basically just a greedy algorithm.
+
+### GSA (Gale-Shapley Algorithm)
+
+The Gale-Shapley Algorithm is an algorithm that solves the Stable Marriage Problem. It is a heuristic that is guaranteed to find a stable matching. The idea is to have a list of free agents, and each free agent proposes to their most preferred option. The other party can either accept or reject the proposal. If they reject the proposal, they propose to their next most preferred option. This continues until all free agents are matched.
+
+We can use the Gale-Shapley Algorithm to solve this problem. The students will be the free agents, and the projects will be the other party.
+
+It might seem like the Gale-Shapley Algorithm is basically the same as the greedy approach, but it is not. Let's consider the following example:
+
+```csv
+Student,Preference 1,Preference 2,Preference 3, Preference 4, Preference 5
+Alice,1,2,3,4,5
+Bob,1,2,3,4,5
+Charlie,1,2,3,4,5
+```
+
+If we use the heuristic, Alice, Bob, and Charlie will all be assigned to the first project. However, if we use the Gale-Shapley Algorithm, Alice will be assigned to the first project, Bob will be assigned to the second project, and Charlie will be assigned to the third project. This is because the Gale-Shapley Algorithm is guaranteed to find a stable matching, while the heuristic is not.
+
+This, however may lead to a situation where a student is assigned to a project that is not in their top 5 preferences. This is a trade-off between a stable matching and the students' preferences.
+
+
+## Statistical analysis
+
+I will compare the results of the heuristic and the Gale-Shapley Algorithm to see which one is better. Exploration will be done for a team capacity $\in \{3,4,5\}$, which seemed like the most realistic range of team sizes.
+
+Loading the data:
+
+```{r, echo=FALSE, message=FALSE}
+# Load the required libraries
+library(ggplot2)
+library(dplyr)
+library(jsonlite)
+library(tidyr)
+
+# Load the previously assigned teams data
+greedy_3 <- fromJSON("transformed/greedy_assigned_teams_3.json")
+greedy_4 <- fromJSON("transformed/greedy_assigned_teams_4.json")
+greedy_5 <- fromJSON("transformed/greedy_assigned_teams_5.json")
+
+gsa_3 <- fromJSON("transformed/gsa_assigned_teams_3.json")
+gsa_4 <- fromJSON("transformed/gsa_assigned_teams_4.json")
+gsa_5 <- fromJSON("transformed/gsa_assigned_teams_5.json")
+
+# Function to convert JSON data to a tidy data frame for analysis
+convert_to_df <- function(team_data, team_size) {
+  team_list <- data.frame()
+  
+  for (proj_name in names(team_data)) {
+    proj_id <- as.numeric(strsplit(proj_name, ":")[[1]][1])
+    students <- team_data[[proj_name]]
+    
+    team_list <- rbind(team_list, data.frame(Project_ID = proj_id, Student = students, Team_Size = team_size))
+  }
+  
+  return(team_list)
+}
+
+# Convert the JSON data into data frames for easy analysis
+greedy_3_df <- convert_to_df(greedy_3, 3)
+greedy_4_df <- convert_to_df(greedy_4, 4)
+greedy_5_df <- convert_to_df(greedy_5, 5)
+
+gsa_3_df <- convert_to_df(gsa_3, 3)
+gsa_4_df <- convert_to_df(gsa_4, 4)
+gsa_5_df <- convert_to_df(gsa_5, 5)
+
+
+
+```
+
+We'll now measure how well the students' preferences align with the projects they were assigned to. One way to measure preference alignment is to calculate the average ranking of the assigned project from the student's preference list. Lower rankings indicate better alignment (i.e., students are assigned to their higher-ranked projects).
+
+```{r, echo=FALSE}
+# Function to calculate the average preference rank for each student in a given assignment
+calculate_preference_score <- function(assignment_df, student_preferences) {
+  # Merge the assignment data with student preferences
+  assignment_df <- merge(assignment_df, student_preferences, by.x = "Student", by.y = "Name")
+  
+  # Calculate the average rank of the assigned project for each student
+  assignment_df$Preference_Score <- apply(assignment_df, 1, function(row) {
+    project_id <- row["Project_ID"]
+    student_prefs <- row[c("P1", "P2", "P3", "P4", "P5")]
+    
+    # Find the rank of the assigned project in the student's preference list (1 is highest preference)
+    rank <- which(student_prefs == project_id)
+    
+    if (length(rank) == 0) {
+      return(NA)  # If the project is not in the preference list, return NA
+    } else {
+      return(rank)
+    }
+  })
+  
+  # Return the average preference score
+  return(mean(assignment_df$Preference_Score, na.rm = TRUE))  # Ignore NAs
+}
+
+# Load student preferences
+#  CSV format: Name,ID,Email,P1,P2,P3,P4,P5 (where P1 to P5 are project IDs and P1 is the most preferred)
+student_preferences <- read.csv("data.csv", stringsAsFactors = FALSE)
+student_preferences <- student_preferences[, c("Name", "P1", "P2", "P3", "P4", "P5")]
+
+# Calculate preference alignment for both methods and all team sizes
+greedy_3_score <- calculate_preference_score(greedy_3_df, student_preferences)
+greedy_4_score <- calculate_preference_score(greedy_4_df, student_preferences)
+greedy_5_score <- calculate_preference_score(greedy_5_df, student_preferences)
+
+gsa_3_score <- calculate_preference_score(gsa_3_df, student_preferences)
+gsa_4_score <- calculate_preference_score(gsa_4_df, student_preferences)
+gsa_5_score <- calculate_preference_score(gsa_5_df, student_preferences)
+
+# To visualize:
+preference_scores <- data.frame(
+  Method = rep(c("Greedy", "GSA"), each = 3),
+  Team_Size = rep(c(3, 4, 5), 2),
+  Preference_Score = c(greedy_3_score, greedy_4_score, greedy_5_score,
+                       gsa_3_score, gsa_4_score, gsa_5_score)
+)
+
+# Plot preference alignment
+ggplot(preference_scores, aes(x = Team_Size, y = Preference_Score, color = Method, group = Method)) +
+  geom_line() +
+  geom_point() +
+  labs(title = "Preference Alignment between Greedy and GSA", 
+       x = "Team Size", y = "Average Project Rank", color = "Method") +
+  theme_minimal()
+
+```
+
+We can clearly see that the Gale-Shapley Algorithm(GSA) outperforms the greedy approach in terms of preference alignment. This is expected, as the Gale-Shapley Algorithm is designed to find a stable matching that is optimal for both parties.
+
+Visualizing:
+
+```{r, echo=FALSE}
+# Load necessary library for visualization
+library(ggplot2)
+
+# Assuming preference_scores is the data frame with your results
+ggplot(preference_scores, aes(x = Method, y = Preference_Score, fill = Method)) +
+  geom_boxplot() +
+  labs(title = "Distribution of Preference Scores for Greedy and GSA",
+       x = "Method", y = "Preference Score (Average Rank)") +
+  theme_minimal()
+
+# Alternatively, a density plot for comparing distributions
+ggplot(preference_scores, aes(x = Preference_Score, fill = Method, color = Method)) +
+  geom_density(alpha = 0.5) +
+  labs(title = "Density Plot of Preference Scores for Greedy and GSA",
+       x = "Preference Score (Average Rank)", y = "Density") +
+  theme_minimal()
+```
+
+Again, we can see that the Gale-Shapley Algorithm(GSA) has a better distribution of preference scores compared to the greedy approach.
+
+We do not have enough data to perform a statistical test (as the greedy approach introduces randomness and needs to be run many many times), but the visualizations clearly show that the Gale-Shapley Algorithm(GSA) is better at aligning students' preferences with the projects they are assigned to.
+
+
+## Example: Ata Kircadag
+With all of that out of the way, let's take a specific student and see which groups they were assigned to using both the greedy approach and the Gale-Shapley Algorithm(GSA).
+
+Let's take the student with the name "Ata Kircadag" and see which groups they were assigned to using both methods.
+
+```{r, echo=FALSE}
+## Function to get the assigned teams for a specific student
+get_student_teams <- function(student_name, assignment_df) {
+  # Ensure we have a 'Student' column in the assignment dataframe
+  assignment_df <- assignment_df[assignment_df$Student == student_name, ]
+  return(assignment_df)
+}
+
+# Combine all greedy and GSA data frames
+greedy_combined_df <- rbind(greedy_3_df, greedy_4_df, greedy_5_df)
+gsa_combined_df <- rbind(gsa_3_df, gsa_4_df, gsa_5_df)
+
+# Get the assigned teams for the student "Ata Kircadag"
+student_name <- "Ata Kircadag"
+greedy_teams <- get_student_teams(student_name, greedy_combined_df)
+gsa_teams <- get_student_teams(student_name, gsa_combined_df)
+
+# Print the results
+print(paste("Greedy Approach Teams for", student_name))
+print(greedy_teams)
+
+print(paste("Gale-Shapley Algorithm Teams for", student_name))
+print(gsa_teams)
+
+
+```
+
+From the results, we can see that the student "Ata Kircadag" was assigned to different teams using the greedy approach and the Gale-Shapley Algorithm(GSA). This is expected, as the Gale-Shapley Algorithm is designed to find a stable matching that is optimal for all parties.
+
+Oddly, the student is not present in a 3 person team in the GS algorithm, which is a bit strange. This could be due to the fact that the student's preferences were not aligned with the available projects in the 3 person team.
+
+Well, we've come to the end of this analysis. Until the official results are out, we can't say for sure which method is better. But based on the statistical analysis, the Gale-Shapley Algorithm(GSA) seems to be more consistent (as it does not rely on randomness) and somewhat aligns better with the students' preferences.
+
+## Is the Gale-Shapley Algorithm(GSA) an appropriate choice?
+
+Short answer: No.
+
+The Gale-Shapley Algorithm(GSA) is designed to solve the Stable Marriage Problem, where there are equal numbers of opposing parties (i.e. men and women), and each party has a preference list of the other party. The algorithm is designed to find a **stable** matching that is optimal for both parties. 
+
+The problem we are trying to solve is different. We have students and projects, and the number of students is not equal to the number of projects. The Gale-Shapley Algorithm(GSA) is not designed to handle this scenario.
+
+Another issue with this approach is that we _don't really care about stability_ in this context. We care more about aligning students with their preferred projects. The Gale-Shapley Algorithm(GSA) may not always align students with their top preferences, as it aims to find a stable matching.
+
+The heuristic approach we discussed earlier is more appropriate for this problem, as it is designed to align students with their top preferences. It may not always find a stable matching, but that is not our primary concern.
+
+## Conclusion
+Idfk but i'm definitely getting into the project i want lmfao.
--- a/probable_teams/greedy.py
+++ b/probable_teams/greedy.py
@ -0,0 +1,126 @@
+import csv
+import json
+import random
+from collections import defaultdict
+
+def parse_students(file_path: str) -> dict:
+    """
+    Parse the CSV file and return a dictionary with student preferences.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        reader = csv.DictReader(file)
+        data = {}
+        for row in reader:
+            # Convert project IDs to integers if they exist, otherwise use an empty list
+            data[row['Name'].strip()] = [int(row[f'P{i}']) for i in range(1, 6) if row[f'P{i}'].isdigit()]
+    return data
+
+def parse_projects(file_path: str) -> dict:
+    # Project name, proj ID, chosen by:
+    with open(file_path, 'r', encoding='utf-8') as file:
+        reader = csv.DictReader(file)
+        data = {}
+        for row in reader:
+            # Project id to project name
+            data[int(row['proj ID'])] = row['Project name'].replace(".pdf", "").strip()
+    return data
+
+def assign_teams(data: dict, num_projects: int, team_size: int) -> dict:
+    """
+    Greedily assign students to teams based on their preferences using a heuristic approach.
+    """
+    project_capacity = defaultdict(lambda: team_size)
+    teams = defaultdict(list)
+    unassigned_students = []
+    
+    for student, preferences in data.items():
+        assigned = False
+        for project in preferences:
+            if project_capacity[project] > 0:
+                teams[project].append(student)
+                project_capacity[project] -= 1
+                assigned = True
+                break
+        if not assigned:
+            unassigned_students.append(student)
+    
+    available_projects = [p for p, slots in project_capacity.items() if slots > 0]
+    for student in unassigned_students:
+        if available_projects:
+            project = random.choice(available_projects)
+            teams[project].append(student)
+            project_capacity[project] -= 1
+            if project_capacity[project] == 0:
+                available_projects.remove(project)
+    
+    return teams
+
+def save_teams_to_json(teams: dict, output_file: str):
+    """
+    Save the team assignments to a JSON file.
+    """
+    with open(output_file, 'w', encoding='utf-8') as file:
+        json.dump(teams, file, indent=4, ensure_ascii=False)
+
+def replace_id_with_name(teams: dict, project_data: dict) -> dict:
+    """
+    Replace project IDs with project names in the team assignments.
+    """
+    teams_with_names = {}
+    for project_id, students in teams.items():
+        project_name = f"{project_id}: " + project_data.get(project_id, 'Unknown Project')
+        teams_with_names[project_name] = students
+    return teams_with_names
+
+def pos(look_for: str, data: dict, num_projects: int, team_size: int) -> dict:
+    """
+    Track the percentage of times a student gets assigned to each project for team sizes 4-7.
+    """
+    project_counts = defaultdict(int)
+    total_iterations = 10000  # Total number of iterations
+    
+    for i in range(total_iterations):
+        teams = assign_teams(data, num_projects, team_size)
+        for project_id, students in teams.items():
+            if look_for in students:
+                project_counts[project_id] += 1
+    
+    # Calculate the percentage of assignments for each project
+    project_percentages = {project_id: (count / total_iterations) * 100
+                           for project_id, count in project_counts.items()}
+    
+    return project_percentages
+
+def main():
+    file_path = 'data.csv'
+    num_projects = 24
+    student_name = input("Student name: ")
+    
+    # Parse the student preferences
+    data = parse_students(file_path)
+    
+    best_project = None
+    team_sizes = range(3, 100)
+    project_average_percentages = defaultdict(float)
+    
+    for team_size in team_sizes:
+        project_percentages = pos(student_name, data, num_projects, team_size)
+    
+        for project_id, percentage in project_percentages.items():
+            try:
+                project_average_percentages[project_id] += percentage
+            except KeyError:
+                project_average_percentages[project_id] = percentage
+                
+        num_team_sizes = len(team_sizes)
+        project_average_percentages = {project_id: percentage / num_team_sizes
+                                    for project_id, percentage in project_average_percentages.items()}
+        
+    best_project = max(project_average_percentages, key=project_average_percentages.get)
+    highest_average_percentage = project_average_percentages[best_project]
+    
+    project_name = parse_projects("projects.csv").get(best_project, 'Unknown Project')
+    print(f"{best_project}: {project_name} with a {highest_average_percentage:.2f}% chance of assignment")
+    print(f"Where the team size is {team_size}")
+if __name__ == '__main__':
+    main()