# statistical_analyses/probable_teams/GSA.py

import csv
import json
import os
from collections import defaultdict, deque

from tqdm import tqdm

def parse_students(file_path: str) -> dict:
    """
    Parse the student CSV file and return each student's project preferences.

    The file must have a 'Name' column and the columns 'P1' .. 'P5'. Each
    preference cell is stripped of surrounding whitespace; cells that are
    empty, missing (short row) or not a plain decimal number are skipped.

    Args:
        file_path: Path to the UTF-8 encoded CSV file.

    Returns:
        Mapping of stripped student name to the list of preferred project IDs
        (ints), in column order P1 .. P5.

    Raises:
        KeyError: If the header lacks 'Name' or one of the 'P1' .. 'P5' columns.
    """
    data = {}
    # newline='' lets the csv module handle line endings itself, as its docs advise.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            preferences = []
            for i in range(1, 6):
                # DictReader fills cells missing from a short row with None.
                cell = (row[f'P{i}'] or '').strip()
                # isdecimal() only accepts characters that int() can convert
                # (isdigit() also accepts e.g. superscript digits, which int() rejects).
                if cell.isdecimal():
                    preferences.append(int(cell))
            data[row['Name'].strip()] = preferences
    return data

def parse_projects(file_path: str, team_size: int) -> dict:
    """
    Load the project catalogue from a CSV file.

    Every row must provide a 'proj ID' column (converted to int) and a
    'Project name' column. Each project starts with an empty roster and a
    capacity equal to ``team_size``.

    Returns:
        Mapping of project ID to a dict with the keys 'name' (every ".pdf"
        occurrence removed, then stripped), 'capacity' and 'students'.
    """
    catalogue = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            key = int(record['proj ID'])
            title = record['Project name'].replace(".pdf", "").strip()
            # A fresh roster list per project so teams never share state.
            catalogue[key] = dict(name=title, capacity=team_size, students=[])
    return catalogue

def stable_marriage(students: dict, projects: dict) -> dict:
    """
    Assign students to projects with a capacity-aware, student-proposing Gale-Shapley pass.

    Students propose to the projects on their preference list in order. A
    project with a free seat accepts the proposer. A full project keeps the
    students who rank it highest: the proposer only displaces the current
    member who ranks the project lowest, and only if the proposer ranks the
    project strictly higher. On a tie the already seated student stays.

    Args:
        students: Mapping of student name to an ordered list of project IDs
            (most preferred first).
        projects: Mapping of project ID to a dict with at least 'capacity'
            (int) and 'students' (list). The 'students' lists are filled
            in place.

    Returns:
        Mapping of project ID to the list of students assigned to it (the same
        list objects that are stored in ``projects``).

    Note:
        Preferences naming a project that is not in ``projects`` are skipped.
        A student whose whole list is rejected stays unassigned and appears in
        no team.
    """
    free_students = deque(students)  # Students still waiting to propose
    student_next_choice = {student: 0 for student in students}  # Next preference index per student

    while free_students:
        student = free_students.popleft()
        preferences = students[student]
        next_choice_index = student_next_choice[student]

        if next_choice_index >= len(preferences):
            continue  # Preference list exhausted: the student stays unassigned

        proj_id = preferences[next_choice_index]
        student_next_choice[student] = next_choice_index + 1

        project = projects.get(proj_id)
        if project is None:
            free_students.append(student)  # Unknown project: move on to the next choice
            continue

        members = project['students']
        if len(members) < project['capacity']:
            members.append(student)
            continue

        # Project is full (the guard also covers zero-capacity projects).
        if members:
            # A higher index means the student ranked this project lower.
            worst_student = max(members, key=lambda s: students[s].index(proj_id))
            if preferences.index(proj_id) < students[worst_student].index(proj_id):
                # The proposer wants this project more: swap in place and
                # let the displaced student continue down their own list.
                members[members.index(worst_student)] = student
                free_students.append(worst_student)
                continue

        # Rejected: the proposer tries their next choice on a later turn.
        free_students.append(student)

    return {proj_id: proj['students'] for proj_id, proj in projects.items()}

def save_teams_to_json(teams: dict, projects: dict, output_file: str):
    """
    Save the team assignments to a JSON file.

    Each team is written under the key "<project ID>: <project name>". The
    parent directory of ``output_file`` is created when it does not exist yet.

    Args:
        teams: Mapping of project ID to the list of assigned students.
        projects: Mapping of project ID to a dict containing at least 'name'.
        output_file: Destination path of the JSON file (written as UTF-8).

    Raises:
        KeyError: If a project ID in ``teams`` is missing from ``projects``.
    """
    teams_with_names = {
        f"{proj_id}: {projects[proj_id]['name']}": members
        for proj_id, members in teams.items()
    }

    # Opening a file for writing does not create missing directories.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(teams_with_names, file, indent=4, ensure_ascii=False)

def main():
    """
    Build team assignments for team sizes 3, 4 and 5.

    Student preferences are read once from 'data.csv'. For every team size the
    projects are re-read from 'projects.csv' (so each run starts with empty
    rosters), students are assigned, and the result is written to its own
    JSON file.
    """
    preferences = parse_students('data.csv')

    for team_size in (3, 4, 5):
        catalogue = parse_projects('projects.csv', team_size)
        print(f"Assigning teams with size {team_size}...")
        assignment = stable_marriage(preferences, catalogue)
        target = f'transformed/gsa_assigned_teams_{team_size}.json'
        save_teams_to_json(assignment, catalogue, target)


# Run the assignment only when executed as a script, not on import.
if __name__ == '__main__':
    main()