"""Train a disease classifier from free-text symptom descriptions.

Reads a CSV, turns each symptom description into a fixed-length binary
keyword vector, fits a random forest on it, and saves the model, the label
encoder and the keyword list with joblib.
"""

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ----------------------------
# 1. Define the symptom keywords you care about (fixed set)
# ----------------------------
# The order of this list defines the feature order of the model, which is
# why the list itself is saved next to the model in step 6.
SYMPTOM_KEYWORDS = [
    "cough",
    "shortness of breath",
    "wheezing",
    "chest pain",
    "fever",
    "sore throat",
    "fatigue",
    "nasal congestion",
]

# ----------------------------
# 2. Load your CSV (labels come ONLY from here)
# ----------------------------
CSV_FILE = "filtered_dataset.csv"  # <- your actual file
TEXT_COL = "Symptoms"              # <- column with symptom descriptions
LABEL_COL = "Disease"              # <- column with disease names

df = pd.read_csv(CSV_FILE)

# Drop rows with missing symptoms or labels, then force the text column to
# str so that .lower() in the vectorizer below cannot fail on non-string cells.
df = df.dropna(subset=[TEXT_COL, LABEL_COL]).copy()
df[TEXT_COL] = df[TEXT_COL].astype(str)

print(f"Loaded {len(df)} rows from CSV.")
print(f"Unique diseases found: {sorted(df[LABEL_COL].unique())}")


# ----------------------------
# 3. Convert free-text -> binary symptom vector
# ----------------------------
def symptoms_to_binary_vector(text: str) -> list:
    """Return a 0/1 list marking which SYMPTOM_KEYWORDS occur in *text*.

    The result has one entry per keyword, in the order of SYMPTOM_KEYWORDS.
    Matching is a case-insensitive substring test.

    NOTE: because this is a plain substring test, negations such as
    "no fever" still set the "fever" flag, and a keyword embedded in a
    longer word also matches.
    """
    lowered = text.lower()
    return [1 if keyword in lowered else 0 for keyword in SYMPTOM_KEYWORDS]


# Apply to every row
X = df[TEXT_COL].apply(symptoms_to_binary_vector).tolist()
y = df[LABEL_COL].values  # labels directly from CSV

# ----------------------------
# 4. Encode labels (if not already numeric)
# ----------------------------
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# ----------------------------
# 5. Train model
# ----------------------------
# A stratified split raises ValueError when some disease has only one row
# (or when the test split is smaller than the number of classes). In that
# case fall back to a plain random split instead of aborting the training.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
except ValueError as err:
    print(f"Warning: stratified split failed ({err}); using a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Report accuracy on the held-out rows so the split is actually used and a
# badly performing model is visible before it gets saved.
test_accuracy = model.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f} ({len(X_test)} test rows)")

# ----------------------------
# 6. Save everything for your app
# ----------------------------
joblib.dump(model, "disease_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(SYMPTOM_KEYWORDS, "symptom_keywords.pkl")

print("\n✅ Training complete!")
print("Saved: disease_model.pkl, label_encoder.pkl, symptom_keywords.pkl")