Spaces:

halsabbah
/

depscreen

Runtime error

App Files Files Community

halsabbah committed on Apr 26

Commit

9baab59

verified ·

1 Parent(s): a3e927e

deploy: sync code from GitHub main

Browse files

Files changed (10) hide show

.env.example +4 -0
.github/workflows/ci.yml +7 -1
.gitignore +1 -0
app/core/config.py +4 -0
app/services/inference.py +56 -0
app/services/x_client.py +102 -35
main.py +20 -0
requirements.txt +3 -2
tests/test_inference_splitter.py +171 -0
tests/test_x_client.py +0 -1

.env.example CHANGED Viewed

@@ -37,6 +37,10 @@ CORS_ORIGINS=["http://localhost:3000","http://localhost:5173"]
 X_USERNAME=your-x-username
 X_EMAIL=your-x-email@example.com
 X_PASSWORD=your-x-password
 # Environment
 ENVIRONMENT=development

 X_USERNAME=your-x-username
 X_EMAIL=your-x-email@example.com
 X_PASSWORD=your-x-password
+# Base64-encoded JSON cookies fallback. Generate with:
+#   python -c "import base64,json; print(base64.b64encode(json.dumps({'auth_token':'...','ct0':'...'}).encode()).decode())"
+# On startup, login() is tried first. If it succeeds, this is ignored.
+X_COOKIES=
 # Environment
 ENVIRONMENT=development

.github/workflows/ci.yml CHANGED Viewed

@@ -116,6 +116,12 @@ jobs:
       - name: Audit backend requirements
         # --desc: show a one-line description of each CVE
         # --ignore-vuln: deliberately ignored CVEs with justification:
         #   CVE-2026-1839 (transformers) — affects Trainer._load_rng_state,
         #     only reachable when loading malicious rng_state.pth checkpoints
@@ -129,7 +135,7 @@ jobs:
         #     Attack requires local access; no remote exploit path. Fix is in
         #     2.8.0; re-evaluate when 2.8.0 ships stable.
         run: |
-          pip-audit --requirement requirements.txt --desc --strict \
             --ignore-vuln CVE-2026-1839 \
             --ignore-vuln CVE-2025-2953 \
             --ignore-vuln CVE-2025-3730

       - name: Audit backend requirements
         # --desc: show a one-line description of each CVE
+        # --no-strict: twikit is installed from a Codeberg VCS URL (phin fork
+        #   that fixes the KEY_BYTE indices bug in upstream 2.3.3). VCS
+        #   dependencies are not registered on PyPI and cannot be audited by
+        #   pip-audit. --no-strict (the default without --strict) treats
+        #   unauditable packages as warnings rather than hard failures, so
+        #   the scan still covers all 40+ PyPI-sourced dependencies.
         # --ignore-vuln: deliberately ignored CVEs with justification:
         #   CVE-2026-1839 (transformers) — affects Trainer._load_rng_state,
         #     only reachable when loading malicious rng_state.pth checkpoints
         #     Attack requires local access; no remote exploit path. Fix is in
         #     2.8.0; re-evaluate when 2.8.0 ships stable.
         run: |
+          pip-audit --requirement requirements.txt --desc \
             --ignore-vuln CVE-2026-1839 \
             --ignore-vuln CVE-2025-2953 \
             --ignore-vuln CVE-2025-3730

.gitignore CHANGED Viewed

@@ -110,3 +110,4 @@ ml/data/suicide_watch/
 # X/Twitter twikit cookie cache (contains session tokens)
 .x_cookies.json

 # X/Twitter twikit cookie cache (contains session tokens)
 .x_cookies.json
+*.tw_session

app/core/config.py CHANGED Viewed

@@ -57,6 +57,10 @@ class Settings(BaseSettings):
     x_username: str = ""
     x_email: str = ""
     x_password: str = ""
     # ── Error monitoring (Sentry) ──────────────────────────────────────────
     # If unset, Sentry initializes as a no-op (local dev, CI).

     x_username: str = ""
     x_email: str = ""
     x_password: str = ""
+    # Base64-encoded JSON cookies: {"auth_token": "...", "ct0": "..."}
+    # Fallback when login() is blocked (new accounts). On startup, login()
+    # is tried first; if it succeeds, this is ignored and cookies auto-refresh.
+    x_cookies: str = ""
     # ── Error monitoring (Sentry) ──────────────────────────────────────────
     # If unset, Sentry initializes as a no-op (local dev, CI).

app/services/inference.py CHANGED Viewed

@@ -63,6 +63,55 @@ class SymptomClassifier(nn.Module):
 # ── Sentence Splitting ────────────────────────────────────────────────────────
 def split_into_sentences(text: str) -> list[str]:
     """Rule-based sentence splitter for English and Arabic informal text."""
@@ -409,6 +458,13 @@ class ModelService:
                 dsm5_criteria_met=[],
             )
         detections: list[SymptomDetection] = []
         for i, sentence in enumerate(sentences):

 # ── Sentence Splitting ────────────────────────────────────────────────────────
+# Minimum character length a clause must have after compound splitting.
+# Fragments shorter than this lack enough context for reliable classification.
+_MIN_CLAUSE_LEN = 15
+_COMMA_RE = re.compile(r",\s+")
+_SEMICOLON_RE = re.compile(r";\s*")
+_ADVERSATIVE_RE = re.compile(
+    r"\s+(?:but|yet|however|though|although|whereas|still|while|meanwhile)\s+",
+    re.IGNORECASE,
+)
+_AND_RE = re.compile(r"\s+and\s+", re.IGNORECASE)
+# Only attempt "and" splits on sentences longer than this to avoid fragmenting
+# short phrases like "sad and tired" into useless single-word segments.
+_AND_MIN_LEN = 40
+def split_compound_sentence(sentence: str) -> list[str]:
+    """Split a compound sentence into clauses for finer-grained classification.
+    A single compound input ("overwhelmed and exhausted, can't focus") forces
+    the model to pick ONE label — the dominant symptom swamps the rest.
+    Splitting it lets each clause be classified independently, dramatically
+    improving recall for co-occurring symptoms.
+    Only splits when every resulting segment is >= _MIN_CLAUSE_LEN characters;
+    shorter fragments lack context and hurt model accuracy. Simple sentences
+    pass through unchanged — this is strictly additive (zero regression risk).
+    Split priority (first successful split wins):
+      1. Semicolons — strongest clause boundary in informal writing.
+      2. Commas — most common mid-sentence clause boundary.
+      3. Adversative conjunctions (but / yet / however / though / …).
+      4. "and" — only for sentences longer than _AND_MIN_LEN.
+    """
+    def _try(pattern: re.Pattern) -> list[str] | None:
+        parts = [p.strip() for p in pattern.split(sentence) if p.strip()]
+        if len(parts) > 1 and all(len(p) >= _MIN_CLAUSE_LEN for p in parts):
+            return parts
+        return None
+    return (
+        _try(_SEMICOLON_RE)
+        or _try(_COMMA_RE)
+        or _try(_ADVERSATIVE_RE)
+        or (len(sentence) >= _AND_MIN_LEN and _try(_AND_RE))
+        or [sentence]
+    )
 def split_into_sentences(text: str) -> list[str]:
     """Rule-based sentence splitter for English and Arabic informal text."""
                 dsm5_criteria_met=[],
             )
+        # Second pass: split compound sentences into clauses so each symptom
+        # gets its own focused input rather than competing inside one sentence.
+        expanded: list[str] = []
+        for sent in sentences:
+            expanded.extend(split_compound_sentence(sent))
+        sentences = expanded
         detections: list[SymptomDetection] = []
         for i, sentence in enumerate(sentences):

app/services/x_client.py CHANGED Viewed

@@ -1,64 +1,139 @@
 """
-X/Twitter integration via twikit.
-Wraps twikit's async Client to fetch a user's public tweets for
-depression screening. Uses cookie-based auth with the @depscreen
-service account.
-Cookie caching avoids re-login on every request. If cookies expire,
-one re-login attempt is made before failing.
 """
 import logging
 import math
 from datetime import UTC, datetime
-from pathlib import Path
 from twikit import Client, TooManyRequests
 from app.services.ingestion import MENTAL_HEALTH_KEYWORDS, Tweet
 logger = logging.getLogger(__name__)
-# Resolve cookie path relative to this file → backend/.x_cookies.json
-_COOKIES_PATH = str(Path(__file__).resolve().parent.parent.parent / ".x_cookies.json")
 class XClient:
     """Singleton wrapper around twikit for fetching X/Twitter user tweets."""
-    def __init__(self, username: str, email: str, password: str) -> None:
         self._username = username
         self._email = email
         self._password = password
         self._client = Client("en-US")
         self._authenticated = False
     async def initialize(self) -> None:
-        """Authenticate with X. Try cookies first, fall back to login."""
-        try:
-            self._client.load_cookies(_COOKIES_PATH)
-            self._authenticated = True
-            logger.info("X/Twitter: loaded cached cookies")
-        except Exception:
-            logger.info("X/Twitter: no cached cookies, performing login")
-            await self._login()
-    async def _login(self) -> None:
-        """Perform a fresh login and save cookies."""
         try:
             await self._client.login(
                 auth_info_1=self._username,
                 auth_info_2=self._email,
                 password=self._password,
             )
-            self._client.save_cookies(_COOKIES_PATH)
             self._authenticated = True
-            logger.info("X/Twitter: login successful, cookies saved")
         except Exception as e:
-            logger.error(f"X/Twitter login failed: {e}")
-            self._authenticated = False
-            raise ValueError("X/Twitter authentication failed. Please check credentials.") from e
     async def fetch_user_tweets(
         self,
@@ -89,15 +164,7 @@ class XClient:
             minutes = self._rate_limit_minutes(e)
             raise ValueError(f"X rate limit reached — please try again in {minutes} minutes.") from e
         except Exception as e:
-            # Try one re-login in case cookies expired
-            if "auth" in str(e).lower() or "login" in str(e).lower():
-                try:
-                    await self._login()
-                    user = await self._client.get_user_by_screen_name(username)
-                except Exception:
-                    raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
-            else:
-                raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
         try:
             tweets_result = await user.get_tweets("Tweets", count=limit)

 """
+X/Twitter integration via twikit (phin fork).
+Deployment-friendly auth strategy:
+1. On startup: try login() for fresh cookies (auto-refresh).
+2. If login fails (new account, error 399): fall back to X_COOKIES env var.
+3. Every 12h: background task retries login(). The moment X accepts it
+   (account ages past anti-spam gate), cookie refresh becomes automatic.
+Cookies are stored in the X_COOKIES env var as base64-encoded JSON,
+not on disk — works in stateless containers (HuggingFace Spaces, Docker).
+Workaround for twikit User parsing: new X accounts are missing optional
+fields like 'withheld_in_countries'. SafeDict returns sensible defaults.
 """
+import base64
+import json
 import logging
 import math
 from datetime import UTC, datetime
 from twikit import Client, TooManyRequests
+from twikit.user import User as TwikitUser
 from app.services.ingestion import MENTAL_HEALTH_KEYWORDS, Tweet
 logger = logging.getLogger(__name__)
+# ── twikit User parsing workaround ───────────────────────────────────────────
+class _SafeDict(dict):
+    """Dict that returns sensible defaults for missing keys.
+    twikit's User.__init__ accesses optional fields with raw dict[] lookups.
+    New X accounts lack fields like 'pinned_tweet_ids_str',
+    'withheld_in_countries'. This prevents KeyError crashes.
+    """
+    def __missing__(self, key: str):
+        if any(key.endswith(s) for s in ("_str", "_ids", "urls", "countries")):
+            return []
+        if key.endswith("_count") or key.endswith("_int"):
+            return 0
+        if key.endswith("_url") or key.endswith("_https"):
+            return ""
+        if key in ("entities", "description", "url"):
+            return _SafeDict()
+        return None
+def _safe_wrap(data):
+    """Recursively wrap dicts in _SafeDict."""
+    if isinstance(data, dict):
+        return _SafeDict({k: _safe_wrap(v) for k, v in data.items()})
+    if isinstance(data, list):
+        return [_safe_wrap(i) for i in data]
+    return data
# Keep a handle on twikit's own constructor so the patch can delegate to it.
_original_user_init = TwikitUser.__init__


def _patched_user_init(self, client, data, **kwargs):
    """Wrap the user's 'legacy' payload in _SafeDict, then run twikit's init.

    The 'legacy' entry of *data* is replaced in place (the caller's mapping
    is mutated) when present; other entries are passed through untouched.
    """
    has_legacy = "legacy" in data
    if has_legacy:
        data["legacy"] = _safe_wrap(data["legacy"])
    _original_user_init(self, client, data, **kwargs)


# Install the patch at import time so every User built afterwards uses it.
TwikitUser.__init__ = _patched_user_init
+# ── XClient ──────────────────────────────────────────────────────────────────
 class XClient:
     """Singleton wrapper around twikit for fetching X/Twitter user tweets."""
+    def __init__(self, username: str, email: str, password: str, cookies_b64: str = "") -> None:
         self._username = username
         self._email = email
         self._password = password
+        self._cookies_b64 = cookies_b64
         self._client = Client("en-US")
         self._authenticated = False
     async def initialize(self) -> None:
+        """Authenticate with X. Try login() first, fall back to env var cookies."""
+        # Try login for fresh cookies (auto-refresh path)
+        if await self._try_login():
+            return
+        # Fall back to stored cookies from X_COOKIES env var
+        if self._cookies_b64:
+            try:
+                cookie_json = base64.b64decode(self._cookies_b64).decode()
+                cookies = json.loads(cookie_json)
+                self._client.set_cookies(cookies)
+                self._authenticated = True
+                logger.info("X/Twitter: loaded cookies from X_COOKIES env var")
+                return
+            except Exception as e:
+                logger.error(f"X/Twitter: failed to parse X_COOKIES: {e}")
+        raise ValueError(
+            "X/Twitter: could not authenticate. login() failed and no valid X_COOKIES provided. "
+            'Set X_COOKIES env var with base64-encoded JSON: {"auth_token": "...", "ct0": "..."}'
+        )
+    async def _try_login(self) -> bool:
+        """Attempt login(). Returns True on success, False on failure (non-fatal)."""
         try:
             await self._client.login(
                 auth_info_1=self._username,
                 auth_info_2=self._email,
                 password=self._password,
             )
             self._authenticated = True
+            logger.info("X/Twitter: login() succeeded — cookies auto-refreshed")
+            return True
         except Exception as e:
+            logger.info(f"X/Twitter: login() failed (will try stored cookies): {e}")
+            return False
+    async def refresh_cookies(self) -> None:
+        """Background task: retry login() to auto-refresh cookies.
+        Called by APScheduler every 12 hours. When the account ages past
+        X's anti-spam gate, this starts succeeding automatically.
+        """
+        if await self._try_login():
+            logger.info("X/Twitter: background cookie refresh succeeded")
+        else:
+            logger.debug("X/Twitter: background cookie refresh failed (not critical)")
     async def fetch_user_tweets(
         self,
             minutes = self._rate_limit_minutes(e)
             raise ValueError(f"X rate limit reached — please try again in {minutes} minutes.") from e
         except Exception as e:
+            raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
         try:
             tweets_result = await user.get_tweets("Tweets", count=limit)

main.py CHANGED Viewed

@@ -127,10 +127,30 @@ async def lifespan(app: FastAPI):
                 username=settings.x_username,
                 email=settings.x_email,
                 password=settings.x_password,
             )
             await x_client.initialize()
             set_x_client(x_client)
             logger.info("X/Twitter client initialized")
         except Exception as e:
             logger.warning(f"X/Twitter client initialization failed (non-fatal): {e}")
     else:

                 username=settings.x_username,
                 email=settings.x_email,
                 password=settings.x_password,
+                cookies_b64=settings.x_cookies,
             )
             await x_client.initialize()
             set_x_client(x_client)
             logger.info("X/Twitter client initialized")
+            # Schedule background cookie refresh every 12 hours.
+            # When the account ages past X's anti-spam gate, login()
+            # starts succeeding and cookies auto-refresh.
+            try:
+                from app.services.scheduler import get_scheduler
+                scheduler = get_scheduler()
+                if scheduler and scheduler.running:
+                    scheduler.add_job(
+                        x_client.refresh_cookies,
+                        "interval",
+                        hours=12,
+                        id="x_cookie_refresh",
+                        replace_existing=True,
+                    )
+                    logger.info("X/Twitter: scheduled 12h cookie refresh")
+            except Exception as e:
+                logger.debug(f"X/Twitter: could not schedule cookie refresh: {e}")
         except Exception as e:
             logger.warning(f"X/Twitter client initialization failed (non-fatal): {e}")
     else:

requirements.txt CHANGED Viewed

@@ -66,8 +66,9 @@ pydantic>=2.11,<3
 pydantic-settings>=2.1.0
 httpx==0.26.0
-# X/Twitter integration (unofficial GraphQL client — cookie-based auth)
-twikit>=2.3.0
 # PDF generation (screening result reports, patient export, clinical summary)
 reportlab>=4.0.0

 pydantic-settings>=2.1.0
 httpx==0.26.0
+# X/Twitter integration (phin fork — fixes KEY_BYTE indices bug in upstream 2.3.3)
+twikit @ git+https://codeberg.org/phin/twikit.git
+curl_cffi>=0.7.0
 # PDF generation (screening result reports, patient export, clinical summary)
 reportlab>=4.0.0

tests/test_inference_splitter.py ADDED Viewed

	@@ -0,0 +1,171 @@

+"""Unit tests for split_compound_sentence — the second-pass clause splitter.
+These tests guard the strictly-additive guarantee: simple sentences pass
+through unchanged, and compound sentences are split only when every resulting
+segment is long enough to be meaningful (_MIN_CLAUSE_LEN = 15 chars).
+"""
+from __future__ import annotations
+import pytest
+from app.services.inference import (
+    _MIN_CLAUSE_LEN,
+    split_compound_sentence,
+    split_into_sentences,
+)
+# ── Regression: simple sentences must pass through unchanged ──────────────────
@pytest.mark.parametrize(
    "sentence",
    [
        "I feel sad today",
        "tired",
        "Can't sleep at all",
        "I'm feeling hopeless",
        # Under _AND_MIN_LEN, so the "and" split is never even attempted;
        # it would also yield fragments shorter than _MIN_CLAUSE_LEN.
        "sad and tired",
        "Feeling sad and blue",
        "No appetite lately",
    ],
)
def test_simple_sentences_pass_through(sentence: str):
    """Sentences with no usable clause boundary come back as a one-item list."""
    assert split_compound_sentence(sentence) == [sentence], f"Should not have split: {sentence!r}"
+# ── Comma splits ──────────────────────────────────────────────────────────────
def test_comma_split_fatigue_cognitive():
    """The motivating case: a FATIGUE + COGNITIVE compound splits on the comma."""
    clauses = split_compound_sentence(
        "Feeling overwhelmed and exhausted lately, can't seem to focus on anything"
    )
    expected = [
        "Feeling overwhelmed and exhausted lately",
        "can't seem to focus on anything",
    ]
    assert clauses == expected
def test_comma_split_two_long_clauses():
    """Two long comma-joined clauses split into exactly two valid parts."""
    clauses = split_compound_sentence(
        "I haven't been sleeping well at all, my appetite has completely disappeared"
    )
    assert len(clauses) == 2
    # Every clause must reach the minimum length, i.e. the shortest one does.
    assert min(map(len, clauses)) >= _MIN_CLAUSE_LEN
def test_comma_no_split_if_fragment_too_short():
    """Comma split is accepted just above the length threshold, rejected below it."""
    # "crying most days" = 16 chars >= _MIN_CLAUSE_LEN -> the split is kept.
    borderline = "I've been feeling very depressed lately, crying most days"
    assert split_compound_sentence(borderline) == [
        "I've been feeling very depressed lately",
        "crying most days",
    ]
    # "crying daily" = 12 chars < _MIN_CLAUSE_LEN -> the comma split is
    # rejected, and no other separator applies, so the sentence is unchanged.
    too_short = "I've been feeling very depressed lately, crying daily"
    assert split_compound_sentence(too_short) == [too_short]
def test_comma_list_short_items_no_split():
    """A comma-separated list of short words must not become micro-fragments."""
    short_list = "Sad, tired, hopeless"  # every item is below _MIN_CLAUSE_LEN
    assert split_compound_sentence(short_list) == [short_list]
+# ── Semicolon splits ──────────────────────────────────────────────────────────
def test_semicolon_split():
    """Two long clauses joined by a semicolon split into exactly two parts."""
    clauses = split_compound_sentence(
        "I can't get out of bed in the morning; everything feels pointless"
    )
    assert len(clauses) == 2
    assert min(map(len, clauses)) >= _MIN_CLAUSE_LEN
def test_semicolon_rejected_when_fragment_too_short_falls_to_comma():
    """A rejected semicolon split falls through to the comma split."""
    # "I feel empty" = 12 chars < _MIN_CLAUSE_LEN, so the semicolon split is
    # rejected as a whole. The comma split then yields two long-enough parts:
    #   "I feel empty; nothing brings me joy" (35 chars, semicolon retained)
    #   "not even the things I used to love"  (34 chars)
    clauses = split_compound_sentence(
        "I feel empty; nothing brings me joy, not even the things I used to love"
    )
    assert clauses == [
        "I feel empty; nothing brings me joy",
        "not even the things I used to love",
    ]
+# ── Adversative conjunction splits ───────────────────────────────────────────
@pytest.mark.parametrize(
    "sentence",
    [
        "I want to feel better but nothing seems to help anymore",
        "I try to push through yet the exhaustion never lifts",
        "I used to enjoy cooking though now it feels like a chore",
        "I show up to work however inside I feel completely numb",
    ],
)
def test_adversative_conjunction_split(sentence: str):
    """Each adversative conjunction splits its sentence into two valid clauses."""
    clauses = split_compound_sentence(sentence)
    assert len(clauses) == 2, f"Expected 2 clauses for: {sentence!r}"
    assert min(map(len, clauses)) >= _MIN_CLAUSE_LEN
+# ── "and" splits (long sentences only) ───────────────────────────────────────
def test_and_split_long_sentence():
    """A long sentence whose "and" joins two independent clauses is split."""
    clauses = split_compound_sentence(
        "I've been feeling really exhausted and I can't concentrate at work at all"
    )
    assert len(clauses) == 2
    assert min(map(len, clauses)) >= _MIN_CLAUSE_LEN
def test_and_no_split_short_sentence():
    """A sentence shorter than _AND_MIN_LEN (40) is never split on "and"."""
    just_under = "Feeling sad and completely hopeless now"  # 39 chars
    assert split_compound_sentence(just_under) == [just_under]
+# ── Integration with split_into_sentences pipeline ───────────────────────────
def test_full_pipeline_compound_post():
    """The two-pass pipeline (sentences, then clauses) yields four segments
    for a three-sentence post whose middle sentence is a comma compound."""
    post = (
        "I haven't been sleeping well lately. "
        "Feeling overwhelmed and exhausted, can't seem to focus on anything. "
        "I don't enjoy the things I used to love."
    )
    # First pass: one entry per sentence.
    first_pass = split_into_sentences(post)
    assert len(first_pass) == 3
    # Second pass: only the middle sentence splits (on its comma), so the
    # segment counts per sentence are 1 + 2 + 1.
    expanded: list[str] = [
        clause for sent in first_pass for clause in split_compound_sentence(sent)
    ]
    assert len(expanded) == 4
    assert "Feeling overwhelmed and exhausted" in expanded[1]
    assert "can't seem to focus on anything" in expanded[2]
def test_all_segments_meet_min_length():
    """Every fragment of an actual split is >= _MIN_CLAUSE_LEN characters.

    An unsplit result (the input returned as the only element) is exempt,
    since simple sentences pass through regardless of their length.
    """
    cases = [
        "Feeling overwhelmed and exhausted lately, can't seem to focus on anything",
        "I can't sleep at night; I'm exhausted all day long",
        "I want to feel better but nothing seems to work",
        "sad and tired",
        "I feel completely empty",
    ]
    for sentence in cases:
        result = split_compound_sentence(sentence)
        if result == [sentence]:
            continue
        for part in result:
            assert len(part) >= _MIN_CLAUSE_LEN, (
                f"Fragment {part!r} too short (from {sentence!r})"
            )

tests/test_x_client.py CHANGED Viewed

@@ -120,7 +120,6 @@ async def test_user_not_found_raises_valueerror(x_client):
             new_callable=AsyncMock,
             side_effect=Exception("User not found"),
         ),
-        patch.object(x_client, "_login", new_callable=AsyncMock, side_effect=Exception("login failed")),
         pytest.raises(ValueError, match="not found"),
     ):
         await x_client.fetch_user_tweets("nonexistent_user_xyz")

             new_callable=AsyncMock,
             side_effect=Exception("User not found"),
         ),
         pytest.raises(ValueError, match="not found"),
     ):
         await x_client.fetch_user_tweets("nonexistent_user_xyz")