halsabbah commited on
Commit
9baab59
Β·
verified Β·
1 Parent(s): a3e927e

deploy: sync code from GitHub main

Browse files
.env.example CHANGED
@@ -37,6 +37,10 @@ CORS_ORIGINS=["http://localhost:3000","http://localhost:5173"]
37
  X_USERNAME=your-x-username
38
  X_EMAIL=your-x-email@example.com
39
  X_PASSWORD=your-x-password
 
 
 
 
40
 
41
  # Environment
42
  ENVIRONMENT=development
 
37
  X_USERNAME=your-x-username
38
  X_EMAIL=your-x-email@example.com
39
  X_PASSWORD=your-x-password
40
+ # Base64-encoded JSON cookies fallback. Generate with:
41
+ # python -c "import base64,json; print(base64.b64encode(json.dumps({'auth_token':'...','ct0':'...'}).encode()).decode())"
42
+ # On startup, login() is tried first. If it succeeds, this is ignored.
43
+ X_COOKIES=
44
 
45
  # Environment
46
  ENVIRONMENT=development
.github/workflows/ci.yml CHANGED
@@ -116,6 +116,12 @@ jobs:
116
 
117
  - name: Audit backend requirements
118
  # --desc: show a one-line description of each CVE
 
 
 
 
 
 
119
  # --ignore-vuln: deliberately ignored CVEs with justification:
120
  # CVE-2026-1839 (transformers) β€” affects Trainer._load_rng_state,
121
  # only reachable when loading malicious rng_state.pth checkpoints
@@ -129,7 +135,7 @@ jobs:
129
  # Attack requires local access; no remote exploit path. Fix is in
130
  # 2.8.0; re-evaluate when 2.8.0 ships stable.
131
  run: |
132
- pip-audit --requirement requirements.txt --desc --strict \
133
  --ignore-vuln CVE-2026-1839 \
134
  --ignore-vuln CVE-2025-2953 \
135
  --ignore-vuln CVE-2025-3730
 
116
 
117
  - name: Audit backend requirements
118
  # --desc: show a one-line description of each CVE
119
+ # --no-strict: twikit is installed from a Codeberg VCS URL (phin fork
120
+ # that fixes the KEY_BYTE indices bug in upstream 2.3.3). VCS
121
+ # dependencies are not registered on PyPI and cannot be audited by
122
+ # pip-audit. --no-strict (the default without --strict) treats
123
+ # unauditable packages as warnings rather than hard failures, so
124
+ # the scan still covers all 40+ PyPI-sourced dependencies.
125
  # --ignore-vuln: deliberately ignored CVEs with justification:
126
  # CVE-2026-1839 (transformers) β€” affects Trainer._load_rng_state,
127
  # only reachable when loading malicious rng_state.pth checkpoints
 
135
  # Attack requires local access; no remote exploit path. Fix is in
136
  # 2.8.0; re-evaluate when 2.8.0 ships stable.
137
  run: |
138
+ pip-audit --requirement requirements.txt --desc \
139
  --ignore-vuln CVE-2026-1839 \
140
  --ignore-vuln CVE-2025-2953 \
141
  --ignore-vuln CVE-2025-3730
.gitignore CHANGED
@@ -110,3 +110,4 @@ ml/data/suicide_watch/
110
 
111
  # X/Twitter twikit cookie cache (contains session tokens)
112
  .x_cookies.json
 
 
110
 
111
  # X/Twitter twikit cookie cache (contains session tokens)
112
  .x_cookies.json
113
+ *.tw_session
app/core/config.py CHANGED
@@ -57,6 +57,10 @@ class Settings(BaseSettings):
57
  x_username: str = ""
58
  x_email: str = ""
59
  x_password: str = ""
 
 
 
 
60
 
61
  # ── Error monitoring (Sentry) ──────────────────────────────────────────
62
  # If unset, Sentry initializes as a no-op (local dev, CI).
 
57
  x_username: str = ""
58
  x_email: str = ""
59
  x_password: str = ""
60
+ # Base64-encoded JSON cookies: {"auth_token": "...", "ct0": "..."}
61
+ # Fallback when login() is blocked (new accounts). On startup, login()
62
+ # is tried first; if it succeeds, this is ignored and cookies auto-refresh.
63
+ x_cookies: str = ""
64
 
65
  # ── Error monitoring (Sentry) ──────────────────────────────────────────
66
  # If unset, Sentry initializes as a no-op (local dev, CI).
app/services/inference.py CHANGED
@@ -63,6 +63,55 @@ class SymptomClassifier(nn.Module):
63
 
64
  # ── Sentence Splitting ────────────────────────────────────────────────────────
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def split_into_sentences(text: str) -> list[str]:
68
  """Rule-based sentence splitter for English and Arabic informal text."""
@@ -409,6 +458,13 @@ class ModelService:
409
  dsm5_criteria_met=[],
410
  )
411
 
 
 
 
 
 
 
 
412
  detections: list[SymptomDetection] = []
413
 
414
  for i, sentence in enumerate(sentences):
 
63
 
64
  # ── Sentence Splitting ────────────────────────────────────────────────────────
65
 
66
+ # Minimum character length a clause must have after compound splitting.
67
+ # Fragments shorter than this lack enough context for reliable classification.
68
+ _MIN_CLAUSE_LEN = 15
69
+
70
+ _COMMA_RE = re.compile(r",\s+")
71
+ _SEMICOLON_RE = re.compile(r";\s*")
72
+ _ADVERSATIVE_RE = re.compile(
73
+ r"\s+(?:but|yet|however|though|although|whereas|still|while|meanwhile)\s+",
74
+ re.IGNORECASE,
75
+ )
76
+ _AND_RE = re.compile(r"\s+and\s+", re.IGNORECASE)
77
+ # Only attempt "and" splits on sentences longer than this to avoid fragmenting
78
+ # short phrases like "sad and tired" into useless single-word segments.
79
+ _AND_MIN_LEN = 40
80
+
81
+
82
+ def split_compound_sentence(sentence: str) -> list[str]:
83
+ """Split a compound sentence into clauses for finer-grained classification.
84
+
85
+ A single compound input ("overwhelmed and exhausted, can't focus") forces
86
+ the model to pick ONE label β€” the dominant symptom swamps the rest.
87
+ Splitting it lets each clause be classified independently, dramatically
88
+ improving recall for co-occurring symptoms.
89
+
90
+ Only splits when every resulting segment is >= _MIN_CLAUSE_LEN characters;
91
+ shorter fragments lack context and hurt model accuracy. Simple sentences
92
+ pass through unchanged β€” this is strictly additive (zero regression risk).
93
+
94
+ Split priority (first successful split wins):
95
+ 1. Semicolons β€” strongest clause boundary in informal writing.
96
+ 2. Commas β€” most common mid-sentence clause boundary.
97
+ 3. Adversative conjunctions (but / yet / however / though / …).
98
+ 4. "and" β€” only for sentences longer than _AND_MIN_LEN.
99
+ """
100
+
101
+ def _try(pattern: re.Pattern) -> list[str] | None:
102
+ parts = [p.strip() for p in pattern.split(sentence) if p.strip()]
103
+ if len(parts) > 1 and all(len(p) >= _MIN_CLAUSE_LEN for p in parts):
104
+ return parts
105
+ return None
106
+
107
+ return (
108
+ _try(_SEMICOLON_RE)
109
+ or _try(_COMMA_RE)
110
+ or _try(_ADVERSATIVE_RE)
111
+ or (len(sentence) >= _AND_MIN_LEN and _try(_AND_RE))
112
+ or [sentence]
113
+ )
114
+
115
 
116
  def split_into_sentences(text: str) -> list[str]:
117
  """Rule-based sentence splitter for English and Arabic informal text."""
 
458
  dsm5_criteria_met=[],
459
  )
460
 
461
+ # Second pass: split compound sentences into clauses so each symptom
462
+ # gets its own focused input rather than competing inside one sentence.
463
+ expanded: list[str] = []
464
+ for sent in sentences:
465
+ expanded.extend(split_compound_sentence(sent))
466
+ sentences = expanded
467
+
468
  detections: list[SymptomDetection] = []
469
 
470
  for i, sentence in enumerate(sentences):
app/services/x_client.py CHANGED
@@ -1,64 +1,139 @@
1
  """
2
- X/Twitter integration via twikit.
3
 
4
- Wraps twikit's async Client to fetch a user's public tweets for
5
- depression screening. Uses cookie-based auth with the @depscreen
6
- service account.
 
 
7
 
8
- Cookie caching avoids re-login on every request. If cookies expire,
9
- one re-login attempt is made before failing.
 
 
 
10
  """
11
 
 
 
12
  import logging
13
  import math
14
  from datetime import UTC, datetime
15
- from pathlib import Path
16
 
17
  from twikit import Client, TooManyRequests
 
18
 
19
  from app.services.ingestion import MENTAL_HEALTH_KEYWORDS, Tweet
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
- # Resolve cookie path relative to this file β†’ backend/.x_cookies.json
24
- _COOKIES_PATH = str(Path(__file__).resolve().parent.parent.parent / ".x_cookies.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
 
27
  class XClient:
28
  """Singleton wrapper around twikit for fetching X/Twitter user tweets."""
29
 
30
- def __init__(self, username: str, email: str, password: str) -> None:
31
  self._username = username
32
  self._email = email
33
  self._password = password
 
34
  self._client = Client("en-US")
35
  self._authenticated = False
36
 
37
  async def initialize(self) -> None:
38
- """Authenticate with X. Try cookies first, fall back to login."""
39
- try:
40
- self._client.load_cookies(_COOKIES_PATH)
41
- self._authenticated = True
42
- logger.info("X/Twitter: loaded cached cookies")
43
- except Exception:
44
- logger.info("X/Twitter: no cached cookies, performing login")
45
- await self._login()
46
-
47
- async def _login(self) -> None:
48
- """Perform a fresh login and save cookies."""
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  try:
50
  await self._client.login(
51
  auth_info_1=self._username,
52
  auth_info_2=self._email,
53
  password=self._password,
54
  )
55
- self._client.save_cookies(_COOKIES_PATH)
56
  self._authenticated = True
57
- logger.info("X/Twitter: login successful, cookies saved")
 
58
  except Exception as e:
59
- logger.error(f"X/Twitter login failed: {e}")
60
- self._authenticated = False
61
- raise ValueError("X/Twitter authentication failed. Please check credentials.") from e
 
 
 
 
 
 
 
 
 
 
62
 
63
  async def fetch_user_tweets(
64
  self,
@@ -89,15 +164,7 @@ class XClient:
89
  minutes = self._rate_limit_minutes(e)
90
  raise ValueError(f"X rate limit reached β€” please try again in {minutes} minutes.") from e
91
  except Exception as e:
92
- # Try one re-login in case cookies expired
93
- if "auth" in str(e).lower() or "login" in str(e).lower():
94
- try:
95
- await self._login()
96
- user = await self._client.get_user_by_screen_name(username)
97
- except Exception:
98
- raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
99
- else:
100
- raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
101
 
102
  try:
103
  tweets_result = await user.get_tweets("Tweets", count=limit)
 
1
  """
2
+ X/Twitter integration via twikit (phin fork).
3
 
4
+ Deployment-friendly auth strategy:
5
+ 1. On startup: try login() for fresh cookies (auto-refresh).
6
+ 2. If login fails (new account, error 399): fall back to X_COOKIES env var.
7
+ 3. Every 12h: background task retries login(). The moment X accepts it
8
+ (account ages past anti-spam gate), cookie refresh becomes automatic.
9
 
10
+ Cookies are stored in the X_COOKIES env var as base64-encoded JSON,
11
+ not on disk β€” works in stateless containers (HuggingFace Spaces, Docker).
12
+
13
+ Workaround for twikit User parsing: new X accounts are missing optional
14
+ fields like 'withheld_in_countries'. SafeDict returns sensible defaults.
15
  """
16
 
17
+ import base64
18
+ import json
19
  import logging
20
  import math
21
  from datetime import UTC, datetime
 
22
 
23
  from twikit import Client, TooManyRequests
24
+ from twikit.user import User as TwikitUser
25
 
26
  from app.services.ingestion import MENTAL_HEALTH_KEYWORDS, Tweet
27
 
28
  logger = logging.getLogger(__name__)
29
 
30
+
31
+ # ── twikit User parsing workaround ───────────────────────────────────────────
32
+
33
+
34
+ class _SafeDict(dict):
35
+ """Dict that returns sensible defaults for missing keys.
36
+
37
+ twikit's User.__init__ accesses optional fields with raw dict[] lookups.
38
+ New X accounts lack fields like 'pinned_tweet_ids_str',
39
+ 'withheld_in_countries'. This prevents KeyError crashes.
40
+ """
41
+
42
+ def __missing__(self, key: str):
43
+ if any(key.endswith(s) for s in ("_str", "_ids", "urls", "countries")):
44
+ return []
45
+ if key.endswith("_count") or key.endswith("_int"):
46
+ return 0
47
+ if key.endswith("_url") or key.endswith("_https"):
48
+ return ""
49
+ if key in ("entities", "description", "url"):
50
+ return _SafeDict()
51
+ return None
52
+
53
+
54
+ def _safe_wrap(data):
55
+ """Recursively wrap dicts in _SafeDict."""
56
+ if isinstance(data, dict):
57
+ return _SafeDict({k: _safe_wrap(v) for k, v in data.items()})
58
+ if isinstance(data, list):
59
+ return [_safe_wrap(i) for i in data]
60
+ return data
61
+
62
+
63
+ _original_user_init = TwikitUser.__init__
64
+
65
+
66
+ def _patched_user_init(self, client, data, **kwargs):
67
+ if "legacy" in data:
68
+ data["legacy"] = _safe_wrap(data["legacy"])
69
+ _original_user_init(self, client, data, **kwargs)
70
+
71
+
72
+ TwikitUser.__init__ = _patched_user_init
73
+
74
+
75
+ # ── XClient ──────────────────────────────────────────────────────────────────
76
 
77
 
78
  class XClient:
79
  """Singleton wrapper around twikit for fetching X/Twitter user tweets."""
80
 
81
+ def __init__(self, username: str, email: str, password: str, cookies_b64: str = "") -> None:
82
  self._username = username
83
  self._email = email
84
  self._password = password
85
+ self._cookies_b64 = cookies_b64
86
  self._client = Client("en-US")
87
  self._authenticated = False
88
 
89
  async def initialize(self) -> None:
90
+ """Authenticate with X. Try login() first, fall back to env var cookies."""
91
+ # Try login for fresh cookies (auto-refresh path)
92
+ if await self._try_login():
93
+ return
94
+
95
+ # Fall back to stored cookies from X_COOKIES env var
96
+ if self._cookies_b64:
97
+ try:
98
+ cookie_json = base64.b64decode(self._cookies_b64).decode()
99
+ cookies = json.loads(cookie_json)
100
+ self._client.set_cookies(cookies)
101
+ self._authenticated = True
102
+ logger.info("X/Twitter: loaded cookies from X_COOKIES env var")
103
+ return
104
+ except Exception as e:
105
+ logger.error(f"X/Twitter: failed to parse X_COOKIES: {e}")
106
+
107
+ raise ValueError(
108
+ "X/Twitter: could not authenticate. login() failed and no valid X_COOKIES provided. "
109
+ 'Set X_COOKIES env var with base64-encoded JSON: {"auth_token": "...", "ct0": "..."}'
110
+ )
111
+
112
+ async def _try_login(self) -> bool:
113
+ """Attempt login(). Returns True on success, False on failure (non-fatal)."""
114
  try:
115
  await self._client.login(
116
  auth_info_1=self._username,
117
  auth_info_2=self._email,
118
  password=self._password,
119
  )
 
120
  self._authenticated = True
121
+ logger.info("X/Twitter: login() succeeded β€” cookies auto-refreshed")
122
+ return True
123
  except Exception as e:
124
+ logger.info(f"X/Twitter: login() failed (will try stored cookies): {e}")
125
+ return False
126
+
127
+ async def refresh_cookies(self) -> None:
128
+ """Background task: retry login() to auto-refresh cookies.
129
+
130
+ Called by APScheduler every 12 hours. When the account ages past
131
+ X's anti-spam gate, this starts succeeding automatically.
132
+ """
133
+ if await self._try_login():
134
+ logger.info("X/Twitter: background cookie refresh succeeded")
135
+ else:
136
+ logger.debug("X/Twitter: background cookie refresh failed (not critical)")
137
 
138
  async def fetch_user_tweets(
139
  self,
 
164
  minutes = self._rate_limit_minutes(e)
165
  raise ValueError(f"X rate limit reached β€” please try again in {minutes} minutes.") from e
166
  except Exception as e:
167
+ raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
 
 
 
 
 
 
 
 
168
 
169
  try:
170
  tweets_result = await user.get_tweets("Tweets", count=limit)
main.py CHANGED
@@ -127,10 +127,30 @@ async def lifespan(app: FastAPI):
127
  username=settings.x_username,
128
  email=settings.x_email,
129
  password=settings.x_password,
 
130
  )
131
  await x_client.initialize()
132
  set_x_client(x_client)
133
  logger.info("X/Twitter client initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  except Exception as e:
135
  logger.warning(f"X/Twitter client initialization failed (non-fatal): {e}")
136
  else:
 
127
  username=settings.x_username,
128
  email=settings.x_email,
129
  password=settings.x_password,
130
+ cookies_b64=settings.x_cookies,
131
  )
132
  await x_client.initialize()
133
  set_x_client(x_client)
134
  logger.info("X/Twitter client initialized")
135
+
136
+ # Schedule background cookie refresh every 12 hours.
137
+ # When the account ages past X's anti-spam gate, login()
138
+ # starts succeeding and cookies auto-refresh.
139
+ try:
140
+ from app.services.scheduler import get_scheduler
141
+
142
+ scheduler = get_scheduler()
143
+ if scheduler and scheduler.running:
144
+ scheduler.add_job(
145
+ x_client.refresh_cookies,
146
+ "interval",
147
+ hours=12,
148
+ id="x_cookie_refresh",
149
+ replace_existing=True,
150
+ )
151
+ logger.info("X/Twitter: scheduled 12h cookie refresh")
152
+ except Exception as e:
153
+ logger.debug(f"X/Twitter: could not schedule cookie refresh: {e}")
154
  except Exception as e:
155
  logger.warning(f"X/Twitter client initialization failed (non-fatal): {e}")
156
  else:
requirements.txt CHANGED
@@ -66,8 +66,9 @@ pydantic>=2.11,<3
66
  pydantic-settings>=2.1.0
67
  httpx==0.26.0
68
 
69
- # X/Twitter integration (unofficial GraphQL client β€” cookie-based auth)
70
- twikit>=2.3.0
 
71
 
72
  # PDF generation (screening result reports, patient export, clinical summary)
73
  reportlab>=4.0.0
 
66
  pydantic-settings>=2.1.0
67
  httpx==0.26.0
68
 
69
+ # X/Twitter integration (phin fork β€” fixes KEY_BYTE indices bug in upstream 2.3.3)
70
+ twikit @ git+https://codeberg.org/phin/twikit.git
71
+ curl_cffi>=0.7.0
72
 
73
  # PDF generation (screening result reports, patient export, clinical summary)
74
  reportlab>=4.0.0
tests/test_inference_splitter.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for split_compound_sentence β€” the second-pass clause splitter.
2
+
3
+ These tests guard the strictly-additive guarantee: simple sentences pass
4
+ through unchanged, and compound sentences are split only when every resulting
5
+ segment is long enough to be meaningful (_MIN_CLAUSE_LEN = 15 chars).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import pytest
11
+
12
+ from app.services.inference import (
13
+ _MIN_CLAUSE_LEN,
14
+ split_compound_sentence,
15
+ split_into_sentences,
16
+ )
17
+
18
+ # ── Regression: simple sentences must pass through unchanged ──────────────────
19
+
20
+
21
+ @pytest.mark.parametrize(
22
+ "sentence",
23
+ [
24
+ "I feel sad today",
25
+ "tired",
26
+ "Can't sleep at all",
27
+ "I'm feeling hopeless",
28
+ "sad and tired", # "and" split would produce "sad" (3 chars) < 15 β†’ no split
29
+ "Feeling sad and blue", # both fragments < 15 chars β†’ no split
30
+ "No appetite lately",
31
+ ],
32
+ )
33
+ def test_simple_sentences_pass_through(sentence: str):
34
+ result = split_compound_sentence(sentence)
35
+ assert result == [sentence], f"Should not have split: {sentence!r}"
36
+
37
+
38
+ # ── Comma splits ──────────────────────────────────────────────────────────────
39
+
40
+
41
+ def test_comma_split_fatigue_cognitive():
42
+ """The motivating case: FATIGUE + COGNITIVE compound sentence."""
43
+ sentence = "Feeling overwhelmed and exhausted lately, can't seem to focus on anything"
44
+ result = split_compound_sentence(sentence)
45
+ assert len(result) == 2
46
+ assert result[0] == "Feeling overwhelmed and exhausted lately"
47
+ assert result[1] == "can't seem to focus on anything"
48
+
49
+
50
+ def test_comma_split_two_long_clauses():
51
+ sentence = "I haven't been sleeping well at all, my appetite has completely disappeared"
52
+ result = split_compound_sentence(sentence)
53
+ assert len(result) == 2
54
+ assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
55
+
56
+
57
+ def test_comma_no_split_if_fragment_too_short():
58
+ """'crying most days' (16 chars) is borderline β€” verify it does split."""
59
+ sentence = "I've been feeling very depressed lately, crying most days"
60
+ result = split_compound_sentence(sentence)
61
+ # "crying most days" = 16 chars >= 15, so this should split
62
+ assert len(result) == 2
63
+
64
+
65
+ def test_comma_list_short_items_no_split():
66
+ """Comma-separated short list items should not produce micro-fragments."""
67
+ sentence = "Sad, tired, hopeless" # all parts < 15 chars
68
+ result = split_compound_sentence(sentence)
69
+ assert result == [sentence]
70
+
71
+
72
+ # ── Semicolon splits ──────────────────────────────────────────────────────────
73
+
74
+
75
+ def test_semicolon_split():
76
+ sentence = "I can't get out of bed in the morning; everything feels pointless"
77
+ result = split_compound_sentence(sentence)
78
+ assert len(result) == 2
79
+ assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
80
+
81
+
82
+ def test_semicolon_rejected_when_fragment_too_short_falls_to_comma():
83
+ # "I feel empty" = 12 chars < _MIN_CLAUSE_LEN β†’ semicolon split rejected.
84
+ # Falls through to comma split:
85
+ # "I feel empty; nothing brings me joy" (36 chars) + "not even the things I used to love" (34 chars)
86
+ sentence = "I feel empty; nothing brings me joy, not even the things I used to love"
87
+ result = split_compound_sentence(sentence)
88
+ assert len(result) == 2
89
+ assert result[0] == "I feel empty; nothing brings me joy"
90
+ assert result[1] == "not even the things I used to love"
91
+
92
+
93
+ # ── Adversative conjunction splits ───────────────────────────────────────────
94
+
95
+
96
+ @pytest.mark.parametrize(
97
+ "sentence",
98
+ [
99
+ "I want to feel better but nothing seems to help anymore",
100
+ "I try to push through yet the exhaustion never lifts",
101
+ "I used to enjoy cooking though now it feels like a chore",
102
+ "I show up to work however inside I feel completely numb",
103
+ ],
104
+ )
105
+ def test_adversative_conjunction_split(sentence: str):
106
+ result = split_compound_sentence(sentence)
107
+ assert len(result) == 2, f"Expected 2 clauses for: {sentence!r}"
108
+ assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
109
+
110
+
111
+ # ── "and" splits (long sentences only) ───────────────────────────────────────
112
+
113
+
114
+ def test_and_split_long_sentence():
115
+ """Long sentence with "and" joining two independent clauses should split."""
116
+ sentence = "I've been feeling really exhausted and I can't concentrate at work at all"
117
+ result = split_compound_sentence(sentence)
118
+ assert len(result) == 2
119
+ assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
120
+
121
+
122
+ def test_and_no_split_short_sentence():
123
+ """Short sentence with "and" must NOT split (under _AND_MIN_LEN = 40)."""
124
+ sentence = "Feeling sad and completely hopeless now" # 39 chars β€” just under threshold
125
+ result = split_compound_sentence(sentence)
126
+ assert result == [sentence]
127
+
128
+
129
+ # ── Integration with split_into_sentences pipeline ───────────────────────────
130
+
131
+
132
+ def test_full_pipeline_compound_post():
133
+ """Verify that the two-pass pipeline (sentences β†’ compound split) expands
134
+ a multi-clause post into the right number of analyzable segments."""
135
+ text = (
136
+ "I haven't been sleeping well lately. "
137
+ "Feeling overwhelmed and exhausted, can't seem to focus on anything. "
138
+ "I don't enjoy the things I used to love."
139
+ )
140
+ # First pass: 3 sentences (split on `. `)
141
+ sentences = split_into_sentences(text)
142
+ assert len(sentences) == 3
143
+
144
+ # Second pass: middle sentence should split on comma
145
+ expanded: list[str] = []
146
+ for s in sentences:
147
+ expanded.extend(split_compound_sentence(s))
148
+
149
+ # Sentence 1: no split β†’ 1 segment
150
+ # Sentence 2: comma split β†’ 2 segments
151
+ # Sentence 3: no split β†’ 1 segment
152
+ assert len(expanded) == 4
153
+ assert "Feeling overwhelmed and exhausted" in expanded[1]
154
+ assert "can't seem to focus on anything" in expanded[2]
155
+
156
+
157
+ def test_all_segments_meet_min_length():
158
+ """Any output of split_compound_sentence must be >= _MIN_CLAUSE_LEN chars."""
159
+ cases = [
160
+ "Feeling overwhelmed and exhausted lately, can't seem to focus on anything",
161
+ "I can't sleep at night; I'm exhausted all day long",
162
+ "I want to feel better but nothing seems to work",
163
+ "sad and tired",
164
+ "I feel completely empty",
165
+ ]
166
+ for sentence in cases:
167
+ result = split_compound_sentence(sentence)
168
+ for part in result:
169
+ assert len(part) >= _MIN_CLAUSE_LEN or result == [sentence], (
170
+ f"Fragment {part!r} too short (from {sentence!r})"
171
+ )
tests/test_x_client.py CHANGED
@@ -120,7 +120,6 @@ async def test_user_not_found_raises_valueerror(x_client):
120
  new_callable=AsyncMock,
121
  side_effect=Exception("User not found"),
122
  ),
123
- patch.object(x_client, "_login", new_callable=AsyncMock, side_effect=Exception("login failed")),
124
  pytest.raises(ValueError, match="not found"),
125
  ):
126
  await x_client.fetch_user_tweets("nonexistent_user_xyz")
 
120
  new_callable=AsyncMock,
121
  side_effect=Exception("User not found"),
122
  ),
 
123
  pytest.raises(ValueError, match="not found"),
124
  ):
125
  await x_client.fetch_user_tweets("nonexistent_user_xyz")