deploy: sync code from GitHub main
Browse files- .env.example +4 -0
- .github/workflows/ci.yml +7 -1
- .gitignore +1 -0
- app/core/config.py +4 -0
- app/services/inference.py +56 -0
- app/services/x_client.py +102 -35
- main.py +20 -0
- requirements.txt +3 -2
- tests/test_inference_splitter.py +171 -0
- tests/test_x_client.py +0 -1
.env.example
CHANGED
|
@@ -37,6 +37,10 @@ CORS_ORIGINS=["http://localhost:3000","http://localhost:5173"]
|
|
| 37 |
X_USERNAME=your-x-username
|
| 38 |
X_EMAIL=your-x-email@example.com
|
| 39 |
X_PASSWORD=your-x-password
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# Environment
|
| 42 |
ENVIRONMENT=development
|
|
|
|
| 37 |
X_USERNAME=your-x-username
|
| 38 |
X_EMAIL=your-x-email@example.com
|
| 39 |
X_PASSWORD=your-x-password
|
| 40 |
+
# Base64-encoded JSON cookies fallback. Generate with:
|
| 41 |
+
# python -c "import base64,json; print(base64.b64encode(json.dumps({'auth_token':'...','ct0':'...'}).encode()).decode())"
|
| 42 |
+
# On startup, login() is tried first. If it succeeds, this is ignored.
|
| 43 |
+
X_COOKIES=
|
| 44 |
|
| 45 |
# Environment
|
| 46 |
ENVIRONMENT=development
|
.github/workflows/ci.yml
CHANGED
|
@@ -116,6 +116,12 @@ jobs:
|
|
| 116 |
|
| 117 |
- name: Audit backend requirements
|
| 118 |
# --desc: show a one-line description of each CVE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# --ignore-vuln: deliberately ignored CVEs with justification:
|
| 120 |
# CVE-2026-1839 (transformers) — affects Trainer._load_rng_state,
|
| 121 |
# only reachable when loading malicious rng_state.pth checkpoints
|
|
@@ -129,7 +135,7 @@ jobs:
|
|
| 129 |
# Attack requires local access; no remote exploit path. Fix is in
|
| 130 |
# 2.8.0; re-evaluate when 2.8.0 ships stable.
|
| 131 |
run: |
|
| 132 |
-
pip-audit --requirement requirements.txt --desc
|
| 133 |
--ignore-vuln CVE-2026-1839 \
|
| 134 |
--ignore-vuln CVE-2025-2953 \
|
| 135 |
--ignore-vuln CVE-2025-3730
|
|
|
|
| 116 |
|
| 117 |
- name: Audit backend requirements
|
| 118 |
# --desc: show a one-line description of each CVE
|
| 119 |
+
# No --strict flag: twikit is installed from a Codeberg VCS URL (phin fork
|
| 120 |
+
# that fixes the KEY_BYTE indices bug in upstream 2.3.3). VCS
|
| 121 |
+
# dependencies are not registered on PyPI and cannot be audited by
|
| 122 |
+
# pip-audit. Omitting --strict (pip-audit's default behavior) treats
|
| 123 |
+
# unauditable packages as warnings rather than hard failures, so
|
| 124 |
+
# the scan still covers all 40+ PyPI-sourced dependencies.
|
| 125 |
# --ignore-vuln: deliberately ignored CVEs with justification:
|
| 126 |
# CVE-2026-1839 (transformers) — affects Trainer._load_rng_state,
|
| 127 |
# only reachable when loading malicious rng_state.pth checkpoints
|
|
|
|
| 135 |
# Attack requires local access; no remote exploit path. Fix is in
|
| 136 |
# 2.8.0; re-evaluate when 2.8.0 ships stable.
|
| 137 |
run: |
|
| 138 |
+
pip-audit --requirement requirements.txt --desc \
|
| 139 |
--ignore-vuln CVE-2026-1839 \
|
| 140 |
--ignore-vuln CVE-2025-2953 \
|
| 141 |
--ignore-vuln CVE-2025-3730
|
.gitignore
CHANGED
|
@@ -110,3 +110,4 @@ ml/data/suicide_watch/
|
|
| 110 |
|
| 111 |
# X/Twitter twikit cookie cache (contains session tokens)
|
| 112 |
.x_cookies.json
|
|
|
|
|
|
| 110 |
|
| 111 |
# X/Twitter twikit cookie cache (contains session tokens)
|
| 112 |
.x_cookies.json
|
| 113 |
+
*.tw_session
|
app/core/config.py
CHANGED
|
@@ -57,6 +57,10 @@ class Settings(BaseSettings):
|
|
| 57 |
x_username: str = ""
|
| 58 |
x_email: str = ""
|
| 59 |
x_password: str = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# ── Error monitoring (Sentry) ──────────────────────────────────────────
|
| 62 |
# If unset, Sentry initializes as a no-op (local dev, CI).
|
|
|
|
| 57 |
x_username: str = ""
|
| 58 |
x_email: str = ""
|
| 59 |
x_password: str = ""
|
| 60 |
+
# Base64-encoded JSON cookies: {"auth_token": "...", "ct0": "..."}
|
| 61 |
+
# Fallback when login() is blocked (new accounts). On startup, login()
|
| 62 |
+
# is tried first; if it succeeds, this is ignored and cookies auto-refresh.
|
| 63 |
+
x_cookies: str = ""
|
| 64 |
|
| 65 |
# ── Error monitoring (Sentry) ──────────────────────────────────────────
|
| 66 |
# If unset, Sentry initializes as a no-op (local dev, CI).
|
app/services/inference.py
CHANGED
|
@@ -63,6 +63,55 @@ class SymptomClassifier(nn.Module):
|
|
| 63 |
|
| 64 |
# ── Sentence Splitting ────────────────────────────────────────────────────────
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
def split_into_sentences(text: str) -> list[str]:
|
| 68 |
"""Rule-based sentence splitter for English and Arabic informal text."""
|
|
@@ -409,6 +458,13 @@ class ModelService:
|
|
| 409 |
dsm5_criteria_met=[],
|
| 410 |
)
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
detections: list[SymptomDetection] = []
|
| 413 |
|
| 414 |
for i, sentence in enumerate(sentences):
|
|
|
|
| 63 |
|
| 64 |
# ── Sentence Splitting ────────────────────────────────────────────────────────
|
| 65 |
|
| 66 |
+
# Minimum character length a clause must have after compound splitting.
|
| 67 |
+
# Fragments shorter than this lack enough context for reliable classification.
|
| 68 |
+
_MIN_CLAUSE_LEN = 15
|
| 69 |
+
|
| 70 |
+
_COMMA_RE = re.compile(r",\s+")
|
| 71 |
+
_SEMICOLON_RE = re.compile(r";\s*")
|
| 72 |
+
_ADVERSATIVE_RE = re.compile(
|
| 73 |
+
r"\s+(?:but|yet|however|though|although|whereas|still|while|meanwhile)\s+",
|
| 74 |
+
re.IGNORECASE,
|
| 75 |
+
)
|
| 76 |
+
_AND_RE = re.compile(r"\s+and\s+", re.IGNORECASE)
|
| 77 |
+
# Only attempt "and" splits on sentences longer than this to avoid fragmenting
|
| 78 |
+
# short phrases like "sad and tired" into useless single-word segments.
|
| 79 |
+
_AND_MIN_LEN = 40
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def split_compound_sentence(sentence: str) -> list[str]:
|
| 83 |
+
"""Split a compound sentence into clauses for finer-grained classification.
|
| 84 |
+
|
| 85 |
+
A single compound input ("overwhelmed and exhausted, can't focus") forces
|
| 86 |
+
the model to pick ONE label — the dominant symptom swamps the rest.
|
| 87 |
+
Splitting it lets each clause be classified independently, dramatically
|
| 88 |
+
improving recall for co-occurring symptoms.
|
| 89 |
+
|
| 90 |
+
Only splits when every resulting segment is >= _MIN_CLAUSE_LEN characters;
|
| 91 |
+
shorter fragments lack context and hurt model accuracy. Simple sentences
|
| 92 |
+
pass through unchanged — this is strictly additive (zero regression risk).
|
| 93 |
+
|
| 94 |
+
Split priority (first successful split wins):
|
| 95 |
+
1. Semicolons — strongest clause boundary in informal writing.
|
| 96 |
+
2. Commas — most common mid-sentence clause boundary.
|
| 97 |
+
3. Adversative conjunctions (but / yet / however / though / …).
|
| 98 |
+
4. "and" — only for sentences longer than _AND_MIN_LEN.
|
| 99 |
+
"""
|
| 100 |
+
|
| 101 |
+
def _try(pattern: re.Pattern) -> list[str] | None:
|
| 102 |
+
parts = [p.strip() for p in pattern.split(sentence) if p.strip()]
|
| 103 |
+
if len(parts) > 1 and all(len(p) >= _MIN_CLAUSE_LEN for p in parts):
|
| 104 |
+
return parts
|
| 105 |
+
return None
|
| 106 |
+
|
| 107 |
+
return (
|
| 108 |
+
_try(_SEMICOLON_RE)
|
| 109 |
+
or _try(_COMMA_RE)
|
| 110 |
+
or _try(_ADVERSATIVE_RE)
|
| 111 |
+
or (len(sentence) >= _AND_MIN_LEN and _try(_AND_RE))
|
| 112 |
+
or [sentence]
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
|
| 116 |
def split_into_sentences(text: str) -> list[str]:
|
| 117 |
"""Rule-based sentence splitter for English and Arabic informal text."""
|
|
|
|
| 458 |
dsm5_criteria_met=[],
|
| 459 |
)
|
| 460 |
|
| 461 |
+
# Second pass: split compound sentences into clauses so each symptom
|
| 462 |
+
# gets its own focused input rather than competing inside one sentence.
|
| 463 |
+
expanded: list[str] = []
|
| 464 |
+
for sent in sentences:
|
| 465 |
+
expanded.extend(split_compound_sentence(sent))
|
| 466 |
+
sentences = expanded
|
| 467 |
+
|
| 468 |
detections: list[SymptomDetection] = []
|
| 469 |
|
| 470 |
for i, sentence in enumerate(sentences):
|
app/services/x_client.py
CHANGED
|
@@ -1,64 +1,139 @@
|
|
| 1 |
"""
|
| 2 |
-
X/Twitter integration via twikit.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
|
|
|
|
|
|
| 12 |
import logging
|
| 13 |
import math
|
| 14 |
from datetime import UTC, datetime
|
| 15 |
-
from pathlib import Path
|
| 16 |
|
| 17 |
from twikit import Client, TooManyRequests
|
|
|
|
| 18 |
|
| 19 |
from app.services.ingestion import MENTAL_HEALTH_KEYWORDS, Tweet
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
class XClient:
|
| 28 |
"""Singleton wrapper around twikit for fetching X/Twitter user tweets."""
|
| 29 |
|
| 30 |
-
def __init__(self, username: str, email: str, password: str) -> None:
|
| 31 |
self._username = username
|
| 32 |
self._email = email
|
| 33 |
self._password = password
|
|
|
|
| 34 |
self._client = Client("en-US")
|
| 35 |
self._authenticated = False
|
| 36 |
|
| 37 |
async def initialize(self) -> None:
|
| 38 |
-
"""Authenticate with X. Try
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
try:
|
| 50 |
await self._client.login(
|
| 51 |
auth_info_1=self._username,
|
| 52 |
auth_info_2=self._email,
|
| 53 |
password=self._password,
|
| 54 |
)
|
| 55 |
-
self._client.save_cookies(_COOKIES_PATH)
|
| 56 |
self._authenticated = True
|
| 57 |
-
logger.info("X/Twitter: login
|
|
|
|
| 58 |
except Exception as e:
|
| 59 |
-
logger.
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
async def fetch_user_tweets(
|
| 64 |
self,
|
|
@@ -89,15 +164,7 @@ class XClient:
|
|
| 89 |
minutes = self._rate_limit_minutes(e)
|
| 90 |
raise ValueError(f"X rate limit reached β please try again in {minutes} minutes.") from e
|
| 91 |
except Exception as e:
|
| 92 |
-
|
| 93 |
-
if "auth" in str(e).lower() or "login" in str(e).lower():
|
| 94 |
-
try:
|
| 95 |
-
await self._login()
|
| 96 |
-
user = await self._client.get_user_by_screen_name(username)
|
| 97 |
-
except Exception:
|
| 98 |
-
raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
|
| 99 |
-
else:
|
| 100 |
-
raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
|
| 101 |
|
| 102 |
try:
|
| 103 |
tweets_result = await user.get_tweets("Tweets", count=limit)
|
|
|
|
| 1 |
"""
|
| 2 |
+
X/Twitter integration via twikit (phin fork).
|
| 3 |
|
| 4 |
+
Deployment-friendly auth strategy:
|
| 5 |
+
1. On startup: try login() for fresh cookies (auto-refresh).
|
| 6 |
+
2. If login fails (new account, error 399): fall back to X_COOKIES env var.
|
| 7 |
+
3. Every 12h: background task retries login(). The moment X accepts it
|
| 8 |
+
(account ages past anti-spam gate), cookie refresh becomes automatic.
|
| 9 |
|
| 10 |
+
Cookies are stored in the X_COOKIES env var as base64-encoded JSON,
|
| 11 |
+
not on disk — works in stateless containers (HuggingFace Spaces, Docker).
|
| 12 |
+
|
| 13 |
+
Workaround for twikit User parsing: new X accounts are missing optional
|
| 14 |
+
fields like 'withheld_in_countries'. _SafeDict returns sensible defaults.
|
| 15 |
"""
|
| 16 |
|
| 17 |
+
import base64
|
| 18 |
+
import json
|
| 19 |
import logging
|
| 20 |
import math
|
| 21 |
from datetime import UTC, datetime
|
|
|
|
| 22 |
|
| 23 |
from twikit import Client, TooManyRequests
|
| 24 |
+
from twikit.user import User as TwikitUser
|
| 25 |
|
| 26 |
from app.services.ingestion import MENTAL_HEALTH_KEYWORDS, Tweet
|
| 27 |
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
+
|
| 31 |
+
# ── twikit User parsing workaround ───────────────────────────────────────────
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class _SafeDict(dict):
|
| 35 |
+
"""Dict that returns sensible defaults for missing keys.
|
| 36 |
+
|
| 37 |
+
twikit's User.__init__ accesses optional fields with raw dict[] lookups.
|
| 38 |
+
New X accounts lack fields like 'pinned_tweet_ids_str',
|
| 39 |
+
'withheld_in_countries'. This prevents KeyError crashes.
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
def __missing__(self, key: str):
|
| 43 |
+
if any(key.endswith(s) for s in ("_str", "_ids", "urls", "countries")):
|
| 44 |
+
return []
|
| 45 |
+
if key.endswith("_count") or key.endswith("_int"):
|
| 46 |
+
return 0
|
| 47 |
+
if key.endswith("_url") or key.endswith("_https"):
|
| 48 |
+
return ""
|
| 49 |
+
if key in ("entities", "description", "url"):
|
| 50 |
+
return _SafeDict()
|
| 51 |
+
return None
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _safe_wrap(data):
|
| 55 |
+
"""Recursively wrap dicts in _SafeDict."""
|
| 56 |
+
if isinstance(data, dict):
|
| 57 |
+
return _SafeDict({k: _safe_wrap(v) for k, v in data.items()})
|
| 58 |
+
if isinstance(data, list):
|
| 59 |
+
return [_safe_wrap(i) for i in data]
|
| 60 |
+
return data
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
_original_user_init = TwikitUser.__init__
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _patched_user_init(self, client, data, **kwargs):
|
| 67 |
+
if "legacy" in data:
|
| 68 |
+
data["legacy"] = _safe_wrap(data["legacy"])
|
| 69 |
+
_original_user_init(self, client, data, **kwargs)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
TwikitUser.__init__ = _patched_user_init
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ── XClient ──────────────────────────────────────────────────────────────────
|
| 76 |
|
| 77 |
|
| 78 |
class XClient:
|
| 79 |
"""Singleton wrapper around twikit for fetching X/Twitter user tweets."""
|
| 80 |
|
| 81 |
+
def __init__(self, username: str, email: str, password: str, cookies_b64: str = "") -> None:
|
| 82 |
self._username = username
|
| 83 |
self._email = email
|
| 84 |
self._password = password
|
| 85 |
+
self._cookies_b64 = cookies_b64
|
| 86 |
self._client = Client("en-US")
|
| 87 |
self._authenticated = False
|
| 88 |
|
| 89 |
async def initialize(self) -> None:
|
| 90 |
+
"""Authenticate with X. Try login() first, fall back to env var cookies."""
|
| 91 |
+
# Try login for fresh cookies (auto-refresh path)
|
| 92 |
+
if await self._try_login():
|
| 93 |
+
return
|
| 94 |
+
|
| 95 |
+
# Fall back to stored cookies from X_COOKIES env var
|
| 96 |
+
if self._cookies_b64:
|
| 97 |
+
try:
|
| 98 |
+
cookie_json = base64.b64decode(self._cookies_b64).decode()
|
| 99 |
+
cookies = json.loads(cookie_json)
|
| 100 |
+
self._client.set_cookies(cookies)
|
| 101 |
+
self._authenticated = True
|
| 102 |
+
logger.info("X/Twitter: loaded cookies from X_COOKIES env var")
|
| 103 |
+
return
|
| 104 |
+
except Exception as e:
|
| 105 |
+
logger.error(f"X/Twitter: failed to parse X_COOKIES: {e}")
|
| 106 |
+
|
| 107 |
+
raise ValueError(
|
| 108 |
+
"X/Twitter: could not authenticate. login() failed and no valid X_COOKIES provided. "
|
| 109 |
+
'Set X_COOKIES env var with base64-encoded JSON: {"auth_token": "...", "ct0": "..."}'
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
async def _try_login(self) -> bool:
|
| 113 |
+
"""Attempt login(). Returns True on success, False on failure (non-fatal)."""
|
| 114 |
try:
|
| 115 |
await self._client.login(
|
| 116 |
auth_info_1=self._username,
|
| 117 |
auth_info_2=self._email,
|
| 118 |
password=self._password,
|
| 119 |
)
|
|
|
|
| 120 |
self._authenticated = True
|
| 121 |
+
logger.info("X/Twitter: login() succeeded β cookies auto-refreshed")
|
| 122 |
+
return True
|
| 123 |
except Exception as e:
|
| 124 |
+
logger.info(f"X/Twitter: login() failed (will try stored cookies): {e}")
|
| 125 |
+
return False
|
| 126 |
+
|
| 127 |
+
async def refresh_cookies(self) -> None:
|
| 128 |
+
"""Background task: retry login() to auto-refresh cookies.
|
| 129 |
+
|
| 130 |
+
Called by APScheduler every 12 hours. When the account ages past
|
| 131 |
+
X's anti-spam gate, this starts succeeding automatically.
|
| 132 |
+
"""
|
| 133 |
+
if await self._try_login():
|
| 134 |
+
logger.info("X/Twitter: background cookie refresh succeeded")
|
| 135 |
+
else:
|
| 136 |
+
logger.debug("X/Twitter: background cookie refresh failed (not critical)")
|
| 137 |
|
| 138 |
async def fetch_user_tweets(
|
| 139 |
self,
|
|
|
|
| 164 |
minutes = self._rate_limit_minutes(e)
|
| 165 |
raise ValueError(f"X rate limit reached β please try again in {minutes} minutes.") from e
|
| 166 |
except Exception as e:
|
| 167 |
+
raise ValueError(f"X/Twitter user '@{username}' not found or profile is private.") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
try:
|
| 170 |
tweets_result = await user.get_tweets("Tweets", count=limit)
|
main.py
CHANGED
|
@@ -127,10 +127,30 @@ async def lifespan(app: FastAPI):
|
|
| 127 |
username=settings.x_username,
|
| 128 |
email=settings.x_email,
|
| 129 |
password=settings.x_password,
|
|
|
|
| 130 |
)
|
| 131 |
await x_client.initialize()
|
| 132 |
set_x_client(x_client)
|
| 133 |
logger.info("X/Twitter client initialized")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
except Exception as e:
|
| 135 |
logger.warning(f"X/Twitter client initialization failed (non-fatal): {e}")
|
| 136 |
else:
|
|
|
|
| 127 |
username=settings.x_username,
|
| 128 |
email=settings.x_email,
|
| 129 |
password=settings.x_password,
|
| 130 |
+
cookies_b64=settings.x_cookies,
|
| 131 |
)
|
| 132 |
await x_client.initialize()
|
| 133 |
set_x_client(x_client)
|
| 134 |
logger.info("X/Twitter client initialized")
|
| 135 |
+
|
| 136 |
+
# Schedule background cookie refresh every 12 hours.
|
| 137 |
+
# When the account ages past X's anti-spam gate, login()
|
| 138 |
+
# starts succeeding and cookies auto-refresh.
|
| 139 |
+
try:
|
| 140 |
+
from app.services.scheduler import get_scheduler
|
| 141 |
+
|
| 142 |
+
scheduler = get_scheduler()
|
| 143 |
+
if scheduler and scheduler.running:
|
| 144 |
+
scheduler.add_job(
|
| 145 |
+
x_client.refresh_cookies,
|
| 146 |
+
"interval",
|
| 147 |
+
hours=12,
|
| 148 |
+
id="x_cookie_refresh",
|
| 149 |
+
replace_existing=True,
|
| 150 |
+
)
|
| 151 |
+
logger.info("X/Twitter: scheduled 12h cookie refresh")
|
| 152 |
+
except Exception as e:
|
| 153 |
+
logger.debug(f"X/Twitter: could not schedule cookie refresh: {e}")
|
| 154 |
except Exception as e:
|
| 155 |
logger.warning(f"X/Twitter client initialization failed (non-fatal): {e}")
|
| 156 |
else:
|
requirements.txt
CHANGED
|
@@ -66,8 +66,9 @@ pydantic>=2.11,<3
|
|
| 66 |
pydantic-settings>=2.1.0
|
| 67 |
httpx==0.26.0
|
| 68 |
|
| 69 |
-
# X/Twitter integration (
|
| 70 |
-
twikit
|
|
|
|
| 71 |
|
| 72 |
# PDF generation (screening result reports, patient export, clinical summary)
|
| 73 |
reportlab>=4.0.0
|
|
|
|
| 66 |
pydantic-settings>=2.1.0
|
| 67 |
httpx==0.26.0
|
| 68 |
|
| 69 |
+
# X/Twitter integration (phin fork — fixes KEY_BYTE indices bug in upstream 2.3.3)
|
| 70 |
+
twikit @ git+https://codeberg.org/phin/twikit.git
|
| 71 |
+
curl_cffi>=0.7.0
|
| 72 |
|
| 73 |
# PDF generation (screening result reports, patient export, clinical summary)
|
| 74 |
reportlab>=4.0.0
|
tests/test_inference_splitter.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for split_compound_sentence β the second-pass clause splitter.
|
| 2 |
+
|
| 3 |
+
These tests guard the strictly-additive guarantee: simple sentences pass
|
| 4 |
+
through unchanged, and compound sentences are split only when every resulting
|
| 5 |
+
segment is long enough to be meaningful (_MIN_CLAUSE_LEN = 15 chars).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
from app.services.inference import (
|
| 13 |
+
_MIN_CLAUSE_LEN,
|
| 14 |
+
split_compound_sentence,
|
| 15 |
+
split_into_sentences,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# ── Regression: simple sentences must pass through unchanged ──────────────────
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@pytest.mark.parametrize(
|
| 22 |
+
"sentence",
|
| 23 |
+
[
|
| 24 |
+
"I feel sad today",
|
| 25 |
+
"tired",
|
| 26 |
+
"Can't sleep at all",
|
| 27 |
+
"I'm feeling hopeless",
|
| 28 |
+
"sad and tired", # "and" split would produce "sad" (3 chars) < 15 β no split
|
| 29 |
+
"Feeling sad and blue", # both fragments < 15 chars β no split
|
| 30 |
+
"No appetite lately",
|
| 31 |
+
],
|
| 32 |
+
)
|
| 33 |
+
def test_simple_sentences_pass_through(sentence: str):
|
| 34 |
+
result = split_compound_sentence(sentence)
|
| 35 |
+
assert result == [sentence], f"Should not have split: {sentence!r}"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ── Comma splits ──────────────────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_comma_split_fatigue_cognitive():
|
| 42 |
+
"""The motivating case: FATIGUE + COGNITIVE compound sentence."""
|
| 43 |
+
sentence = "Feeling overwhelmed and exhausted lately, can't seem to focus on anything"
|
| 44 |
+
result = split_compound_sentence(sentence)
|
| 45 |
+
assert len(result) == 2
|
| 46 |
+
assert result[0] == "Feeling overwhelmed and exhausted lately"
|
| 47 |
+
assert result[1] == "can't seem to focus on anything"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_comma_split_two_long_clauses():
|
| 51 |
+
sentence = "I haven't been sleeping well at all, my appetite has completely disappeared"
|
| 52 |
+
result = split_compound_sentence(sentence)
|
| 53 |
+
assert len(result) == 2
|
| 54 |
+
assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def test_comma_no_split_if_fragment_too_short():
|
| 58 |
+
"""'crying most days' (16 chars) is borderline β verify it does split."""
|
| 59 |
+
sentence = "I've been feeling very depressed lately, crying most days"
|
| 60 |
+
result = split_compound_sentence(sentence)
|
| 61 |
+
# "crying most days" = 16 chars >= 15, so this should split
|
| 62 |
+
assert len(result) == 2
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_comma_list_short_items_no_split():
|
| 66 |
+
"""Comma-separated short list items should not produce micro-fragments."""
|
| 67 |
+
sentence = "Sad, tired, hopeless" # all parts < 15 chars
|
| 68 |
+
result = split_compound_sentence(sentence)
|
| 69 |
+
assert result == [sentence]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ── Semicolon splits ──────────────────────────────────────────────────────────
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_semicolon_split():
|
| 76 |
+
sentence = "I can't get out of bed in the morning; everything feels pointless"
|
| 77 |
+
result = split_compound_sentence(sentence)
|
| 78 |
+
assert len(result) == 2
|
| 79 |
+
assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def test_semicolon_rejected_when_fragment_too_short_falls_to_comma():
|
| 83 |
+
# "I feel empty" = 12 chars < _MIN_CLAUSE_LEN → semicolon split rejected.
|
| 84 |
+
# Falls through to comma split:
|
| 85 |
+
# "I feel empty; nothing brings me joy" (35 chars) + "not even the things I used to love" (34 chars)
|
| 86 |
+
sentence = "I feel empty; nothing brings me joy, not even the things I used to love"
|
| 87 |
+
result = split_compound_sentence(sentence)
|
| 88 |
+
assert len(result) == 2
|
| 89 |
+
assert result[0] == "I feel empty; nothing brings me joy"
|
| 90 |
+
assert result[1] == "not even the things I used to love"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ── Adversative conjunction splits ───────────────────────────────────────────
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@pytest.mark.parametrize(
|
| 97 |
+
"sentence",
|
| 98 |
+
[
|
| 99 |
+
"I want to feel better but nothing seems to help anymore",
|
| 100 |
+
"I try to push through yet the exhaustion never lifts",
|
| 101 |
+
"I used to enjoy cooking though now it feels like a chore",
|
| 102 |
+
"I show up to work however inside I feel completely numb",
|
| 103 |
+
],
|
| 104 |
+
)
|
| 105 |
+
def test_adversative_conjunction_split(sentence: str):
|
| 106 |
+
result = split_compound_sentence(sentence)
|
| 107 |
+
assert len(result) == 2, f"Expected 2 clauses for: {sentence!r}"
|
| 108 |
+
assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ── "and" splits (long sentences only) ───────────────────────────────────────
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_and_split_long_sentence():
|
| 115 |
+
"""Long sentence with "and" joining two independent clauses should split."""
|
| 116 |
+
sentence = "I've been feeling really exhausted and I can't concentrate at work at all"
|
| 117 |
+
result = split_compound_sentence(sentence)
|
| 118 |
+
assert len(result) == 2
|
| 119 |
+
assert all(len(p) >= _MIN_CLAUSE_LEN for p in result)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def test_and_no_split_short_sentence():
|
| 123 |
+
"""Short sentence with "and" must NOT split (under _AND_MIN_LEN = 40)."""
|
| 124 |
+
sentence = "Feeling sad and completely hopeless now" # 39 chars — just under threshold
|
| 125 |
+
result = split_compound_sentence(sentence)
|
| 126 |
+
assert result == [sentence]
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# ── Integration with split_into_sentences pipeline ───────────────────────────
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def test_full_pipeline_compound_post():
|
| 133 |
+
"""Verify that the two-pass pipeline (sentences β compound split) expands
|
| 134 |
+
a multi-clause post into the right number of analyzable segments."""
|
| 135 |
+
text = (
|
| 136 |
+
"I haven't been sleeping well lately. "
|
| 137 |
+
"Feeling overwhelmed and exhausted, can't seem to focus on anything. "
|
| 138 |
+
"I don't enjoy the things I used to love."
|
| 139 |
+
)
|
| 140 |
+
# First pass: 3 sentences (split on `. `)
|
| 141 |
+
sentences = split_into_sentences(text)
|
| 142 |
+
assert len(sentences) == 3
|
| 143 |
+
|
| 144 |
+
# Second pass: middle sentence should split on comma
|
| 145 |
+
expanded: list[str] = []
|
| 146 |
+
for s in sentences:
|
| 147 |
+
expanded.extend(split_compound_sentence(s))
|
| 148 |
+
|
| 149 |
+
# Sentence 1: no split → 1 segment
|
| 150 |
+
# Sentence 2: comma split → 2 segments
|
| 151 |
+
# Sentence 3: no split → 1 segment
|
| 152 |
+
assert len(expanded) == 4
|
| 153 |
+
assert "Feeling overwhelmed and exhausted" in expanded[1]
|
| 154 |
+
assert "can't seem to focus on anything" in expanded[2]
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def test_all_segments_meet_min_length():
|
| 158 |
+
"""Any output of split_compound_sentence must be >= _MIN_CLAUSE_LEN chars."""
|
| 159 |
+
cases = [
|
| 160 |
+
"Feeling overwhelmed and exhausted lately, can't seem to focus on anything",
|
| 161 |
+
"I can't sleep at night; I'm exhausted all day long",
|
| 162 |
+
"I want to feel better but nothing seems to work",
|
| 163 |
+
"sad and tired",
|
| 164 |
+
"I feel completely empty",
|
| 165 |
+
]
|
| 166 |
+
for sentence in cases:
|
| 167 |
+
result = split_compound_sentence(sentence)
|
| 168 |
+
for part in result:
|
| 169 |
+
assert len(part) >= _MIN_CLAUSE_LEN or result == [sentence], (
|
| 170 |
+
f"Fragment {part!r} too short (from {sentence!r})"
|
| 171 |
+
)
|
tests/test_x_client.py
CHANGED
|
@@ -120,7 +120,6 @@ async def test_user_not_found_raises_valueerror(x_client):
|
|
| 120 |
new_callable=AsyncMock,
|
| 121 |
side_effect=Exception("User not found"),
|
| 122 |
),
|
| 123 |
-
patch.object(x_client, "_login", new_callable=AsyncMock, side_effect=Exception("login failed")),
|
| 124 |
pytest.raises(ValueError, match="not found"),
|
| 125 |
):
|
| 126 |
await x_client.fetch_user_tweets("nonexistent_user_xyz")
|
|
|
|
| 120 |
new_callable=AsyncMock,
|
| 121 |
side_effect=Exception("User not found"),
|
| 122 |
),
|
|
|
|
| 123 |
pytest.raises(ValueError, match="not found"),
|
| 124 |
):
|
| 125 |
await x_client.fetch_user_tweets("nonexistent_user_xyz")
|