# itelmen-asr-fold1 / audio_quality_analyzer.py
# sut0's picture
# Update: Add quality analyzer module
# 5a97c59 verified
"""
音声品質分析モジュール
K-fold CERと実機CERの乖離を理解するための分析ツール
- 音声品質スコア
- 予測CER範囲
- モデルの弱点音素の検出
"""
import numpy as np
import librosa
class AudioQualityAnalyzer:
    """Heuristic audio-quality analyzer used to explain CER differences.

    The analyzer computes simple signal statistics (duration, an SNR
    estimate, RMS level, silence ratio, clipping ratio, spectral flatness),
    folds them into a 0-100 quality score, maps that score to an expected
    CER range, and can render the result as an HTML report.
    """

    # Known weak spots of the model (from the Phase 7-4 analysis).
    DIFFICULT_PHONEMES = {
        'ʔ': 'glottal_stop',
        'ʲ': 'palatalization',
        'ɬ': 'lateral_fricative',
        'χ': 'uvular_fricative',
        'x': 'velar_fricative',
        'q': 'uvular_stop',
    }

    # Training-data statistics (Phase 7-4, fold 1).
    TRAINING_STATS = {
        'mean_duration': 3.0,   # mean duration: 3 s
        'max_duration': 15.0,   # max duration: 15 s
        'mean_snr': 25.0,       # mean SNR: 25 dB
        'min_snr': 15.0,        # min SNR: 15 dB
        'silence_ratio': 0.1,   # silence ratio: 10%
    }

    # Empirical CER prediction model: baseline CER (in percent) of fold 1.
    CER_BASELINE = 5.23

    def __init__(self):
        pass

    def analyze_audio_quality(self, audio_data, sample_rate):
        """Analyze the quality of an audio signal.

        Args:
            audio_data: Audio samples (numpy array or any array-like). The
                volume, silence and clipping thresholds used here (0.01,
                0.02, 0.9, 0.99) only make sense for amplitudes normalised
                to roughly [-1, 1].
            sample_rate: Sampling rate in Hz.

        Returns:
            dict: ``duration``, ``snr``, ``rms``, ``silence_ratio``,
            ``clipping_ratio``, ``spectral_flatness``, ``quality_score`` and
            the boolean ``*_warning`` flags, all as plain Python types.

        Raises:
            ValueError: If ``audio_data`` is empty or ``sample_rate`` is not
                positive.
        """
        # Work in float64 so that squaring integer PCM data cannot overflow
        # and plain lists are accepted as well.
        samples = np.asarray(audio_data, dtype=np.float64)
        if samples.size == 0:
            raise ValueError("audio_data must contain at least one sample")
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive")

        results = {}

        # 1. Duration
        duration = len(samples) / sample_rate
        results['duration'] = duration
        results['duration_warning'] = duration > self.TRAINING_STATS['max_duration']

        # 2. SNR (signal-to-noise ratio) estimate
        snr = self._estimate_snr(samples)
        results['snr'] = snr
        results['snr_warning'] = snr < self.TRAINING_STATS['min_snr']

        # 3. Volume level
        rms = float(np.sqrt(np.mean(samples ** 2)))
        results['rms'] = rms
        results['volume_warning'] = rms < 0.01 or rms > 0.9

        # 4. Ratio of silent samples
        silence_ratio = self._calculate_silence_ratio(samples)
        results['silence_ratio'] = silence_ratio
        results['silence_warning'] = silence_ratio > 0.3

        # 5. Clipping detection
        clipping_ratio = float(np.sum(np.abs(samples) > 0.99) / len(samples))
        results['clipping_ratio'] = clipping_ratio
        results['clipping_warning'] = clipping_ratio > 0.01

        # 6. Frequency characteristics (spectral flatness)
        results['spectral_flatness'] = self._calculate_spectral_flatness(
            samples, sample_rate
        )

        # 7. Overall quality score (0-100)
        results['quality_score'] = self._calculate_quality_score(results)
        return results

    def estimate_expected_cer(self, audio_quality, transcription=None):
        """Estimate the expected CER for an analyzed recording.

        Args:
            audio_quality: Result of ``analyze_audio_quality()``.
            transcription: Optional transcription text, used for the
                difficult-phoneme penalty.

        Returns:
            dict: ``predicted_cer`` (percent), ``cer_range`` (low, high),
            ``confidence`` label, ``quality_score`` and the list of
            contributing ``factors``.
        """
        base_cer = self.CER_BASELINE  # 5.23%

        # CER multiplier derived from the quality score.
        quality_score = audio_quality['quality_score']
        if quality_score >= 80:
            cer_multiplier = 1.0   # 5.23%
            confidence = "高"
        elif quality_score >= 60:
            cer_multiplier = 1.5   # 7.85%
            confidence = "中"
        elif quality_score >= 40:
            cer_multiplier = 2.5   # 13.08%
            confidence = "低"
        else:
            cer_multiplier = 4.0   # 20.92%
            confidence = "非常に低"

        # Duration penalty. The larger threshold must be tested first,
        # otherwise the > 20 s branch can never be reached.
        duration = audio_quality['duration']
        if duration > 20:
            cer_multiplier *= 1.6
        elif duration > 10:
            cer_multiplier *= 1.3  # errors accumulate over long audio

        # Difficult-phoneme penalty.
        if transcription:
            difficult_ratio = self._calculate_difficult_phoneme_ratio(transcription)
            if difficult_ratio > 0.1:
                cer_multiplier *= (1 + difficult_ratio)

        predicted_cer = base_cer * cer_multiplier
        predicted_cer_range = (
            max(base_cer, predicted_cer * 0.7),
            min(50.0, predicted_cer * 1.3),
        )

        return {
            'predicted_cer': predicted_cer,
            'cer_range': predicted_cer_range,
            'confidence': confidence,
            'quality_score': quality_score,
            'factors': self._identify_cer_factors(audio_quality, transcription),
        }

    def _estimate_snr(self, audio_data):
        """Estimate the SNR in dB (simplified, energy based).

        The 75th percentile of the per-sample energy is taken as the signal
        level and the 25th percentile as the noise level. When the noise
        level is zero, 60.0 dB is returned.
        """
        energies = np.square(np.asarray(audio_data, dtype=np.float64))
        signal_energy = np.percentile(energies, 75)
        noise_energy = np.percentile(energies, 25)
        if noise_energy > 0:
            return float(10 * np.log10(signal_energy / noise_energy))
        return 60.0  # practically no noise

    def _calculate_silence_ratio(self, audio_data, threshold=0.02):
        """Return the fraction of samples whose magnitude is below ``threshold``."""
        silence_samples = np.sum(np.abs(audio_data) < threshold)
        return float(silence_samples / len(audio_data))

    def _calculate_spectral_flatness(self, audio_data, sample_rate):
        """Return the mean spectral flatness (an indicator of noisiness).

        ``sample_rate`` is accepted for interface symmetry but is not used
        by the computation. If the computation fails for any reason the
        neutral default 0.5 is returned.
        """
        try:
            spectral_flatness = librosa.feature.spectral_flatness(y=audio_data)[0]
            return float(np.mean(spectral_flatness))
        except Exception:
            # Best-effort metric: fall back to the default value, but do not
            # swallow KeyboardInterrupt / SystemExit as a bare except would.
            return 0.5

    def _calculate_quality_score(self, results):
        """Compute the overall quality score (0-100) from the analysis results."""
        score = 100.0

        # SNR penalty
        if results['snr'] < 15:
            score -= 30
        elif results['snr'] < 20:
            score -= 15

        # Volume penalty
        if results['volume_warning']:
            score -= 20

        # Clipping penalty
        if results['clipping_warning']:
            score -= 25

        # Silence penalty
        if results['silence_ratio'] > 0.3:
            score -= 15

        # Duration penalty. The larger threshold must be tested first,
        # otherwise the > 30 s branch can never be reached.
        if results['duration'] > 30:
            score -= 20
        elif results['duration'] > 20:
            score -= 10

        return max(0.0, score)

    def _calculate_difficult_phoneme_ratio(self, transcription):
        """Return the share of characters that are known difficult phonemes."""
        if not transcription:
            return 0.0
        difficult_count = sum(
            transcription.count(phoneme) for phoneme in self.DIFFICULT_PHONEMES
        )
        return difficult_count / len(transcription)

    def _identify_cer_factors(self, audio_quality, transcription):
        """Collect the factors that are expected to increase the CER.

        Returns:
            list[dict]: One entry per triggered warning, each with ``type``,
            ``name``, ``impact`` and ``description`` keys.
        """
        factors = []

        # Audio-quality factors
        if audio_quality['snr_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': 'ノイズが多い',
                'impact': '高',
                'description': f"SNR {audio_quality['snr']:.1f}dB(推奨: >15dB)",
            })
        if audio_quality['volume_warning']:
            # NOTE(review): the range quoted in this message (0.1-0.7) differs
            # from the 0.01 / 0.9 thresholds that raise the warning — confirm
            # which one is intended.
            factors.append({
                'type': 'audio_quality',
                'name': '音量が不適切',
                'impact': '中',
                'description': f"RMS {audio_quality['rms']:.3f}(推奨: 0.1-0.7)",
            })
        if audio_quality['clipping_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': 'クリッピング検出',
                'impact': '高',
                'description': f"{audio_quality['clipping_ratio']*100:.1f}%のサンプルが飽和",
            })
        if audio_quality['silence_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': '無音区間が多い',
                'impact': '低',
                'description': f"無音: {audio_quality['silence_ratio']*100:.1f}%",
            })
        if audio_quality['duration_warning']:
            factors.append({
                'type': 'duration',
                'name': '音声が長い',
                'impact': '中',
                'description': f"{audio_quality['duration']:.1f}秒(推奨: <15秒)",
            })

        # Phoneme factors
        if transcription:
            difficult_ratio = self._calculate_difficult_phoneme_ratio(transcription)
            if difficult_ratio > 0.1:
                factors.append({
                    'type': 'phoneme',
                    'name': '困難な音素が多い',
                    'impact': '高',
                    'description': f"困難音素: {difficult_ratio*100:.1f}%(ʔ, ʲ, ɬ, χなど)",
                })

        return factors

    def generate_quality_report_html(self, audio_quality, cer_prediction, actual_cer=None):
        """Render the quality analysis and CER prediction as an HTML fragment.

        Args:
            audio_quality: Result of ``analyze_audio_quality()``.
            cer_prediction: Result of ``estimate_expected_cer()``.
            actual_cer: Optional measured CER. It is multiplied by 100 before
                being compared with the predicted range, i.e. it is treated
                as a fraction rather than a percentage.

        Returns:
            str: HTML
        """
        quality_score = audio_quality['quality_score']

        # Colour coding of the quality score.
        if quality_score >= 80:
            score_class, score_emoji = 'quality-excellent', '🟢'
        elif quality_score >= 60:
            score_class, score_emoji = 'quality-good', '🟡'
        elif quality_score >= 40:
            score_class, score_emoji = 'quality-fair', '🟠'
        else:
            score_class, score_emoji = 'quality-poor', '🔴'

        snr_mark = '⚠️ 低い' if audio_quality['snr_warning'] else '✓'
        duration_mark = '⚠️ 長い' if audio_quality['duration_warning'] else '✓'
        silence_mark = '⚠️ 多い' if audio_quality['silence_warning'] else '✓'
        clipping_mark = '⚠️ 検出' if audio_quality['clipping_warning'] else '✓'

        cer_low = cer_prediction['cer_range'][0]
        cer_high = cer_prediction['cer_range'][1]

        # Fragments are collected in a list and joined once at the end.
        parts = [f'''
        <div class="quality-report">
        <div class="quality-header">
        <h3>📊 音声品質分析</h3>
        <div class="quality-score-box {score_class}">
        {score_emoji} 品質スコア: <strong>{quality_score:.0f}/100</strong>
        </div>
        </div>
        <div class="quality-details">
        <div class="quality-item">
        <span class="quality-label">SNR:</span>
        <span class="quality-value">{audio_quality['snr']:.1f} dB</span>
        {snr_mark}
        </div>
        <div class="quality-item">
        <span class="quality-label">音声長:</span>
        <span class="quality-value">{audio_quality['duration']:.1f} 秒</span>
        {duration_mark}
        </div>
        <div class="quality-item">
        <span class="quality-label">無音割合:</span>
        <span class="quality-value">{audio_quality['silence_ratio']*100:.1f}%</span>
        {silence_mark}
        </div>
        <div class="quality-item">
        <span class="quality-label">クリッピング:</span>
        <span class="quality-value">{audio_quality['clipping_ratio']*100:.2f}%</span>
        {clipping_mark}
        </div>
        </div>
        <div class="cer-prediction">
        <h4>🎯 予測CER範囲</h4>
        <div class="predicted-cer">
        <strong>{cer_low:.1f}% - {cer_high:.1f}%</strong>
        (中央値: {cer_prediction['predicted_cer']:.1f}%)
        </div>
        <div class="confidence">
        信頼度: <strong>{cer_prediction['confidence']}</strong>
        </div>
        ''']

        # Comparison with the actual CER.
        if actual_cer is not None:
            actual_cer_pct = actual_cer * 100
            in_range = cer_low <= actual_cer_pct <= cer_high
            range_mark = ' ✓ 予測範囲内' if in_range else ' ⚠️ 予測範囲外'
            parts.append(f'''
        <div class="actual-cer-comparison">
        <div class="actual-cer">
        実際のCER: <strong>{actual_cer_pct:.2f}%</strong>
        {range_mark}
        </div>
        ''')
            if not in_range:
                if actual_cer_pct < cer_low:
                    parts.append('<div class="analysis-note">📈 予測より良好!この音声は訓練データに近い可能性があります。</div>')
                else:
                    parts.append('<div class="analysis-note">📉 予測より悪化。下記の要因を確認してください。</div>')
            parts.append('</div>')  # actual-cer-comparison
        parts.append('</div>')  # cer-prediction

        # Factors that increase the CER.
        if cer_prediction['factors']:
            parts.append('<div class="cer-factors"><h4>⚠️ CER増加要因</h4><ul>')
            impact_emojis = {'高': '🔴', '中': '🟠', '低': '🟡'}
            for factor in cer_prediction['factors']:
                impact_emoji = impact_emojis.get(factor['impact'], '⚪')
                parts.append(f'''
        <li class="factor-item">
        <span class="factor-name">{impact_emoji} {factor['name']}</span>
        <span class="factor-impact">影響度: {factor['impact']}</span>
        <div class="factor-description">{factor['description']}</div>
        </li>
        ''')
            parts.append('</ul></div>')

        # Recommendations.
        parts.append('<div class="recommendations"><h4>💡 推奨事項</h4><ul>')
        if audio_quality['snr_warning']:
            parts.append('<li>ノイズ除去処理を試してください</li>')
        if audio_quality['duration_warning']:
            parts.append('<li>音声を短い区間(<15秒)に分割することを検討してください</li>')
        if audio_quality['volume_warning']:
            parts.append('<li>音声の音量を調整してください(正規化)</li>')
        if audio_quality['clipping_warning']:
            parts.append('<li>クリッピングのない音源を使用してください</li>')
        if not cer_prediction['factors']:
            parts.append('<li>✓ 音声品質は良好です!</li>')
        parts.append('</ul></div>')

        parts.append('</div>')  # quality-report
        return ''.join(parts)