"""
Audio quality analysis module.

Analysis tools for understanding the gap between the K-fold CER and the CER
observed on real-world (on-device) recordings:
- audio quality score
- predicted CER range
- detection of the model's weak phonemes
"""
| import numpy as np | |
| import librosa | |
class AudioQualityAnalyzer:
    """Heuristic audio-quality analyzer used to explain CER differences.

    The analyzer computes simple signal statistics for a clip (duration,
    estimated SNR, RMS level, silence ratio, clipping ratio, spectral
    flatness), folds them into a 0-100 quality score, maps that score to an
    expected CER range and renders the outcome as an HTML fragment.
    """

    # Known weak points of the model (from the Phase 7-4 analysis).
    DIFFICULT_PHONEMES = {
        'ʔ': 'glottal_stop',        # glottal stop
        'ʲ': 'palatalization',      # palatalization
        'ɬ': 'lateral_fricative',   # lateral fricative
        'χ': 'uvular_fricative',    # uvular fricative
        'x': 'velar_fricative',     # velar fricative
        'q': 'uvular_stop',         # uvular stop
    }

    # Training-data statistics (Phase 7-4, Fold 1).
    TRAINING_STATS = {
        'mean_duration': 3.0,   # mean duration: 3 s
        'max_duration': 15.0,   # max duration: 15 s
        'mean_snr': 25.0,       # mean SNR: 25 dB
        'min_snr': 15.0,        # min SNR: 15 dB
        'silence_ratio': 0.1,   # silence ratio: 10%
    }

    # Empirical CER prediction model: baseline CER (in percent) of Fold 1.
    CER_BASELINE = 5.23

    def __init__(self):
        """Create an analyzer; it holds no per-instance state."""
        pass

    @staticmethod
    def _as_float_array(audio_data):
        """Return ``audio_data`` as a floating-point NumPy array.

        Floating-point input keeps its dtype. Any other dtype (e.g. int16
        PCM) is widened to float64 so that squaring samples cannot overflow.
        """
        samples = np.asarray(audio_data)
        if not np.issubdtype(samples.dtype, np.floating):
            samples = samples.astype(np.float64)
        return samples

    def analyze_audio_quality(self, audio_data, sample_rate):
        """Analyze the quality of an audio clip.

        Args:
            audio_data: Audio samples (NumPy array or array-like).
            sample_rate: Sampling rate used to convert the sample count
                into a duration.

        Returns:
            dict: ``duration``, ``snr``, ``rms``, ``silence_ratio``,
            ``clipping_ratio``, ``spectral_flatness`` and ``quality_score``
            plus the boolean ``*_warning`` flags. Values are plain Python
            ``float``/``bool`` objects.

        An empty clip yields duration 0, RMS 0, clipping ratio 0, silence
        ratio 1 and SNR 0 instead of NaN values.
        """
        # NOTE(review): the fixed thresholds below (0.01, 0.9, 0.99) look like
        # they assume samples normalised to [-1, 1]; integer PCM is converted
        # to float but not rescaled here - confirm against callers.
        samples = self._as_float_array(audio_data)
        n_samples = len(samples)
        results = {}

        # 1. Duration
        duration = n_samples / sample_rate
        results['duration'] = duration
        results['duration_warning'] = bool(
            duration > self.TRAINING_STATS['max_duration']
        )

        # 2. SNR (signal-to-noise ratio) estimate
        snr = self._estimate_snr(samples)
        results['snr'] = snr
        results['snr_warning'] = bool(snr < self.TRAINING_STATS['min_snr'])

        # 3. Volume level
        rms = float(np.sqrt(np.mean(samples ** 2))) if n_samples else 0.0
        results['rms'] = rms
        results['volume_warning'] = bool(rms < 0.01 or rms > 0.9)

        # 4. Share of silent samples
        silence_ratio = self._calculate_silence_ratio(samples)
        results['silence_ratio'] = silence_ratio
        results['silence_warning'] = bool(silence_ratio > 0.3)

        # 5. Clipping detection
        if n_samples:
            clipping_ratio = float(np.sum(np.abs(samples) > 0.99) / n_samples)
        else:
            clipping_ratio = 0.0
        results['clipping_ratio'] = clipping_ratio
        results['clipping_warning'] = bool(clipping_ratio > 0.01)

        # 6. Frequency characteristics (spectral flatness)
        results['spectral_flatness'] = self._calculate_spectral_flatness(
            samples, sample_rate
        )

        # 7. Overall quality score (0-100)
        results['quality_score'] = self._calculate_quality_score(results)
        return results

    def estimate_expected_cer(self, audio_quality, transcription=None):
        """Estimate the expected CER for a clip.

        Args:
            audio_quality: Result of ``analyze_audio_quality()``.
            transcription: Optional transcription, used for the
                difficult-phoneme penalty.

        Returns:
            dict: ``predicted_cer`` and ``cer_range`` (both in percent),
            ``confidence``, ``quality_score`` and ``factors``.
        """
        base_cer = self.CER_BASELINE

        # CER multiplier derived from the quality score.
        quality_score = audio_quality['quality_score']
        if quality_score >= 80:
            cer_multiplier = 1.0
            confidence = "高"
        elif quality_score >= 60:
            cer_multiplier = 1.5
            confidence = "中"
        elif quality_score >= 40:
            cer_multiplier = 2.5
            confidence = "低"
        else:
            cer_multiplier = 4.0
            confidence = "非常に低"

        # Duration penalty (errors accumulate on long audio). The larger
        # threshold is tested first so that the stronger penalty is reachable.
        duration = audio_quality['duration']
        if duration > 20:
            cer_multiplier *= 1.6
        elif duration > 10:
            cer_multiplier *= 1.3

        # Difficult-phoneme penalty.
        if transcription:
            difficult_ratio = self._calculate_difficult_phoneme_ratio(transcription)
            if difficult_ratio > 0.1:
                cer_multiplier *= (1 + difficult_ratio)

        predicted_cer = base_cer * cer_multiplier
        # The range never drops below the baseline and is capped at 50%.
        predicted_cer_range = (
            max(base_cer, predicted_cer * 0.7),
            min(50.0, predicted_cer * 1.3),
        )

        return {
            'predicted_cer': predicted_cer,
            'cer_range': predicted_cer_range,
            'confidence': confidence,
            'quality_score': quality_score,
            'factors': self._identify_cer_factors(audio_quality, transcription),
        }

    def _estimate_snr(self, audio_data):
        """Estimate the SNR in dB (simplified, energy based).

        The 75th percentile of the per-sample energy is taken as the signal
        level and the 25th percentile as the noise level.

        Returns:
            float: ``10 * log10(signal / noise)``; 60.0 when the noise level
            is not positive; 0.0 for an empty clip.
        """
        samples = self._as_float_array(audio_data)
        if samples.size == 0:
            # Nothing to measure: report 0 dB rather than a percentile of
            # an empty array.
            return 0.0

        energies = samples ** 2
        signal_energy = np.percentile(energies, 75)
        noise_energy = np.percentile(energies, 25)
        if noise_energy > 0:
            return float(10 * np.log10(signal_energy / noise_energy))
        return 60.0  # practically no noise

    def _calculate_silence_ratio(self, audio_data, threshold=0.02):
        """Return the share of samples whose magnitude is below ``threshold``.

        An empty clip is reported as entirely silent (1.0).
        """
        samples = np.asarray(audio_data)
        n_samples = len(samples)
        if n_samples == 0:
            return 1.0
        return float(np.sum(np.abs(samples) < threshold) / n_samples)

    def _calculate_spectral_flatness(self, audio_data, sample_rate):
        """Return the mean spectral flatness (an indicator of noisiness).

        ``sample_rate`` is accepted for interface symmetry but is not used
        by the computation. If the librosa call fails for any reason the
        default value 0.5 is returned (best effort).
        """
        try:
            flatness = librosa.feature.spectral_flatness(y=audio_data)[0]
            return float(np.mean(flatness))
        except Exception:
            # Best effort: keep the analysis going, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            return 0.5  # default value

    def _calculate_quality_score(self, results):
        """Compute the overall quality score (0-100) from analysis results.

        Starts at 100 and subtracts fixed penalties for low SNR, bad volume,
        clipping, excessive silence and long duration; never below 0.
        """
        score = 100.0

        # SNR penalty
        if results['snr'] < 15:
            score -= 30
        elif results['snr'] < 20:
            score -= 15

        # Volume penalty
        if results['volume_warning']:
            score -= 20

        # Clipping penalty
        if results['clipping_warning']:
            score -= 25

        # Silence penalty
        if results['silence_ratio'] > 0.3:
            score -= 15

        # Duration penalty: the larger threshold is tested first so that the
        # stronger penalty is reachable.
        if results['duration'] > 30:
            score -= 20
        elif results['duration'] > 20:
            score -= 10

        return max(0.0, score)

    def _calculate_difficult_phoneme_ratio(self, transcription):
        """Return the share of characters that are difficult phonemes.

        Returns 0.0 for an empty or missing transcription.
        """
        if not transcription:
            return 0.0
        difficult_count = sum(
            transcription.count(phoneme) for phoneme in self.DIFFICULT_PHONEMES
        )
        return difficult_count / len(transcription)

    def _identify_cer_factors(self, audio_quality, transcription):
        """List the factors that are expected to increase the CER.

        Returns:
            list[dict]: One entry per triggered factor with the keys
            ``type``, ``name``, ``impact`` and ``description``.
        """
        factors = []

        # Audio-quality factors
        if audio_quality['snr_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': 'ノイズが多い',
                'impact': '高',
                'description': f"SNR {audio_quality['snr']:.1f}dB(推奨: >15dB)",
            })
        if audio_quality['volume_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': '音量が不適切',
                'impact': '中',
                'description': f"RMS {audio_quality['rms']:.3f}(推奨: 0.1-0.7)",
            })
        if audio_quality['clipping_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': 'クリッピング検出',
                'impact': '高',
                'description': f"{audio_quality['clipping_ratio']*100:.1f}%のサンプルが飽和",
            })
        if audio_quality['silence_warning']:
            factors.append({
                'type': 'audio_quality',
                'name': '無音区間が多い',
                'impact': '低',
                'description': f"無音: {audio_quality['silence_ratio']*100:.1f}%",
            })
        if audio_quality['duration_warning']:
            factors.append({
                'type': 'duration',
                'name': '音声が長い',
                'impact': '中',
                'description': f"{audio_quality['duration']:.1f}秒(推奨: <15秒)",
            })

        # Phoneme factors
        if transcription:
            difficult_ratio = self._calculate_difficult_phoneme_ratio(transcription)
            if difficult_ratio > 0.1:
                factors.append({
                    'type': 'phoneme',
                    'name': '困難な音素が多い',
                    'impact': '高',
                    'description': f"困難音素: {difficult_ratio*100:.1f}%(ʔ, ʲ, ɬ, χなど)",
                })
        return factors

    def generate_quality_report_html(self, audio_quality, cer_prediction, actual_cer=None):
        """Render the quality report as an HTML fragment.

        Args:
            audio_quality: Result of ``analyze_audio_quality()``.
            cer_prediction: Result of ``estimate_expected_cer()``.
            actual_cer: Optional measured CER. It is multiplied by 100
                before being compared with the predicted range.

        Returns:
            str: HTML fragment.
        """
        # Imported locally because the module-level import block is not
        # modified; the factor texts contain '<' / '>' that must be escaped.
        from html import escape

        quality_score = audio_quality['quality_score']

        # Colour coding of the quality score.
        if quality_score >= 80:
            score_class, score_emoji = 'quality-excellent', '🟢'
        elif quality_score >= 60:
            score_class, score_emoji = 'quality-good', '🟡'
        elif quality_score >= 40:
            score_class, score_emoji = 'quality-fair', '🟠'
        else:
            score_class, score_emoji = 'quality-poor', '🔴'

        snr = audio_quality['snr']
        duration = audio_quality['duration']
        silence_pct = audio_quality['silence_ratio'] * 100
        clipping_pct = audio_quality['clipping_ratio'] * 100
        snr_mark = '⚠️ 低い' if audio_quality['snr_warning'] else '✓'
        duration_mark = '⚠️ 長い' if audio_quality['duration_warning'] else '✓'
        silence_mark = '⚠️ 多い' if audio_quality['silence_warning'] else '✓'
        clipping_mark = '⚠️ 検出' if audio_quality['clipping_warning'] else '✓'

        cer_low, cer_high = cer_prediction['cer_range'][0], cer_prediction['cer_range'][1]
        predicted_cer = cer_prediction['predicted_cer']
        confidence = cer_prediction['confidence']
        factors = cer_prediction['factors']

        # Fragments are collected in a list and joined once at the end.
        parts = [f'''
        <div class="quality-report">
            <div class="quality-header">
                <h3>📊 音声品質分析</h3>
                <div class="quality-score-box {score_class}">
                    {score_emoji} 品質スコア: <strong>{quality_score:.0f}/100</strong>
                </div>
            </div>
            <div class="quality-details">
                <div class="quality-item">
                    <span class="quality-label">SNR:</span>
                    <span class="quality-value">{snr:.1f} dB</span>
                    {snr_mark}
                </div>
                <div class="quality-item">
                    <span class="quality-label">音声長:</span>
                    <span class="quality-value">{duration:.1f} 秒</span>
                    {duration_mark}
                </div>
                <div class="quality-item">
                    <span class="quality-label">無音割合:</span>
                    <span class="quality-value">{silence_pct:.1f}%</span>
                    {silence_mark}
                </div>
                <div class="quality-item">
                    <span class="quality-label">クリッピング:</span>
                    <span class="quality-value">{clipping_pct:.2f}%</span>
                    {clipping_mark}
                </div>
            </div>
            <div class="cer-prediction">
                <h4>🎯 予測CER範囲</h4>
                <div class="predicted-cer">
                    <strong>{cer_low:.1f}% - {cer_high:.1f}%</strong>
                    (中央値: {predicted_cer:.1f}%)
                </div>
                <div class="confidence">
                    信頼度: <strong>{confidence}</strong>
                </div>
        ''']

        # Comparison with the actual CER.
        if actual_cer is not None:
            actual_cer_pct = actual_cer * 100
            in_range = cer_low <= actual_cer_pct <= cer_high
            range_note = ' ✓ 予測範囲内' if in_range else ' ⚠️ 予測範囲外'
            parts.append(f'''
                <div class="actual-cer-comparison">
                    <div class="actual-cer">
                        実際のCER: <strong>{actual_cer_pct:.2f}%</strong>
                        {range_note}
                    </div>
            ''')
            if not in_range:
                if actual_cer_pct < cer_low:
                    parts.append('<div class="analysis-note">📈 予測より良好!この音声は訓練データに近い可能性があります。</div>')
                else:
                    parts.append('<div class="analysis-note">📉 予測より悪化。下記の要因を確認してください。</div>')
            parts.append('</div>')  # actual-cer-comparison
        parts.append('</div>')  # cer-prediction

        # Factors that increase the CER.
        if factors:
            impact_emojis = {'高': '🔴', '中': '🟠', '低': '🟡'}
            parts.append('<div class="cer-factors"><h4>⚠️ CER増加要因</h4><ul>')
            for factor in factors:
                impact_emoji = impact_emojis.get(factor['impact'], '⚪')
                # Factor texts contain '<' and '>' (e.g. ">15dB"), so they
                # are escaped before being embedded in the markup.
                name = escape(str(factor['name']), quote=False)
                impact = escape(str(factor['impact']), quote=False)
                description = escape(str(factor['description']), quote=False)
                parts.append(f'''
                <li class="factor-item">
                    <span class="factor-name">{impact_emoji} {name}</span>
                    <span class="factor-impact">影響度: {impact}</span>
                    <div class="factor-description">{description}</div>
                </li>
                ''')
            parts.append('</ul></div>')

        # Recommendations.
        parts.append('<div class="recommendations"><h4>💡 推奨事項</h4><ul>')
        if audio_quality['snr_warning']:
            parts.append('<li>ノイズ除去処理を試してください</li>')
        if audio_quality['duration_warning']:
            # '<' is written as an entity so it cannot be parsed as markup.
            parts.append('<li>音声を短い区間(&lt;15秒)に分割することを検討してください</li>')
        if audio_quality['volume_warning']:
            parts.append('<li>音声の音量を調整してください(正規化)</li>')
        if audio_quality['clipping_warning']:
            parts.append('<li>クリッピングのない音源を使用してください</li>')
        if not factors:
            parts.append('<li>✓ 音声品質は良好です!</li>')
        parts.append('</ul></div>')

        parts.append('</div>')  # quality-report
        return ''.join(parts)