114 SAFE_MEMSET(&g_sent_stats,
sizeof(g_sent_stats), 0,
sizeof(g_sent_stats));
115 SAFE_MEMSET(&g_received_stats,
sizeof(g_received_stats), 0,
sizeof(g_received_stats));
118 SAFE_MEMSET(g_received_gap_intervals_ms,
sizeof(g_received_gap_intervals_ms), 0,
sizeof(g_received_gap_intervals_ms));
119 g_received_gap_count = 0;
120 g_received_silence_start_sample = 0;
121 g_received_last_silence_end_sample = 0;
122 SAFE_MEMSET(g_received_packet_times,
sizeof(g_received_packet_times), 0,
sizeof(g_received_packet_times));
123 g_received_packet_times_count = 0;
124 SAFE_MEMSET(g_received_packet_sizes,
sizeof(g_received_packet_sizes), 0,
sizeof(g_received_packet_sizes));
125 g_received_total_audio_samples = 0;
128 SAFE_MEMSET(g_echo_buffer,
sizeof(g_echo_buffer), 0,
sizeof(g_echo_buffer));
129 g_echo_buffer_pos = 0;
130 g_echo_correlation_sample_count = 0;
132 g_echo_correlation_strength[i] = 0;
133 g_echo_match_count[i] = 0;
135 g_detected_echo_delay_ms = 0;
138 SAFE_MEMSET(g_received_beep_window,
sizeof(g_received_beep_window), 0,
sizeof(g_received_beep_window));
139 g_received_beep_window_idx = 0;
140 g_received_beep_events = 0;
141 g_received_tonal_samples = 0;
142 g_in_beep_burst =
false;
143 g_beep_burst_samples = 0;
146 clock_gettime(CLOCK_MONOTONIC, &ts);
147 int64_t now_us = (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
152 g_sent_last_sample = 0.0f;
153 g_received_last_sample = 0.0f;
154 g_sent_last_packet_time_us = now_us;
155 g_received_last_packet_time_us = now_us;
162 log_info(
"Dumping sent audio to /tmp/sent_audio.wav");
164 if (g_received_wav) {
165 log_info(
"Dumping received audio to /tmp/received_audio.wav");
169 g_analysis_enabled =
true;
175 if (!g_analysis_enabled)
181 float abs_sample = fabsf(sample);
187 if (abs_sample > 1.0f) {
189 g_sent_clipping_samples++;
193 float amp_change = fabsf(sample - g_sent_last_sample);
194 if (amp_change > 0.3f) {
195 g_sent_sharp_transitions++;
197 g_sent_transition_samples++;
200 g_sent_mean += sample;
206 static float s_sent_prev_sample_for_zero_crossing = 0.0f;
207 if ((s_sent_prev_sample_for_zero_crossing > 0 && sample < 0) ||
208 (s_sent_prev_sample_for_zero_crossing < 0 && sample > 0)) {
209 g_sent_zero_crossings++;
211 s_sent_prev_sample_for_zero_crossing = sample;
214 if (abs_sample < 0.001f) {
216 g_sent_silence_burst++;
219 if (g_sent_silence_burst > g_sent_max_silence_burst) {
220 g_sent_max_silence_burst = g_sent_silence_burst;
222 g_sent_silence_burst = 0;
226 float delta = fabsf(sample - g_sent_last_sample);
230 g_sent_last_sample = sample;
233 g_sent_rms_accumulator += sample * sample;
234 g_sent_rms_sample_count++;
242 g_echo_buffer[g_echo_buffer_pos] = sample;
276 if (!g_analysis_enabled)
282 float abs_sample = fabsf(sample);
283 if (abs_sample > g_received_stats.
peak_level) {
288 if (abs_sample > 1.0f) {
290 g_received_clipping_samples++;
294 float amp_change = fabsf(sample - g_received_last_sample);
295 if (amp_change > 0.3f) {
296 g_received_sharp_transitions++;
298 g_received_transition_samples++;
301 g_received_mean += sample;
307 static float s_received_prev_sample_for_zero_crossing = 0.0f;
308 if ((s_received_prev_sample_for_zero_crossing > 0 && sample < 0) ||
309 (s_received_prev_sample_for_zero_crossing < 0 && sample > 0)) {
310 g_received_zero_crossings++;
312 s_received_prev_sample_for_zero_crossing = sample;
315 if (abs_sample < 0.001f) {
317 g_received_silence_burst++;
318 g_received_below_noise_floor++;
321 if (g_received_silence_burst == 1) {
322 g_received_silence_start_sample = g_received_stats.
total_samples;
326 if (g_received_silence_burst > 0) {
328 if (g_received_last_silence_end_sample > 0) {
329 uint64_t samples_between = g_received_silence_start_sample - g_received_last_silence_end_sample;
334 g_received_gap_intervals_ms[g_received_gap_count++] = ms_between;
338 g_received_last_silence_end_sample = g_received_stats.
total_samples;
341 if (g_received_silence_burst > g_received_max_silence_burst) {
342 g_received_max_silence_burst = g_received_silence_burst;
345 g_received_silence_burst = 0;
349 if (abs_sample < 0.05f) {
350 g_received_low_energy_samples++;
354 float delta = fabsf(sample - g_received_last_sample);
358 g_received_last_sample = sample;
361 g_received_rms_accumulator += sample * sample;
362 g_received_rms_sample_count++;
366 if (g_echo_correlation_sample_count < 500000) {
369 uint32_t delay_samples = (g_echo_delays_ms[delay_idx] * 48000) / 1000;
373 if (g_echo_buffer_pos >= delay_samples) {
374 sent_pos = g_echo_buffer_pos - delay_samples;
379 float sent_sample = g_echo_buffer[sent_pos];
382 float diff = fabsf(sample - sent_sample);
383 if (diff < 0.1f && fabsf(sent_sample) > 0.01f) {
384 g_echo_match_count[delay_idx]++;
385 g_echo_correlation_strength[delay_idx] += (0.1f - diff);
388 g_echo_correlation_sample_count++;
393 g_received_beep_window[g_received_beep_window_idx] = sample;
394 g_received_beep_window_idx = (g_received_beep_window_idx + 1) %
BEEP_WINDOW_SIZE;
399 int zero_crossings = 0;
400 float min_amp = 1.0f, max_amp = 0.0f;
401 float sum_amp = 0.0f;
402 float prev = g_received_beep_window[0];
405 float curr = g_received_beep_window[i];
406 float abs_curr = fabsf(curr);
409 if (abs_curr > max_amp)
411 if (abs_curr < min_amp)
416 if ((prev > 0 && curr < 0) || (prev < 0 && curr > 0)) {
423 float amp_range = max_amp - min_amp;
429 bool is_tonal = (zero_crossings >= 5 && zero_crossings <= 100) &&
431 (amp_range < avg_amp * 1.5f);
436 if (!g_in_beep_burst) {
438 g_in_beep_burst =
true;
444 if (g_in_beep_burst) {
448 if (g_beep_burst_samples > 0 && g_beep_burst_samples < 24000) {
449 g_received_beep_events++;
450 g_received_stats.
beep_events = g_received_beep_events;
452 g_in_beep_burst =
false;
453 g_beep_burst_samples = 0;
461 if (g_received_wav) {
518 if (!g_analysis_enabled) {
523 clock_gettime(CLOCK_MONOTONIC, &ts);
524 int64_t now_us = (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
533 float sent_rms = 0.0f;
534 float recv_rms = 0.0f;
535 if (g_sent_rms_sample_count > 0) {
536 sent_rms = sqrtf(g_sent_rms_accumulator / g_sent_rms_sample_count);
538 if (g_received_rms_sample_count > 0) {
539 recv_rms = sqrtf(g_received_rms_accumulator / g_received_rms_sample_count);
542 log_plain(
"================================================================================");
544 log_plain(
"================================================================================");
545 log_plain(
"SENT AUDIO (Microphone Capture):");
546 log_plain(
" Duration: %lld ms", (
long long)sent_duration_ms);
549 log_plain(
" RMS Level: %.4f (audio energy/loudness)", sent_rms);
554 if (g_sent_max_silence_burst > 0) {
555 log_plain(
" Max Silence Burst: %llu samples", (
unsigned long long)g_sent_max_silence_burst);
561 log_plain(
" Duration: %lld ms", (
long long)recv_duration_ms);
564 log_plain(
" RMS Level: %.4f (audio energy/loudness)", recv_rms);
573 if (g_received_max_silence_burst > 0) {
574 log_plain(
" Max Silence Burst: %llu samples", (
unsigned long long)g_received_max_silence_burst);
576 double low_energy_pct =
578 log_plain(
" Very Quiet Samples: %llu samples (%.1f%%) [amplitude < 0.05]",
579 (
unsigned long long)g_received_low_energy_samples, low_energy_pct);
583 log_plain(
"QUALITY METRICS (Scratchy/Distorted Audio Detection):");
585 log_plain(
" Jitter Events: %llu (rapid amplitude changes)", (
unsigned long long)g_sent_stats.
jitter_count);
586 log_plain(
" Discontinuities: %llu (packet arrival gaps > 100ms)",
588 log_plain(
" Max Gap Between Packets: %u ms (expected ~20ms per frame)", g_sent_stats.
max_gap_ms);
591 log_plain(
" Jitter Events: %llu (rapid amplitude changes)",
593 log_plain(
" Discontinuities: %llu (packet arrival gaps > 100ms)",
595 log_plain(
" Max Gap Between Packets: %u ms (expected ~20ms per frame)", g_received_stats.
max_gap_ms);
598 if (g_received_beep_events > 0 || g_received_tonal_samples > 0) {
602 log_plain(
" Beep Events: %llu (short tonal bursts < 500ms)",
603 (
unsigned long long)g_received_beep_events);
604 log_plain(
" Tonal Samples: %llu samples (%.1f%%) [consistent frequency content]",
605 (
unsigned long long)g_received_tonal_samples, tonal_pct);
607 if (g_received_beep_events > 10) {
608 log_plain(
" 🔴 BEEPING DETECTED: %llu short tonal bursts - likely codec artifacts or system sounds!",
609 (
unsigned long long)g_received_beep_events);
611 log_plain(
" - Opus codec producing tonal artifacts during silence/transitions");
612 log_plain(
" - Buffer underruns creating synthetic tones");
613 log_plain(
" - AEC3 suppressor resonance");
614 log_plain(
" - System notification sounds bleeding through");
615 }
else if (g_received_beep_events > 3) {
616 log_plain(
" ⚠️ Some beep artifacts detected (%llu events)", (
unsigned long long)g_received_beep_events);
622 log_plain(
" No audio captured from microphone!");
625 log_plain(
" No audio received from server!");
626 }
else if (g_received_stats.
peak_level < 0.01f) {
627 log_plain(
" ⚠️ Received audio is very quiet (peak < 0.01)");
630 log_plain(
" Microphone input is clipping - reduce microphone volume");
634 log_plain(
"ECHO DETECTION (Echo Cancellation Quality Check):");
635 if (g_echo_correlation_sample_count > 0 && g_sent_stats.
total_samples > 0) {
637 int best_delay_idx = -1;
641 if (g_echo_match_count[i] > max_matches) {
642 max_matches = g_echo_match_count[i];
647 double echo_threshold_pct = 5.0;
649 if (best_delay_idx >= 0) {
650 double match_pct = (100.0 * g_echo_match_count[best_delay_idx]) / g_echo_correlation_sample_count;
651 log_plain(
" Echo correlation at different delays:");
653 double pct = (100.0 * g_echo_match_count[i]) / g_echo_correlation_sample_count;
654 const char *status = pct > echo_threshold_pct ?
"⚠️ ECHO DETECTED" :
"✓ OK";
655 log_plain(
" %3u ms delay: %.1f%% match rate %s", g_echo_delays_ms[i], pct, status);
658 if (match_pct > echo_threshold_pct) {
659 g_detected_echo_delay_ms = g_echo_delays_ms[best_delay_idx];
660 log_plain(
" 🔴 ECHO CANCELLATION NOT WORKING: Strong echo at %u ms delay!", g_detected_echo_delay_ms);
661 log_plain(
" Received audio contains %.1f%% samples matching sent audio from %u ms ago", match_pct,
662 g_detected_echo_delay_ms);
664 log_plain(
" ✓ Echo cancellation working: No significant echo detected");
668 log_plain(
" Insufficient data for echo detection (need both sent and received audio)");
672 if (g_aec3_metrics_available) {
673 log_plain(
"AEC3 METRICS (from WebRTC GetMetrics()):");
674 log_plain(
" Echo Return Loss (ERL): %.2f dB (how much echo is attenuated; >10 dB is good)",
675 g_aec3_echo_return_loss);
676 log_plain(
" Echo Return Loss Enhancement (ERLE): %.2f dB (residual echo suppression)",
677 g_aec3_echo_return_loss_enhancement);
678 log_plain(
" Estimated Echo Delay: %d ms", g_aec3_delay_ms);
680 if (g_aec3_echo_return_loss > 10.0) {
681 log_plain(
" ✓ Good echo attenuation (ERL > 10 dB)");
682 }
else if (g_aec3_echo_return_loss > 3.0) {
683 log_plain(
" ⚠️ Moderate echo attenuation (3-10 dB)");
685 log_plain(
" 🔴 Poor echo attenuation (ERL < 3 dB)");
690 if (recv_rms < 0.005f) {
691 log_plain(
" ⚠️ CRITICAL: Received audio RMS is extremely low (%.6f) - barely audible!", recv_rms);
692 }
else if (recv_rms < 0.02f) {
693 log_plain(
" ⚠️ WARNING: Received audio RMS is low (%.6f) - may sound quiet or muddy", recv_rms);
697 double received_silence_pct = g_received_stats.
total_samples > 0
701 if (received_silence_pct > 30.0) {
702 log_plain(
" ⚠️ SCRATCHY AUDIO DETECTED: Too much silence in received audio!");
703 log_plain(
" - Silence: %.1f%% of received samples (should be < 10%%)", received_silence_pct);
704 log_plain(
" - Max silence burst: %llu samples", (
unsigned long long)g_received_max_silence_burst);
705 log_plain(
" - This creates jittery/choppy playback between audio bursts");
706 }
else if (received_silence_pct > 15.0) {
707 log_plain(
" ⚠️ WARNING: Moderate silence detected (%.1f%%)", received_silence_pct);
711 double sent_sharp_pct =
712 g_sent_transition_samples > 0 ? (100.0 * g_sent_sharp_transitions / g_sent_transition_samples) : 0;
713 double recv_sharp_pct =
714 g_received_transition_samples > 0 ? (100.0 * g_received_sharp_transitions / g_received_transition_samples) : 0;
718 double sent_zero_cross_pct =
720 double recv_zero_cross_pct =
723 log_plain(
"WAVEFORM ANALYSIS (Is it clean music or corrupted/static?):");
725 log_plain(
" Zero crossings: %.2f%% of samples (music: 1-5%%, noise: 15-50%%)", sent_zero_cross_pct);
726 log_plain(
" Sharp transitions (clicks/pops): %.2f%% of samples", sent_sharp_pct);
727 log_plain(
" Clipping samples: %llu (%.3f%%)", (
unsigned long long)g_sent_clipping_samples,
731 log_plain(
" Zero crossings: %.2f%% of samples (music: 1-5%%, noise: 15-50%%)", recv_zero_cross_pct);
732 log_plain(
" Sharp transitions (clicks/pops): %.2f%% of samples", recv_sharp_pct);
733 log_plain(
" Clipping samples: %llu (%.3f%%)", (
unsigned long long)g_received_clipping_samples,
736 log_plain(
" Zero crossing increase: %.2f%% higher than sent (indicates corruption)",
737 recv_zero_cross_pct - sent_zero_cross_pct);
741 if (recv_zero_cross_pct > 10.0) {
742 log_plain(
" ⚠️ SOUNDS LIKE STATIC/DISTORTED: Excessive zero crossings (%.2f%%) = high frequency noise",
743 recv_zero_cross_pct);
744 log_plain(
" Increase from sent: %.2f%% (waveform corruption detected)",
745 recv_zero_cross_pct - sent_zero_cross_pct);
746 log_plain(
" Likely causes: Opus codec artifacts, jitter buffer issues, or packet delivery gaps");
747 }
else if (recv_zero_cross_pct - sent_zero_cross_pct > 3.0) {
748 log_plain(
" ⚠️ SOUNDS CORRUPTED: Zero crossing rate increased by %.2f%% (should be ±0.5%%)",
749 recv_zero_cross_pct - sent_zero_cross_pct);
750 log_plain(
" Indicates waveform distortion from network/processing artifacts");
751 }
else if (recv_sharp_pct > 2.0) {
752 log_plain(
" ⚠️ SOUNDS LIKE STATIC: High click/pop rate (%.2f%%) indicates audio artifacts", recv_sharp_pct);
753 log_plain(
" Likely causes: Packet loss, jitter buffer issues, or frame discontinuities");
754 }
else if (g_received_clipping_samples > (g_received_stats.
total_samples / 1000)) {
755 log_plain(
" ⚠️ SOUNDS DISTORTED: Significant clipping detected (%.3f%%)",
756 100.0 * g_received_clipping_samples / g_received_stats.
total_samples);
757 log_plain(
" Likely causes: AGC too aggressive, gain too high, or codec compression artifacts");
758 }
else if (low_energy_pct > 50.0 && recv_rms < 0.05f) {
759 log_plain(
" ⚠️ SOUNDS MUDDY/QUIET: Over 50%% very quiet samples + low RMS");
760 log_plain(
" Audio may sound unclear or like background noise rather than music");
761 }
else if (received_silence_pct > 10.0) {
762 log_plain(
" ⚠️ SOUNDS SCRATCHY: Excessive silence (%.1f%%) causes dropouts", received_silence_pct);
763 }
else if (recv_rms > 0.08f && recv_zero_cross_pct < 6.0 && recv_sharp_pct < 1.0 &&
764 g_received_clipping_samples == 0) {
765 log_plain(
" ✓ SOUNDS LIKE MUSIC: Good RMS (%.4f), clean waveform (%.2f%% zero crossings), minimal artifacts",
766 recv_rms, recv_zero_cross_pct);
767 log_plain(
" Audio quality acceptable for communication");
769 log_plain(
" ? BORDERLINE: Check specific metrics above");
773 if (low_energy_pct > 50.0) {
774 log_plain(
" ⚠️ WARNING: Over 50%% of received samples are very quiet (< 0.05 amplitude)");
775 log_plain(
" - This makes audio sound muddy, unclear, or hard to understand");
776 log_plain(
" - Caused by: Mixing other clients' audio with your own at wrong levels");
780 if (g_received_packet_times_count >= 5) {
783 uint32_t min_interval_ms = 0xFFFFFFFF;
789 for (
uint32_t i = 1; i < g_received_packet_times_count; i++) {
790 struct timespec *prev = &g_received_packet_times[i - 1];
791 struct timespec *curr = &g_received_packet_times[i];
793 int64_t prev_us = (int64_t)prev->tv_sec * 1000000 + prev->tv_nsec / 1000;
794 int64_t curr_us = (int64_t)curr->tv_sec * 1000000 + curr->tv_nsec / 1000;
797 inter_arrival_times_ms[inter_arrival_count++] = gap_ms;
798 if (gap_ms < min_interval_ms)
799 min_interval_ms = gap_ms;
800 if (gap_ms > max_interval_ms)
801 max_interval_ms = gap_ms;
802 sum_intervals_ms += gap_ms;
805 if (gap_ms >= 35 && gap_ms <= 70) {
806 intervals_around_50ms++;
810 uint32_t avg_interval_ms = (
uint32_t)(sum_intervals_ms / inter_arrival_count);
811 uint32_t interval_consistency = (intervals_around_50ms * 100) / inter_arrival_count;
816 double avg_samples_per_packet =
818 double frames_per_packet = avg_samples_per_packet / 960.0;
819 double ms_audio_per_packet = frames_per_packet * 20.0;
822 if (intervals_around_50ms >= (inter_arrival_count * 2 / 3)) {
824 log_plain(
" 🔴 PERIODIC STUTTERING DETECTED: Server sends packets every ~%u ms (should be ~20ms)!",
826 log_plain(
" - Packet inter-arrival: %u-%u ms (avg: %u ms)", min_interval_ms, max_interval_ms, avg_interval_ms);
827 log_plain(
" - %u/%u packets (~%u%%) are ~50ms apart (CLEAR STUTTERING PATTERN)", intervals_around_50ms,
828 inter_arrival_count, interval_consistency);
831 log_plain(
" - Total audio samples: %llu over %u packets", (
unsigned long long)g_received_stats.
total_samples,
832 inter_arrival_count);
833 log_plain(
" - Avg samples per packet: %.0f (= %.2f Opus frames = %.1f ms)", avg_samples_per_packet,
834 frames_per_packet, ms_audio_per_packet);
836 if (frames_per_packet < 1.5) {
837 log_plain(
" - ❌ PROBLEM: Each packet contains < 1.5 frames (should be 2-3 frames!)");
838 log_plain(
" - With only %.1f frames per packet arriving every %u ms, there are gaps between chunks",
839 frames_per_packet, avg_interval_ms);
840 log_plain(
" - Audio plays for ~%.0f ms, then %u ms gap, then plays again", ms_audio_per_packet,
841 avg_interval_ms - (
uint32_t)ms_audio_per_packet);
842 }
else if (frames_per_packet > 2.5) {
843 log_plain(
" - ✓ Packets contain %.1f frames (~%.0f ms audio each)", frames_per_packet,
844 ms_audio_per_packet);
845 log_plain(
" - Should play smoothly if jitter buffer is large enough");
846 log_plain(
" - If still stuttering, issue is jitter buffer depth or timing precision");
848 log_plain(
" - Packets contain %.1f frames (~%.0f ms)", frames_per_packet, ms_audio_per_packet);
849 log_plain(
" - Borderline: buffer needs to hold %.0f ms to bridge %.u ms gap", ms_audio_per_packet,
850 avg_interval_ms - (
uint32_t)ms_audio_per_packet);
852 }
else if (avg_interval_ms > 30) {
853 log_plain(
" ⚠️ AUDIO DELIVERY INCONSISTENCY: Server packets arrive every ~%u ms (expected ~20ms)",
855 log_plain(
" - Interval range: %u-%u ms", min_interval_ms, max_interval_ms);
856 log_plain(
" - This causes dropouts and buffering issues");
862 log_plain(
" ⚠️ DISTORTION DETECTED: Packet delivery gaps too large!");
863 log_plain(
" - Max gap: %u ms (should be ~20ms for smooth audio)", g_received_stats.
max_gap_ms);
865 log_plain(
" - SEVERE: Gaps > 80ms cause severe distortion and dropouts");
866 }
else if (g_received_stats.
max_gap_ms > 50) {
867 log_plain(
" - Gaps > 50ms cause noticeable distortion");
871 log_plain(
" Packet delivery discontinuities: %llu gaps > 100ms detected",
875 log_plain(
" High jitter detected: > 1%% of samples have rapid amplitude changes");
876 log_plain(
" - May indicate buffer underruns from sparse packet delivery");
879 log_plain(
"================================================================================");