ESP32-S3 Advanced Features Development

XiaoZhi AI Chatbot Documentation Center | XiaoZhi.Dev

This guide covers advanced features development for XiaoZhi ESP32-S3, including 4G communication, local AI inference, multimodal interaction, IoT device control, and cloud service integration.

I. 4G Communication Integration

1.1 ML307R Cat.1 Module Integration

4G Advantage: LTE Cat.1 provides a stable cellular connection for scenarios where Wi-Fi is unavailable, with peak data rates of up to 10 Mbps downlink and 5 Mbps uplink.

Hardware Connection

ML307R 4G Module Connection:
ESP32-S3              ML307R
GPIO11 (UART TX)  →   module receive line (data from ESP32 to module)
GPIO12 (UART RX)  ←   module transmit line (data from module to ESP32)
GPIO13    →    RESET
GPIO14    →    PWR_ON
3.3V      →    VCC
GND       →    GND

Antenna: Connect 4G antenna to ML307R MAIN antenna port
SIM Card: Insert standard SIM card (supports 2G/3G/4G)

4G Module Control Class

#include <HardwareSerial.h>

class ML307R_4G {
private:
    HardwareSerial* modemSerial;
    int resetPin, powerPin;
    bool isConnected = false;
    
public:
    ML307R_4G(HardwareSerial* serial, int reset, int power) 
        : modemSerial(serial), resetPin(reset), powerPin(power) {}
    
    bool init() {
        pinMode(resetPin, OUTPUT);
        pinMode(powerPin, OUTPUT);
        
        // Power on sequence
        digitalWrite(powerPin, LOW);
        delay(1000);
        digitalWrite(powerPin, HIGH);
        delay(3000);
        
        // Reset module
        digitalWrite(resetPin, LOW);
        delay(500);
        digitalWrite(resetPin, HIGH);
        delay(5000);
        
        modemSerial->begin(115200);
        
        // Check module response
        return sendATCommand("AT", "OK", 3000);
    }
    
    bool connectNetwork() {
        Serial.println("Connecting to 4G network...");
        
        // Set APN (adjust according to your carrier)
        if (!sendATCommand("AT+CGDCONT=1,\"IP\",\"cmnet\"", "OK", 5000)) {
            return false;
        }
        
        // Activate PDP context
        if (!sendATCommand("AT+CGACT=1,1", "OK", 30000)) {
            return false;
        }
        
        // Check network registration
        if (sendATCommand("AT+CREG?", "+CREG: 0,1", 10000) || 
            sendATCommand("AT+CREG?", "+CREG: 0,5", 10000)) {
            isConnected = true;
            Serial.println("4G network connected successfully");
            return true;
        }
        
        return false;
    }
    
    String getNetworkInfo() {
        String response = sendATCommandWithResponse("AT+CSQ", 3000);
        int rssi = parseSignalStrength(response);
        
        response = sendATCommandWithResponse("AT+COPS?", 3000);
        String operator = parseOperator(response);
        
        return "Operator: " + operator + ", Signal: " + String(rssi) + " dBm";
    }
    
    bool sendHTTPRequest(String url, String data, String& response) {
        // Configure HTTP service
        if (!sendATCommand("AT+HTTPINIT", "OK", 3000)) return false;
        if (!sendATCommand("AT+HTTPPARA=\"CID\",1", "OK", 3000)) return false;
        if (!sendATCommand("AT+HTTPPARA=\"URL\",\"" + url + "\"", "OK", 3000)) return false;
        
        if (data.length() > 0) {
            // POST request
            if (!sendATCommand("AT+HTTPPARA=\"CONTENT\",\"application/json\"", "OK", 3000)) return false;
            if (!sendATCommand("AT+HTTPDATA=" + String(data.length()) + ",10000", "DOWNLOAD", 3000)) return false;
            
            modemSerial->print(data);
            delay(1000);
            
            if (!sendATCommand("AT+HTTPACTION=1", "+HTTPACTION: 1,200", 30000)) return false;
        } else {
            // GET request
            if (!sendATCommand("AT+HTTPACTION=0", "+HTTPACTION: 0,200", 30000)) return false;
        }
        
        // Read response
        if (sendATCommand("AT+HTTPREAD", "OK", 5000)) {
            response = getLastATResponse();
        }
        
        sendATCommand("AT+HTTPTERM", "OK", 3000);
        return true;
    }
    
private:
    bool sendATCommand(String command, String expected, int timeout) {
        modemSerial->println(command);
        return waitForResponse(expected, timeout);
    }
    
    String sendATCommandWithResponse(String command, int timeout) {
        modemSerial->println(command);
        return readResponse(timeout);
    }
    
    bool waitForResponse(String expected, int timeout) {
        String response = readResponse(timeout);
        return response.indexOf(expected) >= 0;
    }
    
    String readResponse(int timeout) {
        String response = "";
        unsigned long start = millis();
        
        while (millis() - start < timeout) {
            if (modemSerial->available()) {
                response += modemSerial->readString();
                break;
            }
            delay(10);
        }
        
        return response;
    }
};

// Usage example: modem on Serial1, RESET on GPIO13, PWR_ON on GPIO14.
// NOTE(review): no UART pins are given here, so Serial1's default pins apply;
// confirm they match the wiring.
ML307R_4G modem(&Serial1, 13, 14);

// Brings the modem up, attaches to the network and prints the network summary.
// Stops silently at the first step that fails.
void setup4G() {
    if (!modem.init()) {
        return;
    }
    Serial.println("4G module initialized");

    if (!modem.connectNetwork()) {
        return;
    }
    Serial.println("4G network connected");
    Serial.println(modem.getNetworkInfo());
}

1.2 4G + Wi-Fi Hybrid Networking

Intelligent Network Switching

class NetworkManager {
private:
    ML307R_4G* modem;
    bool wifi4GEnabled = false;
    bool preferWiFi = true;
    
public:
    enum NetworkType {
        NETWORK_NONE,
        NETWORK_WIFI,
        NETWORK_4G
    };
    
    NetworkType currentNetwork = NETWORK_NONE;
    
    NetworkManager(ML307R_4G* m) : modem(m) {}
    
    bool connectBestNetwork() {
        if (preferWiFi && connectWiFi()) {
            currentNetwork = NETWORK_WIFI;
            Serial.println("Using Wi-Fi connection");
            return true;
        }
        
        if (wifi4GEnabled && modem->connectNetwork()) {
            currentNetwork = NETWORK_4G;
            Serial.println("Using 4G connection");
            return true;
        }
        
        currentNetwork = NETWORK_NONE;
        return false;
    }
    
    bool sendDataToCloud(String data) {
        if (currentNetwork == NETWORK_WIFI) {
            return sendViaWiFi(data);
        } else if (currentNetwork == NETWORK_4G) {
            return sendVia4G(data);
        }
        return false;
    }
    
    void enable4GBackup(bool enable) {
        wifi4GEnabled = enable;
    }
    
    void monitorConnection() {
        static unsigned long lastCheck = 0;
        if (millis() - lastCheck > 30000) { // Check every 30 seconds
            if (currentNetwork == NETWORK_WIFI && WiFi.status() != WL_CONNECTED) {
                Serial.println("Wi-Fi disconnected, switching to 4G...");
                connectBestNetwork();
            }
            lastCheck = millis();
        }
    }
    
private:
    bool connectWiFi() {
        // Implement Wi-Fi connection logic
        return WiFi.status() == WL_CONNECTED;
    }
    
    bool sendViaWiFi(String data) {
        // Implement Wi-Fi data transmission
        return true;
    }
    
    bool sendVia4G(String data) {
        String response;
        return modem->sendHTTPRequest("https://api.yourserver.com/data", data, response);
    }
};

II. Local AI Inference

2.1 TensorFlow Lite Micro Integration

Model Deployment

#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"

// Include your model (converted to C array)
#include "voice_command_model.h"

class EdgeAIEngine {
private:
    tflite::MicroErrorReporter micro_error_reporter;
    tflite::AllOpsResolver resolver;
    const tflite::Model* model;
    tflite::MicroInterpreter* interpreter;
    TfLiteTensor* input;
    TfLiteTensor* output;
    
    // Arena for model execution (in PSRAM)
    constexpr static int kTensorArenaSize = 100 * 1024; // 100KB
    uint8_t* tensor_arena;
    
public:
    bool init() {
        // Allocate memory in PSRAM
        tensor_arena = (uint8_t*)ps_malloc(kTensorArenaSize);
        if (tensor_arena == nullptr) {
            Serial.println("Failed to allocate tensor arena");
            return false;
        }
        
        // Load model
        model = tflite::GetModel(voice_command_model);
        if (model->version() != TFLITE_SCHEMA_VERSION) {
            Serial.println("Model schema version mismatch");
            return false;
        }
        
        // Create interpreter
        static tflite::MicroInterpreter static_interpreter(
            model, resolver, tensor_arena, kTensorArenaSize, &micro_error_reporter);
        interpreter = &static_interpreter;
        
        // Allocate tensors
        TfLiteStatus allocate_status = interpreter->AllocateTensors();
        if (allocate_status != kTfLiteOk) {
            Serial.println("AllocateTensors() failed");
            return false;
        }
        
        // Get input and output tensors
        input = interpreter->input(0);
        output = interpreter->output(0);
        
        Serial.printf("Model loaded successfully\n");
        Serial.printf("Input shape: [%d, %d, %d, %d]\n", 
            input->dims->data[0], input->dims->data[1], 
            input->dims->data[2], input->dims->data[3]);
        
        return true;
    }
    
    int classifyVoiceCommand(float* audio_features, int feature_length) {
        // Copy features to input tensor
        for (int i = 0; i < feature_length; i++) {
            input->data.f[i] = audio_features[i];
        }
        
        // Run inference
        TfLiteStatus invoke_status = interpreter->Invoke();
        if (invoke_status != kTfLiteOk) {
            Serial.println("Invoke failed");
            return -1;
        }
        
        // Find prediction with highest confidence
        float max_confidence = 0;
        int predicted_class = -1;
        
        for (int i = 0; i < output->dims->data[1]; i++) {
            float confidence = output->data.f[i];
            if (confidence > max_confidence) {
                max_confidence = confidence;
                predicted_class = i;
            }
        }
        
        Serial.printf("Predicted class: %d, confidence: %.2f\n", 
            predicted_class, max_confidence);
        
        return (max_confidence > 0.7) ? predicted_class : -1; // Threshold 0.7
    }
    
    void printModelInfo() {
        Serial.println("=== Model Information ===");
        Serial.printf("Model size: %d bytes\n", voice_command_model_len);
        Serial.printf("Input type: %s\n", TfLiteTypeGetName(input->type));
        Serial.printf("Output type: %s\n", TfLiteTypeGetName(output->type));
        Serial.printf("Arena usage: %d bytes\n", interpreter->arena_used_bytes());
    }
};

2.2 Audio Feature Extraction

MFCC Feature Extraction

#include <math.h>

/**
 * Computes MFCC-style features from 16-bit PCM audio.
 *
 * Frames are FRAME_SIZE samples long and advance by HOP_SIZE samples; each
 * frame yields NUM_MFCC coefficients. The spectrum step (computeFFT) and the
 * Mel filter bank are simplified placeholders, as noted on those methods.
 */
class AudioFeatureExtractor {
private:
    static const int SAMPLE_RATE = 16000;
    static const int FRAME_SIZE = 512;
    static const int HOP_SIZE = 256;
    static const int NUM_MFCC = 13;
    static const int NUM_FILTERS = 26;

    float hamming_window[FRAME_SIZE];
    float mel_filters[NUM_FILTERS][FRAME_SIZE/2 + 1];
    float dct_matrix[NUM_MFCC][NUM_FILTERS];

public:
    /**
     * Precomputes the Hamming window, the Mel filter bank and the DCT matrix.
     * Must be called before extractMFCC().
     * @return always true.
     */
    bool init() {
        // Generate Hamming window
        for (int i = 0; i < FRAME_SIZE; i++) {
            hamming_window[i] = 0.54 - 0.46 * cos(2 * M_PI * i / (FRAME_SIZE - 1));
        }

        // Generate Mel filter bank
        generateMelFilters();

        // Generate DCT matrix
        generateDCTMatrix();

        return true;
    }

    /**
     * Extracts NUM_MFCC coefficients for every complete frame in the input.
     *
     * @param audio_samples PCM samples.
     * @param num_samples   Number of samples available in audio_samples.
     * @param mfcc_features Output buffer; must hold
     *                      ((num_samples - 512) / 256 + 1) * 13 floats.
     *
     * Nothing is written when the input holds fewer than FRAME_SIZE samples
     * or when either pointer is null.
     */
    void extractMFCC(int16_t* audio_samples, int num_samples, float* mfcc_features) {
        // Without this guard, integer division rounds toward zero and reports one
        // frame for 0 < num_samples < FRAME_SIZE, which would read past the input.
        if (audio_samples == nullptr || mfcc_features == nullptr || num_samples < FRAME_SIZE) {
            return;
        }

        int num_frames = (num_samples - FRAME_SIZE) / HOP_SIZE + 1;

        for (int frame = 0; frame < num_frames; frame++) {
            float frame_features[NUM_MFCC];

            // Extract features for current frame
            int frame_start = frame * HOP_SIZE;
            extractFrameMFCC(&audio_samples[frame_start], frame_features);

            // Copy to output buffer
            for (int i = 0; i < NUM_MFCC; i++) {
                mfcc_features[frame * NUM_MFCC + i] = frame_features[i];
            }
        }
    }

private:
    // Window -> spectrum -> log Mel energies -> DCT for a single FRAME_SIZE-sample frame.
    void extractFrameMFCC(int16_t* frame_samples, float* mfcc_out) {
        float windowed_frame[FRAME_SIZE];
        float fft_result[FRAME_SIZE/2 + 1];
        float mel_energies[NUM_FILTERS];

        // Apply window function
        for (int i = 0; i < FRAME_SIZE; i++) {
            windowed_frame[i] = frame_samples[i] * hamming_window[i];
        }

        // Compute FFT (simplified - use real FFT library)
        computeFFT(windowed_frame, fft_result);

        // Apply Mel filter bank
        for (int i = 0; i < NUM_FILTERS; i++) {
            mel_energies[i] = 0;
            for (int j = 0; j < FRAME_SIZE/2 + 1; j++) {
                mel_energies[i] += fft_result[j] * mel_filters[i][j];
            }
            mel_energies[i] = log(mel_energies[i] + 1e-10); // Log energy; offset avoids log(0)
        }

        // Apply DCT
        for (int i = 0; i < NUM_MFCC; i++) {
            mfcc_out[i] = 0;
            for (int j = 0; j < NUM_FILTERS; j++) {
                mfcc_out[i] += mel_energies[j] * dct_matrix[i][j];
            }
        }
    }

    // Builds NUM_FILTERS filters centred at equal Mel-scale steps between 0 Hz and Nyquist.
    void generateMelFilters() {
        // Simplified Mel filter bank generation
        float mel_low = 0;
        float mel_high = 2595 * log10(1 + (SAMPLE_RATE/2) / 700.0);
        float mel_step = (mel_high - mel_low) / (NUM_FILTERS + 1);

        for (int i = 0; i < NUM_FILTERS; i++) {
            float center_mel = mel_low + (i + 1) * mel_step;
            float center_freq = 700 * (pow(10, center_mel / 2595) - 1);

            int center_bin = (int)(center_freq * FRAME_SIZE / SAMPLE_RATE);

            // Generate triangular filter
            for (int j = 0; j < FRAME_SIZE/2 + 1; j++) {
                int distance = j - center_bin;
                if (distance < 0) distance = -distance;

                if (distance < 20) { // Simplified triangular filter: fixed 20-bin half width
                    mel_filters[i][j] = 1.0 - distance / 20.0;
                } else {
                    mel_filters[i][j] = 0;
                }
            }
        }
    }

    // Fills the DCT-II basis used to turn log Mel energies into cepstral coefficients.
    void generateDCTMatrix() {
        for (int i = 0; i < NUM_MFCC; i++) {
            for (int j = 0; j < NUM_FILTERS; j++) {
                dct_matrix[i][j] = cos(M_PI * i * (j + 0.5) / NUM_FILTERS);
            }
        }
    }

    // Placeholder spectrum: squares the first FRAME_SIZE/2 + 1 windowed samples.
    // This is NOT a Fourier transform; replace it with a real FFT before using
    // the features with a model trained on true MFCCs.
    void computeFFT(float* input, float* output) {
        // Placeholder - implement real FFT
        // Use ESP-DSP library or other FFT implementation
        for (int i = 0; i < FRAME_SIZE/2 + 1; i++) {
            output[i] = input[i] * input[i]; // Simplified power spectrum
        }
    }
};

2.3 Voice Command Recognition System

Complete Voice Command Pipeline

/**
 * Voice-command pipeline: voice-activity detection gates recording into a
 * 2-second buffer, the recording is converted to MFCC features, classified
 * by the AI engine and mapped to an action.
 */
class VoiceCommandSystem {
private:
    EdgeAIEngine* aiEngine;
    AudioFeatureExtractor* featureExtractor;
    VoiceActivityDetector* vad;

    // Command definitions
    const char* commands[5] = {
        "turn_on_light",
        "turn_off_light",
        "play_music",
        "stop_music",
        "unknown"
    };

    // Audio buffer for 2 seconds at 16kHz (64 KB of int16_t inside this object)
    static const int AUDIO_BUFFER_SIZE = 32000;
    int16_t audio_buffer[AUDIO_BUFFER_SIZE];
    int buffer_index = 0;
    bool recording = false;

public:
    VoiceCommandSystem(EdgeAIEngine* ai, AudioFeatureExtractor* fe, VoiceActivityDetector* v)
        : aiEngine(ai), featureExtractor(fe), vad(v) {}

    // Initialises the AI engine and the feature extractor; true only if both succeed.
    bool init() {
        return aiEngine->init() && featureExtractor->init();
    }

    /**
     * Feeds one chunk of audio into the pipeline.
     *
     * Recording starts on the first chunk with voice activity and ends on the
     * first chunk without it or when the buffer is full; the recording is
     * then processed immediately.
     *
     * @param samples PCM samples of this chunk.
     * @param length  Number of samples in the chunk.
     */
    void processAudioFrame(int16_t* samples, int length) {
        if (samples == nullptr || length <= 0) {
            return;
        }

        // Voice activity detection
        bool voice_detected = vad->detectVoiceActivity(samples, length);

        if (voice_detected && !recording) {
            // Start recording
            recording = true;
            buffer_index = 0;
            Serial.println("Voice detected - start recording");
        }

        if (recording) {
            // Add samples to buffer
            for (int i = 0; i < length && buffer_index < AUDIO_BUFFER_SIZE; i++) {
                audio_buffer[buffer_index++] = samples[i];
            }

            // Check if recording should stop
            if (!voice_detected || buffer_index >= AUDIO_BUFFER_SIZE) {
                processVoiceCommand();
                recording = false;
            }
        }
    }

private:
    // Extracts features from the recorded buffer, classifies them and runs the command.
    void processVoiceCommand() {
        // Framing parameters used when sizing the feature buffer
        // (512-sample frames, 256-sample hop, 13 coefficients per frame).
        const int kFrameSize = 512;
        const int kHopSize = 256;
        const int kNumMfcc = 13;

        Serial.printf("Processing voice command (%d samples)\n", buffer_index);

        // A recording shorter than one frame has no features; computing the frame
        // count for it would yield a bogus size and an out-of-range read.
        if (buffer_index < kFrameSize) {
            Serial.println("Recording too short - ignored");
            return;
        }

        // Extract MFCC features
        int num_frames = (buffer_index - kFrameSize) / kHopSize + 1;
        float* mfcc_features = (float*)malloc(num_frames * kNumMfcc * sizeof(float));
        if (mfcc_features == nullptr) {
            Serial.println("Failed to allocate MFCC buffer");
            return;
        }

        featureExtractor->extractMFCC(audio_buffer, buffer_index, mfcc_features);

        // Run AI inference
        int predicted_command = aiEngine->classifyVoiceCommand(mfcc_features, num_frames * kNumMfcc);

        // Index 4 ("unknown") and -1 (low confidence / error) are not executable.
        if (predicted_command >= 0 && predicted_command < 4) {
            Serial.printf("Recognized command: %s\n", commands[predicted_command]);
            executeCommand(predicted_command);
        } else {
            Serial.println("Unknown command or low confidence");
        }

        free(mfcc_features);
    }

    // Performs the action for a recognised command index (0-3).
    void executeCommand(int command_id) {
        switch (command_id) {
            case 0: // turn_on_light
                digitalWrite(LED_PIN, HIGH);
                Serial.println("Light turned ON");
                break;

            case 1: // turn_off_light
                digitalWrite(LED_PIN, LOW);
                Serial.println("Light turned OFF");
                break;

            case 2: // play_music
                // Implement music playback
                Serial.println("Playing music");
                break;

            case 3: // stop_music
                // Implement music stop
                Serial.println("Stopping music");
                break;
        }
    }
};

III. Multimodal Interaction

3.1 Camera Integration

ESP32-CAM Module Integration

#include "esp_camera.h"
#include "esp_jpg_decode.h"

class CameraManager {
private:
    camera_config_t config;
    bool initialized = false;
    
public:
    bool init() {
        // Camera pin configuration for ESP32-CAM
        config.ledc_channel = LEDC_CHANNEL_0;
        config.ledc_timer = LEDC_TIMER_0;
        config.pin_d0 = 5;
        config.pin_d1 = 18;
        config.pin_d2 = 19;
        config.pin_d3 = 21;
        config.pin_d4 = 36;
        config.pin_d5 = 39;
        config.pin_d6 = 34;
        config.pin_d7 = 35;
        config.pin_xclk = 0;
        config.pin_pclk = 22;
        config.pin_vsync = 25;
        config.pin_href = 23;
        config.pin_sscb_sda = 26;
        config.pin_sscb_scl = 27;
        config.pin_pwdn = 32;
        config.pin_reset = -1;
        config.xclk_freq_hz = 20000000;
        config.pixel_format = PIXFORMAT_JPEG;
        
        // Image quality settings
        if (psramFound()) {
            config.frame_size = FRAMESIZE_SVGA; // 800x600
            config.jpeg_quality = 10;
            config.fb_count = 2;
        } else {
            config.frame_size = FRAMESIZE_VGA; // 640x480
            config.jpeg_quality = 12;
            config.fb_count = 1;
        }
        
        esp_err_t err = esp_camera_init(&config);
        if (err != ESP_OK) {
            Serial.printf("Camera init failed with error 0x%x\n", err);
            return false;
        }
        
        initialized = true;
        Serial.println("Camera initialized successfully");
        return true;
    }
    
    bool captureImage(uint8_t** image_buffer, size_t* image_size) {
        if (!initialized) return false;
        
        camera_fb_t* fb = esp_camera_fb_get();
        if (!fb) {
            Serial.println("Camera capture failed");
            return false;
        }
        
        *image_buffer = fb->buf;
        *image_size = fb->len;
        
        // Note: Remember to call esp_camera_fb_return(fb) after using the image
        return true;
    }
    
    void releaseImage(camera_fb_t* fb) {
        esp_camera_fb_return(fb);
    }
    
    bool streamToWebSocket(WebSocketsServer& webSocket, int client_id) {
        camera_fb_t* fb = esp_camera_fb_get();
        if (!fb) return false;
        
        // Convert to base64 for web transmission
        String base64_image = base64::encode(fb->buf, fb->len);
        String json_message = "{\"type\":\"image\",\"data\":\"" + base64_image + "\"}";
        
        webSocket.sendTXT(client_id, json_message);
        
        esp_camera_fb_return(fb);
        return true;
    }
};

3.2 Vision + Voice Multimodal AI

Scene Understanding with Voice Description

class MultimodalAI {
private:
    CameraManager* camera;
    VoiceCommandSystem* voice;
    
    // Simple object detection results
    struct DetectionResult {
        String object_name;
        float confidence;
        int x, y, width, height;
    };
    
public:
    MultimodalAI(CameraManager* cam, VoiceCommandSystem* v) 
        : camera(cam), voice(v) {}
    
    String analyzeScene() {
        uint8_t* image_buffer;
        size_t image_size;
        
        if (!camera->captureImage(&image_buffer, &image_size)) {
            return "Camera capture failed";
        }
        
        // Simple color-based object detection (placeholder)
        std::vector<DetectionResult> objects = detectObjects(image_buffer, image_size);
        
        // Generate scene description
        String description = generateSceneDescription(objects);
        
        // Release camera buffer
        camera->releaseImage(nullptr); // Simplified
        
        return description;
    }
    
    void handleVoiceVisualQuery(String query) {
        if (query.indexOf("what do you see") >= 0) {
            String scene = analyzeScene();
            speakText("I can see " + scene);
        } else if (query.indexOf("take a photo") >= 0) {
            if (captureAndSavePhoto()) {
                speakText("Photo captured successfully");
            } else {
                speakText("Failed to capture photo");
            }
        } else if (query.indexOf("count") >= 0) {
            int object_count = countObjectsInScene();
            speakText("I can see " + String(object_count) + " objects");
        }
    }
    
private:
    std::vector<DetectionResult> detectObjects(uint8_t* image, size_t size) {
        std::vector<DetectionResult> results;
        
        // Placeholder: Simple color-based detection
        // In real implementation, use TensorFlow Lite model for object detection
        
        DetectionResult person;
        person.object_name = "person";
        person.confidence = 0.85;
        person.x = 100; person.y = 50;
        person.width = 200; person.height = 300;
        results.push_back(person);
        
        return results;
    }
    
    String generateSceneDescription(const std::vector<DetectionResult>& objects) {
        if (objects.empty()) {
            return "an empty scene";
        }
        
        String description = "";
        for (size_t i = 0; i < objects.size(); i++) {
            if (i > 0) description += ", ";
            description += objects[i].object_name;
        }
        
        return description;
    }
    
    bool captureAndSavePhoto() {
        // Implement photo capture and storage
        return true;
    }
    
    int countObjectsInScene() {
        uint8_t* image_buffer;
        size_t image_size;
        
        if (camera->captureImage(&image_buffer, &image_size)) {
            auto objects = detectObjects(image_buffer, image_size);
            camera->releaseImage(nullptr);
            return objects.size();
        }
        
        return 0;
    }
    
    void speakText(String text) {
        // Implement TTS (Text-to-Speech)
        Serial.println("Speaking: " + text);
    }
};

IV. IoT Device Control

4.1 MQTT Smart Home Integration

MQTT Device Controller

#include <PubSubClient.h>
#include <ArduinoJson.h>

class SmartHomeController {
private:
    WiFiClient wifiClient;
    PubSubClient mqttClient;
    String deviceId;
    
    // Device states
    struct DeviceState {
        bool light_on = false;
        int brightness = 50;
        float temperature = 25.0;
        float humidity = 60.0;
        bool fan_on = false;
    } state;
    
public:
    SmartHomeController(String id) : deviceId(id), mqttClient(wifiClient) {}
    
    bool init(const char* mqtt_server, int mqtt_port) {
        mqttClient.setServer(mqtt_server, mqtt_port);
        mqttClient.setCallback([this](char* topic, byte* payload, unsigned int length) {
            this->onMqttMessage(topic, payload, length);
        });
        
        return connectMQTT();
    }
    
    bool connectMQTT() {
        Serial.println("Connecting to MQTT...");
        
        if (mqttClient.connect(deviceId.c_str())) {
            Serial.println("MQTT connected");
            
            // Subscribe to control topics
            mqttClient.subscribe(("xiaozhi/" + deviceId + "/control").c_str());
            mqttClient.subscribe("xiaozhi/broadcast");
            
            // Publish device online status
            publishDeviceStatus("online");
            
            return true;
        }
        
        return false;
    }
    
    void loop() {
        if (!mqttClient.connected()) {
            connectMQTT();
        }
        mqttClient.loop();
        
        // Publish sensor data every 30 seconds
        static unsigned long lastPublish = 0;
        if (millis() - lastPublish > 30000) {
            publishSensorData();
            lastPublish = millis();
        }
    }
    
    void handleVoiceCommand(String command) {
        if (command == "turn_on_light") {
            controlLight(true, state.brightness);
        } else if (command == "turn_off_light") {
            controlLight(false, 0);
        } else if (command.startsWith("set_brightness_")) {
            int brightness = command.substring(15).toInt();
            controlLight(state.light_on, brightness);
        } else if (command == "turn_on_fan") {
            controlFan(true);
        } else if (command == "turn_off_fan") {
            controlFan(false);
        }
    }
    
private:
    void onMqttMessage(char* topic, byte* payload, unsigned int length) {
        String message = "";
        for (unsigned int i = 0; i < length; i++) {
            message += (char)payload[i];
        }
        
        Serial.printf("MQTT message [%s]: %s\n", topic, message.c_str());
        
        DynamicJsonDocument doc(512);
        deserializeJson(doc, message);
        
        String device = doc["device"];
        String action = doc["action"];
        
        if (device == deviceId || device == "all") {
            executeAction(action, doc);
        }
    }
    
    void executeAction(String action, DynamicJsonDocument& params) {
        if (action == "light_control") {
            bool on = params["on"];
            int brightness = params["brightness"];
            controlLight(on, brightness);
        } else if (action == "fan_control") {
            bool on = params["on"];
            controlFan(on);
        } else if (action == "get_status") {
            publishDeviceStatus("status_response");
        }
    }
    
    void controlLight(bool on, int brightness) {
        state.light_on = on;
        state.brightness = brightness;
        
        // Control actual hardware
        if (on) {
            analogWrite(LED_PIN, map(brightness, 0, 100, 0, 255));
        } else {
            digitalWrite(LED_PIN, LOW);
        }
        
        // Publish state change
        DynamicJsonDocument doc(256);
        doc["device"] = deviceId;
        doc["type"] = "light_status";
        doc["on"] = on;
        doc["brightness"] = brightness;
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/status").c_str(), message.c_str());
        
        Serial.printf("Light %s, brightness: %d\n", on ? "ON" : "OFF", brightness);
    }
    
    void controlFan(bool on) {
        state.fan_on = on;
        
        // Control actual hardware
        digitalWrite(FAN_PIN, on ? HIGH : LOW);
        
        // Publish state change
        DynamicJsonDocument doc(256);
        doc["device"] = deviceId;
        doc["type"] = "fan_status";
        doc["on"] = on;
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/status").c_str(), message.c_str());
        
        Serial.printf("Fan %s\n", on ? "ON" : "OFF");
    }
    
    void publishSensorData() {
        // Read actual sensors (placeholder)
        state.temperature = 25.0 + random(-50, 50) / 10.0;
        state.humidity = 60.0 + random(-100, 100) / 10.0;
        
        DynamicJsonDocument doc(512);
        doc["device"] = deviceId;
        doc["type"] = "sensor_data";
        doc["timestamp"] = millis();
        doc["temperature"] = state.temperature;
        doc["humidity"] = state.humidity;
        doc["light_on"] = state.light_on;
        doc["fan_on"] = state.fan_on;
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/sensors").c_str(), message.c_str());
    }
    
    void publishDeviceStatus(String status) {
        DynamicJsonDocument doc(256);
        doc["device"] = deviceId;
        doc["status"] = status;
        doc["timestamp"] = millis();
        doc["ip"] = WiFi.localIP().toString();
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/device").c_str(), message.c_str());
    }
};

4.2 HomeAssistant Integration

HomeAssistant Auto-Discovery

class HomeAssistantIntegration {
private:
    SmartHomeController* controller;
    String deviceId;
    String deviceName;
    
public:
    HomeAssistantIntegration(SmartHomeController* ctrl, String id, String name) 
        : controller(ctrl), deviceId(id), deviceName(name) {}
    
    void publishAutoDiscovery() {
        publishLightDiscovery();
        publishSensorDiscovery();
        publishSwitchDiscovery();
    }
    
private:
    void publishLightDiscovery() {
        DynamicJsonDocument doc(1024);
        
        doc["name"] = deviceName + " Light";
        doc["unique_id"] = deviceId + "_light";
        doc["state_topic"] = "xiaozhi/" + deviceId + "/status";
        doc["command_topic"] = "xiaozhi/" + deviceId + "/control";
        doc["brightness_state_topic"] = "xiaozhi/" + deviceId + "/status";
        doc["brightness_command_topic"] = "xiaozhi/" + deviceId + "/control";
        doc["state_value_template"] = "{{ value_json.on }}";
        doc["brightness_value_template"] = "{{ value_json.brightness }}";
        doc["on_command_type"] = "first";
        doc["payload_on"] = "{\"action\":\"light_control\",\"on\":true}";
        doc["payload_off"] = "{\"action\":\"light_control\",\"on\":false}";
        
        // Device information
        doc["device"]["identifiers"][0] = deviceId;
        doc["device"]["name"] = deviceName;
        doc["device"]["model"] = "XiaoZhi ESP32-S3";
        doc["device"]["manufacturer"] = "XiaoZhi.Dev";
        
        String topic = "homeassistant/light/" + deviceId + "_light/config";
        String message;
        serializeJson(doc, message);
        
        controller->publishMessage(topic, message, true); // Retained message
    }
    
    void publishSensorDiscovery() {
        // Temperature sensor
        publishSensorConfig("temperature", "Temperature", "°C", "temperature");
        
        // Humidity sensor
        publishSensorConfig("humidity", "Humidity", "%", "humidity");
    }
    
    void publishSensorConfig(String sensor_type, String name, String unit, String device_class) {
        DynamicJsonDocument doc(512);
        
        doc["name"] = deviceName + " " + name;
        doc["unique_id"] = deviceId + "_" + sensor_type;
        doc["state_topic"] = "xiaozhi/" + deviceId + "/sensors";
        doc["value_template"] = "{{ value_json." + sensor_type + " }}";
        doc["unit_of_measurement"] = unit;
        doc["device_class"] = device_class;
        
        doc["device"]["identifiers"][0] = deviceId;
        doc["device"]["name"] = deviceName;
        
        String topic = "homeassistant/sensor/" + deviceId + "_" + sensor_type + "/config";
        String message;
        serializeJson(doc, message);
        
        controller->publishMessage(topic, message, true);
    }
    
    void publishSwitchDiscovery() {
        DynamicJsonDocument doc(512);
        
        doc["name"] = deviceName + " Fan";
        doc["unique_id"] = deviceId + "_fan";
        doc["state_topic"] = "xiaozhi/" + deviceId + "/status";
        doc["command_topic"] = "xiaozhi/" + deviceId + "/control";
        doc["state_value_template"] = "{{ value_json.fan_on }}";
        doc["payload_on"] = "{\"action\":\"fan_control\",\"on\":true}";
        doc["payload_off"] = "{\"action\":\"fan_control\",\"on\":false}";
        
        doc["device"]["identifiers"][0] = deviceId;
        doc["device"]["name"] = deviceName;
        
        String topic = "homeassistant/switch/" + deviceId + "_fan/config";
        String message;
        serializeJson(doc, message);
        
        controller->publishMessage(topic, message, true);
    }
};

V. Cloud Service Integration

5.1 Multi-Cloud AI Service Integration

AI Service Manager

class CloudAIManager {
private:
    String deepseek_api_key;
    String openai_api_key;
    String baidu_api_key;
    
    enum AIProvider {
        PROVIDER_DEEPSEEK,
        PROVIDER_OPENAI,
        PROVIDER_BAIDU
    };
    
    AIProvider current_provider = PROVIDER_DEEPSEEK;
    
public:
    void setAPIKeys(String deepseek, String openai, String baidu) {
        deepseek_api_key = deepseek;
        openai_api_key = openai;
        baidu_api_key = baidu;
    }
    
    String chatWithAI(String user_message) {
        switch (current_provider) {
            case PROVIDER_DEEPSEEK:
                return chatDeepSeek(user_message);
            case PROVIDER_OPENAI:
                return chatOpenAI(user_message);
            case PROVIDER_BAIDU:
                return chatBaidu(user_message);
            default:
                return "AI service not available";
        }
    }
    
    void switchProvider(AIProvider provider) {
        current_provider = provider;
        Serial.printf("Switched to AI provider: %d\n", provider);
    }
    
    bool testAllProviders() {
        String test_message = "Hello, can you respond to this test?";
        
        Serial.println("Testing AI providers...");
        
        // Test DeepSeek
        switchProvider(PROVIDER_DEEPSEEK);
        String response1 = chatWithAI(test_message);
        bool deepseek_ok = response1.length() > 0 && !response1.startsWith("Error");
        
        // Test OpenAI
        switchProvider(PROVIDER_OPENAI);
        String response2 = chatWithAI(test_message);
        bool openai_ok = response2.length() > 0 && !response2.startsWith("Error");
        
        // Test Baidu
        switchProvider(PROVIDER_BAIDU);
        String response3 = chatWithAI(test_message);
        bool baidu_ok = response3.length() > 0 && !response3.startsWith("Error");
        
        Serial.printf("Provider test results - DeepSeek: %s, OpenAI: %s, Baidu: %s\n",
            deepseek_ok ? "OK" : "FAIL",
            openai_ok ? "OK" : "FAIL", 
            baidu_ok ? "OK" : "FAIL");
        
        // Use the first working provider
        if (deepseek_ok) switchProvider(PROVIDER_DEEPSEEK);
        else if (openai_ok) switchProvider(PROVIDER_OPENAI);
        else if (baidu_ok) switchProvider(PROVIDER_BAIDU);
        
        return deepseek_ok || openai_ok || baidu_ok;
    }
    
private:
    String chatDeepSeek(String message) {
        HTTPClient http;
        http.begin("https://api.deepseek.com/chat/completions");
        http.addHeader("Content-Type", "application/json");
        http.addHeader("Authorization", "Bearer " + deepseek_api_key);
        
        DynamicJsonDocument request(1024);
        request["model"] = "deepseek-chat";
        request["messages"][0]["role"] = "user";
        request["messages"][0]["content"] = message;
        request["max_tokens"] = 100;
        
        String requestString;
        serializeJson(request, requestString);
        
        int httpResponseCode = http.POST(requestString);
        String response = "";
        
        if (httpResponseCode == 200) {
            String responseString = http.getString();
            DynamicJsonDocument responseDoc(2048);
            deserializeJson(responseDoc, responseString);
            response = responseDoc["choices"][0]["message"]["content"];
        } else {
            response = "Error: HTTP " + String(httpResponseCode);
        }
        
        http.end();
        return response;
    }
    
    String chatOpenAI(String message) {
        // Similar implementation for OpenAI API
        return "OpenAI response placeholder";
    }
    
    String chatBaidu(String message) {
        // Similar implementation for Baidu ERNIE API
        return "Baidu response placeholder";
    }
};

Performance Tips:

- Use PSRAM for large models and audio buffers
- Pin audio tasks to different CPU cores for optimal performance
- Implement watchdog timers for system reliability
- Use hardware encryption for secure communication

Related guides: ESP32-S3 Programming Development Guide | ESP32-S3 Development Troubleshooting Guide