ESP32-S3 Advanced Features Development

ESP32-S3 Advanced Features Development

This guide covers advanced features development for XiaoZhi ESP32-S3, including 4G communication, local AI inference, multimodal interaction, IoT device control, and cloud service integration.

I. 4G Communication Integration

1.1 ML307R Cat.1 Module Integration

4G Advantage: 4G Cat.1 provides stable network connection, suitable for scenarios where Wi-Fi is unavailable, with data rates up to 10Mbps

Hardware Connection

ML307R 4G Module Connection:
ESP32-S3        ML307R
GPIO11    →    TX (Module receive)
GPIO12    →    RX (Module transmit)  
GPIO13    →    RESET
GPIO14    →    PWR_ON
3.3V      →    VCC
GND       →    GND

Antenna: Connect 4G antenna to ML307R MAIN antenna port
SIM Card: Insert standard SIM card (supports 2G/3G/4G)

4G Module Control Class

#include <HardwareSerial.h>

class ML307R_4G {
private:
    HardwareSerial* modemSerial;
    int resetPin, powerPin;
    bool isConnected = false;
    
public:
    ML307R_4G(HardwareSerial* serial, int reset, int power) 
        : modemSerial(serial), resetPin(reset), powerPin(power) {}
    
    bool init() {
        pinMode(resetPin, OUTPUT);
        pinMode(powerPin, OUTPUT);
        
        // Power on sequence
        digitalWrite(powerPin, LOW);
        delay(1000);
        digitalWrite(powerPin, HIGH);
        delay(3000);
        
        // Reset module
        digitalWrite(resetPin, LOW);
        delay(500);
        digitalWrite(resetPin, HIGH);
        delay(5000);
        
        modemSerial->begin(115200);
        
        // Check module response
        return sendATCommand("AT", "OK", 3000);
    }
    
    bool connectNetwork() {
        Serial.println("Connecting to 4G network...");
        
        // Set APN (adjust according to your carrier)
        if (!sendATCommand("AT+CGDCONT=1,\"IP\",\"cmnet\"", "OK", 5000)) {
            return false;
        }
        
        // Activate PDP context
        if (!sendATCommand("AT+CGACT=1,1", "OK", 30000)) {
            return false;
        }
        
        // Check network registration
        if (sendATCommand("AT+CREG?", "+CREG: 0,1", 10000) || 
            sendATCommand("AT+CREG?", "+CREG: 0,5", 10000)) {
            isConnected = true;
            Serial.println("4G network connected successfully");
            return true;
        }
        
        return false;
    }
    
    String getNetworkInfo() {
        String response = sendATCommandWithResponse("AT+CSQ", 3000);
        int rssi = parseSignalStrength(response);
        
        response = sendATCommandWithResponse("AT+COPS?", 3000);
        String operator = parseOperator(response);
        
        return "Operator: " + operator + ", Signal: " + String(rssi) + " dBm";
    }
    
    bool sendHTTPRequest(String url, String data, String& response) {
        // Configure HTTP service
        if (!sendATCommand("AT+HTTPINIT", "OK", 3000)) return false;
        if (!sendATCommand("AT+HTTPPARA=\"CID\",1", "OK", 3000)) return false;
        if (!sendATCommand("AT+HTTPPARA=\"URL\",\"" + url + "\"", "OK", 3000)) return false;
        
        if (data.length() > 0) {
            // POST request
            if (!sendATCommand("AT+HTTPPARA=\"CONTENT\",\"application/json\"", "OK", 3000)) return false;
            if (!sendATCommand("AT+HTTPDATA=" + String(data.length()) + ",10000", "DOWNLOAD", 3000)) return false;
            
            modemSerial->print(data);
            delay(1000);
            
            if (!sendATCommand("AT+HTTPACTION=1", "+HTTPACTION: 1,200", 30000)) return false;
        } else {
            // GET request
            if (!sendATCommand("AT+HTTPACTION=0", "+HTTPACTION: 0,200", 30000)) return false;
        }
        
        // Read response
        if (sendATCommand("AT+HTTPREAD", "OK", 5000)) {
            response = getLastATResponse();
        }
        
        sendATCommand("AT+HTTPTERM", "OK", 3000);
        return true;
    }
    
private:
    bool sendATCommand(String command, String expected, int timeout) {
        modemSerial->println(command);
        return waitForResponse(expected, timeout);
    }
    
    String sendATCommandWithResponse(String command, int timeout) {
        modemSerial->println(command);
        return readResponse(timeout);
    }
    
    bool waitForResponse(String expected, int timeout) {
        String response = readResponse(timeout);
        return response.indexOf(expected) >= 0;
    }
    
    String readResponse(int timeout) {
        String response = "";
        unsigned long start = millis();
        
        while (millis() - start < timeout) {
            if (modemSerial->available()) {
                response += modemSerial->readString();
                break;
            }
            delay(10);
        }
        
        return response;
    }
};

// Usage example
ML307R_4G modem(&Serial1, 13, 14);

void setup4G() {
    if (modem.init()) {
        Serial.println("4G module initialized");
        
        if (modem.connectNetwork()) {
            Serial.println("4G network connected");
            Serial.println(modem.getNetworkInfo());
        }
    }
}

1.2 4G + Wi-Fi Hybrid Networking

Intelligent Network Switching

class NetworkManager {
private:
    ML307R_4G* modem;
    bool wifi4GEnabled = false;
    bool preferWiFi = true;
    
public:
    enum NetworkType {
        NETWORK_NONE,
        NETWORK_WIFI,
        NETWORK_4G
    };
    
    NetworkType currentNetwork = NETWORK_NONE;
    
    NetworkManager(ML307R_4G* m) : modem(m) {}
    
    bool connectBestNetwork() {
        if (preferWiFi && connectWiFi()) {
            currentNetwork = NETWORK_WIFI;
            Serial.println("Using Wi-Fi connection");
            return true;
        }
        
        if (wifi4GEnabled && modem->connectNetwork()) {
            currentNetwork = NETWORK_4G;
            Serial.println("Using 4G connection");
            return true;
        }
        
        currentNetwork = NETWORK_NONE;
        return false;
    }
    
    bool sendDataToCloud(String data) {
        if (currentNetwork == NETWORK_WIFI) {
            return sendViaWiFi(data);
        } else if (currentNetwork == NETWORK_4G) {
            return sendVia4G(data);
        }
        return false;
    }
    
    void enable4GBackup(bool enable) {
        wifi4GEnabled = enable;
    }
    
    void monitorConnection() {
        static unsigned long lastCheck = 0;
        if (millis() - lastCheck > 30000) { // Check every 30 seconds
            if (currentNetwork == NETWORK_WIFI && WiFi.status() != WL_CONNECTED) {
                Serial.println("Wi-Fi disconnected, switching to 4G...");
                connectBestNetwork();
            }
            lastCheck = millis();
        }
    }
    
private:
    bool connectWiFi() {
        // Implement Wi-Fi connection logic
        return WiFi.status() == WL_CONNECTED;
    }
    
    bool sendViaWiFi(String data) {
        // Implement Wi-Fi data transmission
        return true;
    }
    
    bool sendVia4G(String data) {
        String response;
        return modem->sendHTTPRequest("https://api.yourserver.com/data", data, response);
    }
};

II. Local AI Inference

2.1 TensorFlow Lite Micro Integration

Model Deployment

#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"

// Include your model (converted to C array)
#include "voice_command_model.h"

class EdgeAIEngine {
private:
    tflite::MicroErrorReporter micro_error_reporter;
    tflite::AllOpsResolver resolver;
    const tflite::Model* model;
    tflite::MicroInterpreter* interpreter;
    TfLiteTensor* input;
    TfLiteTensor* output;
    
    // Arena for model execution (in PSRAM)
    constexpr static int kTensorArenaSize = 100 * 1024; // 100KB
    uint8_t* tensor_arena;
    
public:
    bool init() {
        // Allocate memory in PSRAM
        tensor_arena = (uint8_t*)ps_malloc(kTensorArenaSize);
        if (tensor_arena == nullptr) {
            Serial.println("Failed to allocate tensor arena");
            return false;
        }
        
        // Load model
        model = tflite::GetModel(voice_command_model);
        if (model->version() != TFLITE_SCHEMA_VERSION) {
            Serial.println("Model schema version mismatch");
            return false;
        }
        
        // Create interpreter
        static tflite::MicroInterpreter static_interpreter(
            model, resolver, tensor_arena, kTensorArenaSize, &micro_error_reporter);
        interpreter = &static_interpreter;
        
        // Allocate tensors
        TfLiteStatus allocate_status = interpreter->AllocateTensors();
        if (allocate_status != kTfLiteOk) {
            Serial.println("AllocateTensors() failed");
            return false;
        }
        
        // Get input and output tensors
        input = interpreter->input(0);
        output = interpreter->output(0);
        
        Serial.printf("Model loaded successfully\n");
        Serial.printf("Input shape: [%d, %d, %d, %d]\n", 
            input->dims->data[0], input->dims->data[1], 
            input->dims->data[2], input->dims->data[3]);
        
        return true;
    }
    
    int classifyVoiceCommand(float* audio_features, int feature_length) {
        // Copy features to input tensor
        for (int i = 0; i < feature_length; i++) {
            input->data.f[i] = audio_features[i];
        }
        
        // Run inference
        TfLiteStatus invoke_status = interpreter->Invoke();
        if (invoke_status != kTfLiteOk) {
            Serial.println("Invoke failed");
            return -1;
        }
        
        // Find prediction with highest confidence
        float max_confidence = 0;
        int predicted_class = -1;
        
        for (int i = 0; i < output->dims->data[1]; i++) {
            float confidence = output->data.f[i];
            if (confidence > max_confidence) {
                max_confidence = confidence;
                predicted_class = i;
            }
        }
        
        Serial.printf("Predicted class: %d, confidence: %.2f\n", 
            predicted_class, max_confidence);
        
        return (max_confidence > 0.7) ? predicted_class : -1; // Threshold 0.7
    }
    
    void printModelInfo() {
        Serial.println("=== Model Information ===");
        Serial.printf("Model size: %d bytes\n", voice_command_model_len);
        Serial.printf("Input type: %s\n", TfLiteTypeGetName(input->type));
        Serial.printf("Output type: %s\n", TfLiteTypeGetName(output->type));
        Serial.printf("Arena usage: %d bytes\n", interpreter->arena_used_bytes());
    }
};

2.2 Audio Feature Extraction

MFCC Feature Extraction

#include <math.h>

class AudioFeatureExtractor {
private:
    static const int SAMPLE_RATE = 16000;
    static const int FRAME_SIZE = 512;
    static const int HOP_SIZE = 256;
    static const int NUM_MFCC = 13;
    static const int NUM_FILTERS = 26;
    
    float hamming_window[FRAME_SIZE];
    float mel_filters[NUM_FILTERS][FRAME_SIZE/2 + 1];
    float dct_matrix[NUM_MFCC][NUM_FILTERS];
    
public:
    bool init() {
        // Generate Hamming window
        for (int i = 0; i < FRAME_SIZE; i++) {
            hamming_window[i] = 0.54 - 0.46 * cos(2 * M_PI * i / (FRAME_SIZE - 1));
        }
        
        // Generate Mel filter bank
        generateMelFilters();
        
        // Generate DCT matrix
        generateDCTMatrix();
        
        return true;
    }
    
    void extractMFCC(int16_t* audio_samples, int num_samples, float* mfcc_features) {
        int num_frames = (num_samples - FRAME_SIZE) / HOP_SIZE + 1;
        
        for (int frame = 0; frame < num_frames; frame++) {
            float frame_features[NUM_MFCC];
            
            // Extract features for current frame
            int frame_start = frame * HOP_SIZE;
            extractFrameMFCC(&audio_samples[frame_start], frame_features);
            
            // Copy to output buffer
            for (int i = 0; i < NUM_MFCC; i++) {
                mfcc_features[frame * NUM_MFCC + i] = frame_features[i];
            }
        }
    }
    
private:
    void extractFrameMFCC(int16_t* frame_samples, float* mfcc_out) {
        float windowed_frame[FRAME_SIZE];
        float fft_result[FRAME_SIZE/2 + 1];
        float mel_energies[NUM_FILTERS];
        
        // Apply window function
        for (int i = 0; i < FRAME_SIZE; i++) {
            windowed_frame[i] = frame_samples[i] * hamming_window[i];
        }
        
        // Compute FFT (simplified - use real FFT library)
        computeFFT(windowed_frame, fft_result);
        
        // Apply Mel filter bank
        for (int i = 0; i < NUM_FILTERS; i++) {
            mel_energies[i] = 0;
            for (int j = 0; j < FRAME_SIZE/2 + 1; j++) {
                mel_energies[i] += fft_result[j] * mel_filters[i][j];
            }
            mel_energies[i] = log(mel_energies[i] + 1e-10); // Log energy
        }
        
        // Apply DCT
        for (int i = 0; i < NUM_MFCC; i++) {
            mfcc_out[i] = 0;
            for (int j = 0; j < NUM_FILTERS; j++) {
                mfcc_out[i] += mel_energies[j] * dct_matrix[i][j];
            }
        }
    }
    
    void generateMelFilters() {
        // Simplified Mel filter bank generation
        float mel_low = 0;
        float mel_high = 2595 * log10(1 + (SAMPLE_RATE/2) / 700.0);
        float mel_step = (mel_high - mel_low) / (NUM_FILTERS + 1);
        
        for (int i = 0; i < NUM_FILTERS; i++) {
            float center_mel = mel_low + (i + 1) * mel_step;
            float center_freq = 700 * (pow(10, center_mel / 2595) - 1);
            
            int center_bin = (int)(center_freq * FRAME_SIZE / SAMPLE_RATE);
            
            // Generate triangular filter
            for (int j = 0; j < FRAME_SIZE/2 + 1; j++) {
                if (abs(j - center_bin) < 20) { // Simplified triangular filter
                    mel_filters[i][j] = 1.0 - abs(j - center_bin) / 20.0;
                } else {
                    mel_filters[i][j] = 0;
                }
            }
        }
    }
    
    void generateDCTMatrix() {
        for (int i = 0; i < NUM_MFCC; i++) {
            for (int j = 0; j < NUM_FILTERS; j++) {
                dct_matrix[i][j] = cos(M_PI * i * (j + 0.5) / NUM_FILTERS);
            }
        }
    }
    
    void computeFFT(float* input, float* output) {
        // Placeholder - implement real FFT
        // Use ESP-DSP library or other FFT implementation
        for (int i = 0; i < FRAME_SIZE/2 + 1; i++) {
            output[i] = input[i] * input[i]; // Simplified power spectrum
        }
    }
};

2.3 Voice Command Recognition System

Complete Voice Command Pipeline

class VoiceCommandSystem {
private:
    EdgeAIEngine* aiEngine;
    AudioFeatureExtractor* featureExtractor;
    VoiceActivityDetector* vad;
    
    // Command definitions
    const char* commands[5] = {
        "turn_on_light",
        "turn_off_light", 
        "play_music",
        "stop_music",
        "unknown"
    };
    
    // Audio buffer for 2 seconds at 16kHz
    static const int AUDIO_BUFFER_SIZE = 32000;
    int16_t audio_buffer[AUDIO_BUFFER_SIZE];
    int buffer_index = 0;
    bool recording = false;
    
public:
    VoiceCommandSystem(EdgeAIEngine* ai, AudioFeatureExtractor* fe, VoiceActivityDetector* v)
        : aiEngine(ai), featureExtractor(fe), vad(v) {}
    
    bool init() {
        return aiEngine->init() && featureExtractor->init();
    }
    
    void processAudioFrame(int16_t* samples, int length) {
        // Voice activity detection
        bool voice_detected = vad->detectVoiceActivity(samples, length);
        
        if (voice_detected && !recording) {
            // Start recording
            recording = true;
            buffer_index = 0;
            Serial.println("Voice detected - start recording");
        }
        
        if (recording) {
            // Add samples to buffer
            for (int i = 0; i < length && buffer_index < AUDIO_BUFFER_SIZE; i++) {
                audio_buffer[buffer_index++] = samples[i];
            }
            
            // Check if recording should stop
            if (!voice_detected || buffer_index >= AUDIO_BUFFER_SIZE) {
                processVoiceCommand();
                recording = false;
            }
        }
    }
    
private:
    void processVoiceCommand() {
        Serial.printf("Processing voice command (%d samples)\n", buffer_index);
        
        // Extract MFCC features
        int num_frames = (buffer_index - 512) / 256 + 1;
        float* mfcc_features = (float*)malloc(num_frames * 13 * sizeof(float));
        
        featureExtractor->extractMFCC(audio_buffer, buffer_index, mfcc_features);
        
        // Run AI inference
        int predicted_command = aiEngine->classifyVoiceCommand(mfcc_features, num_frames * 13);
        
        if (predicted_command >= 0 && predicted_command < 4) {
            Serial.printf("Recognized command: %s\n", commands[predicted_command]);
            executeCommand(predicted_command);
        } else {
            Serial.println("Unknown command or low confidence");
        }
        
        free(mfcc_features);
    }
    
    void executeCommand(int command_id) {
        switch (command_id) {
            case 0: // turn_on_light
                digitalWrite(LED_PIN, HIGH);
                Serial.println("Light turned ON");
                break;
                
            case 1: // turn_off_light
                digitalWrite(LED_PIN, LOW);
                Serial.println("Light turned OFF");
                break;
                
            case 2: // play_music
                // Implement music playback
                Serial.println("Playing music");
                break;
                
            case 3: // stop_music
                // Implement music stop
                Serial.println("Stopping music");
                break;
        }
    }
};

III. Multimodal Interaction

3.1 Camera Integration

ESP32-CAM Module Integration

#include "esp_camera.h"
#include "esp_jpg_decode.h"

class CameraManager {
private:
    camera_config_t config;
    bool initialized = false;
    
public:
    bool init() {
        // Camera pin configuration for ESP32-CAM
        config.ledc_channel = LEDC_CHANNEL_0;
        config.ledc_timer = LEDC_TIMER_0;
        config.pin_d0 = 5;
        config.pin_d1 = 18;
        config.pin_d2 = 19;
        config.pin_d3 = 21;
        config.pin_d4 = 36;
        config.pin_d5 = 39;
        config.pin_d6 = 34;
        config.pin_d7 = 35;
        config.pin_xclk = 0;
        config.pin_pclk = 22;
        config.pin_vsync = 25;
        config.pin_href = 23;
        config.pin_sscb_sda = 26;
        config.pin_sscb_scl = 27;
        config.pin_pwdn = 32;
        config.pin_reset = -1;
        config.xclk_freq_hz = 20000000;
        config.pixel_format = PIXFORMAT_JPEG;
        
        // Image quality settings
        if (psramFound()) {
            config.frame_size = FRAMESIZE_SVGA; // 800x600
            config.jpeg_quality = 10;
            config.fb_count = 2;
        } else {
            config.frame_size = FRAMESIZE_VGA; // 640x480
            config.jpeg_quality = 12;
            config.fb_count = 1;
        }
        
        esp_err_t err = esp_camera_init(&config);
        if (err != ESP_OK) {
            Serial.printf("Camera init failed with error 0x%x\n", err);
            return false;
        }
        
        initialized = true;
        Serial.println("Camera initialized successfully");
        return true;
    }
    
    bool captureImage(uint8_t** image_buffer, size_t* image_size) {
        if (!initialized) return false;
        
        camera_fb_t* fb = esp_camera_fb_get();
        if (!fb) {
            Serial.println("Camera capture failed");
            return false;
        }
        
        *image_buffer = fb->buf;
        *image_size = fb->len;
        
        // Note: Remember to call esp_camera_fb_return(fb) after using the image
        return true;
    }
    
    void releaseImage(camera_fb_t* fb) {
        esp_camera_fb_return(fb);
    }
    
    bool streamToWebSocket(WebSocketsServer& webSocket, int client_id) {
        camera_fb_t* fb = esp_camera_fb_get();
        if (!fb) return false;
        
        // Convert to base64 for web transmission
        String base64_image = base64::encode(fb->buf, fb->len);
        String json_message = "{\"type\":\"image\",\"data\":\"" + base64_image + "\"}";
        
        webSocket.sendTXT(client_id, json_message);
        
        esp_camera_fb_return(fb);
        return true;
    }
};

3.2 Vision + Voice Multimodal AI

Scene Understanding with Voice Description

class MultimodalAI {
private:
    CameraManager* camera;
    VoiceCommandSystem* voice;
    
    // Simple object detection results
    struct DetectionResult {
        String object_name;
        float confidence;
        int x, y, width, height;
    };
    
public:
    MultimodalAI(CameraManager* cam, VoiceCommandSystem* v) 
        : camera(cam), voice(v) {}
    
    String analyzeScene() {
        uint8_t* image_buffer;
        size_t image_size;
        
        if (!camera->captureImage(&image_buffer, &image_size)) {
            return "Camera capture failed";
        }
        
        // Simple color-based object detection (placeholder)
        std::vector<DetectionResult> objects = detectObjects(image_buffer, image_size);
        
        // Generate scene description
        String description = generateSceneDescription(objects);
        
        // Release camera buffer
        camera->releaseImage(nullptr); // Simplified
        
        return description;
    }
    
    void handleVoiceVisualQuery(String query) {
        if (query.indexOf("what do you see") >= 0) {
            String scene = analyzeScene();
            speakText("I can see " + scene);
        } else if (query.indexOf("take a photo") >= 0) {
            if (captureAndSavePhoto()) {
                speakText("Photo captured successfully");
            } else {
                speakText("Failed to capture photo");
            }
        } else if (query.indexOf("count") >= 0) {
            int object_count = countObjectsInScene();
            speakText("I can see " + String(object_count) + " objects");
        }
    }
    
private:
    std::vector<DetectionResult> detectObjects(uint8_t* image, size_t size) {
        std::vector<DetectionResult> results;
        
        // Placeholder: Simple color-based detection
        // In real implementation, use TensorFlow Lite model for object detection
        
        DetectionResult person;
        person.object_name = "person";
        person.confidence = 0.85;
        person.x = 100; person.y = 50;
        person.width = 200; person.height = 300;
        results.push_back(person);
        
        return results;
    }
    
    String generateSceneDescription(const std::vector<DetectionResult>& objects) {
        if (objects.empty()) {
            return "an empty scene";
        }
        
        String description = "";
        for (size_t i = 0; i < objects.size(); i++) {
            if (i > 0) description += ", ";
            description += objects[i].object_name;
        }
        
        return description;
    }
    
    bool captureAndSavePhoto() {
        // Implement photo capture and storage
        return true;
    }
    
    int countObjectsInScene() {
        uint8_t* image_buffer;
        size_t image_size;
        
        if (camera->captureImage(&image_buffer, &image_size)) {
            auto objects = detectObjects(image_buffer, image_size);
            camera->releaseImage(nullptr);
            return objects.size();
        }
        
        return 0;
    }
    
    void speakText(String text) {
        // Implement TTS (Text-to-Speech)
        Serial.println("Speaking: " + text);
    }
};

IV. IoT Device Control

4.1 MQTT Smart Home Integration

MQTT Device Controller

#include <PubSubClient.h>
#include <ArduinoJson.h>

class SmartHomeController {
private:
    WiFiClient wifiClient;
    PubSubClient mqttClient;
    String deviceId;
    
    // Device states
    struct DeviceState {
        bool light_on = false;
        int brightness = 50;
        float temperature = 25.0;
        float humidity = 60.0;
        bool fan_on = false;
    } state;
    
public:
    SmartHomeController(String id) : deviceId(id), mqttClient(wifiClient) {}
    
    bool init(const char* mqtt_server, int mqtt_port) {
        mqttClient.setServer(mqtt_server, mqtt_port);
        mqttClient.setCallback([this](char* topic, byte* payload, unsigned int length) {
            this->onMqttMessage(topic, payload, length);
        });
        
        return connectMQTT();
    }
    
    bool connectMQTT() {
        Serial.println("Connecting to MQTT...");
        
        if (mqttClient.connect(deviceId.c_str())) {
            Serial.println("MQTT connected");
            
            // Subscribe to control topics
            mqttClient.subscribe(("xiaozhi/" + deviceId + "/control").c_str());
            mqttClient.subscribe("xiaozhi/broadcast");
            
            // Publish device online status
            publishDeviceStatus("online");
            
            return true;
        }
        
        return false;
    }
    
    void loop() {
        if (!mqttClient.connected()) {
            connectMQTT();
        }
        mqttClient.loop();
        
        // Publish sensor data every 30 seconds
        static unsigned long lastPublish = 0;
        if (millis() - lastPublish > 30000) {
            publishSensorData();
            lastPublish = millis();
        }
    }
    
    void handleVoiceCommand(String command) {
        if (command == "turn_on_light") {
            controlLight(true, state.brightness);
        } else if (command == "turn_off_light") {
            controlLight(false, 0);
        } else if (command.startsWith("set_brightness_")) {
            int brightness = command.substring(15).toInt();
            controlLight(state.light_on, brightness);
        } else if (command == "turn_on_fan") {
            controlFan(true);
        } else if (command == "turn_off_fan") {
            controlFan(false);
        }
    }
    
private:
    void onMqttMessage(char* topic, byte* payload, unsigned int length) {
        String message = "";
        for (unsigned int i = 0; i < length; i++) {
            message += (char)payload[i];
        }
        
        Serial.printf("MQTT message [%s]: %s\n", topic, message.c_str());
        
        DynamicJsonDocument doc(512);
        deserializeJson(doc, message);
        
        String device = doc["device"];
        String action = doc["action"];
        
        if (device == deviceId || device == "all") {
            executeAction(action, doc);
        }
    }
    
    void executeAction(String action, DynamicJsonDocument& params) {
        if (action == "light_control") {
            bool on = params["on"];
            int brightness = params["brightness"];
            controlLight(on, brightness);
        } else if (action == "fan_control") {
            bool on = params["on"];
            controlFan(on);
        } else if (action == "get_status") {
            publishDeviceStatus("status_response");
        }
    }
    
    void controlLight(bool on, int brightness) {
        state.light_on = on;
        state.brightness = brightness;
        
        // Control actual hardware
        if (on) {
            analogWrite(LED_PIN, map(brightness, 0, 100, 0, 255));
        } else {
            digitalWrite(LED_PIN, LOW);
        }
        
        // Publish state change
        DynamicJsonDocument doc(256);
        doc["device"] = deviceId;
        doc["type"] = "light_status";
        doc["on"] = on;
        doc["brightness"] = brightness;
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/status").c_str(), message.c_str());
        
        Serial.printf("Light %s, brightness: %d\n", on ? "ON" : "OFF", brightness);
    }
    
    void controlFan(bool on) {
        state.fan_on = on;
        
        // Control actual hardware
        digitalWrite(FAN_PIN, on ? HIGH : LOW);
        
        // Publish state change
        DynamicJsonDocument doc(256);
        doc["device"] = deviceId;
        doc["type"] = "fan_status";
        doc["on"] = on;
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/status").c_str(), message.c_str());
        
        Serial.printf("Fan %s\n", on ? "ON" : "OFF");
    }
    
    void publishSensorData() {
        // Read actual sensors (placeholder)
        state.temperature = 25.0 + random(-50, 50) / 10.0;
        state.humidity = 60.0 + random(-100, 100) / 10.0;
        
        DynamicJsonDocument doc(512);
        doc["device"] = deviceId;
        doc["type"] = "sensor_data";
        doc["timestamp"] = millis();
        doc["temperature"] = state.temperature;
        doc["humidity"] = state.humidity;
        doc["light_on"] = state.light_on;
        doc["fan_on"] = state.fan_on;
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/sensors").c_str(), message.c_str());
    }
    
    void publishDeviceStatus(String status) {
        DynamicJsonDocument doc(256);
        doc["device"] = deviceId;
        doc["status"] = status;
        doc["timestamp"] = millis();
        doc["ip"] = WiFi.localIP().toString();
        
        String message;
        serializeJson(doc, message);
        mqttClient.publish(("xiaozhi/" + deviceId + "/device").c_str(), message.c_str());
    }
};

4.2 HomeAssistant Integration

HomeAssistant Auto-Discovery

class HomeAssistantIntegration {
private:
    SmartHomeController* controller;
    String deviceId;
    String deviceName;
    
public:
    HomeAssistantIntegration(SmartHomeController* ctrl, String id, String name) 
        : controller(ctrl), deviceId(id), deviceName(name) {}
    
    void publishAutoDiscovery() {
        publishLightDiscovery();
        publishSensorDiscovery();
        publishSwitchDiscovery();
    }
    
private:
    void publishLightDiscovery() {
        DynamicJsonDocument doc(1024);
        
        doc["name"] = deviceName + " Light";
        doc["unique_id"] = deviceId + "_light";
        doc["state_topic"] = "xiaozhi/" + deviceId + "/status";
        doc["command_topic"] = "xiaozhi/" + deviceId + "/control";
        doc["brightness_state_topic"] = "xiaozhi/" + deviceId + "/status";
        doc["brightness_command_topic"] = "xiaozhi/" + deviceId + "/control";
        doc["state_value_template"] = "{{ value_json.on }}";
        doc["brightness_value_template"] = "{{ value_json.brightness }}";
        doc["on_command_type"] = "first";
        doc["payload_on"] = "{\"action\":\"light_control\",\"on\":true}";
        doc["payload_off"] = "{\"action\":\"light_control\",\"on\":false}";
        
        // Device information
        doc["device"]["identifiers"][0] = deviceId;
        doc["device"]["name"] = deviceName;
        doc["device"]["model"] = "XiaoZhi ESP32-S3";
        doc["device"]["manufacturer"] = "XiaoZhi.Dev";
        
        String topic = "homeassistant/light/" + deviceId + "_light/config";
        String message;
        serializeJson(doc, message);
        
        controller->publishMessage(topic, message, true); // Retained message
    }
    
    void publishSensorDiscovery() {
        // Temperature sensor
        publishSensorConfig("temperature", "Temperature", "°C", "temperature");
        
        // Humidity sensor
        publishSensorConfig("humidity", "Humidity", "%", "humidity");
    }
    
    void publishSensorConfig(String sensor_type, String name, String unit, String device_class) {
        DynamicJsonDocument doc(512);
        
        doc["name"] = deviceName + " " + name;
        doc["unique_id"] = deviceId + "_" + sensor_type;
        doc["state_topic"] = "xiaozhi/" + deviceId + "/sensors";
        doc["value_template"] = "{{ value_json." + sensor_type + " }}";
        doc["unit_of_measurement"] = unit;
        doc["device_class"] = device_class;
        
        doc["device"]["identifiers"][0] = deviceId;
        doc["device"]["name"] = deviceName;
        
        String topic = "homeassistant/sensor/" + deviceId + "_" + sensor_type + "/config";
        String message;
        serializeJson(doc, message);
        
        controller->publishMessage(topic, message, true);
    }
    
    void publishSwitchDiscovery() {
        DynamicJsonDocument doc(512);
        
        doc["name"] = deviceName + " Fan";
        doc["unique_id"] = deviceId + "_fan";
        doc["state_topic"] = "xiaozhi/" + deviceId + "/status";
        doc["command_topic"] = "xiaozhi/" + deviceId + "/control";
        doc["state_value_template"] = "{{ value_json.fan_on }}";
        doc["payload_on"] = "{\"action\":\"fan_control\",\"on\":true}";
        doc["payload_off"] = "{\"action\":\"fan_control\",\"on\":false}";
        
        doc["device"]["identifiers"][0] = deviceId;
        doc["device"]["name"] = deviceName;
        
        String topic = "homeassistant/switch/" + deviceId + "_fan/config";
        String message;
        serializeJson(doc, message);
        
        controller->publishMessage(topic, message, true);
    }
};

V. Cloud Service Integration

5.1 Multi-Cloud AI Service Integration

AI Service Manager

class CloudAIManager {
private:
    String deepseek_api_key;
    String openai_api_key;
    String baidu_api_key;
    
    enum AIProvider {
        PROVIDER_DEEPSEEK,
        PROVIDER_OPENAI,
        PROVIDER_BAIDU
    };
    
    AIProvider current_provider = PROVIDER_DEEPSEEK;
    
public:
    void setAPIKeys(String deepseek, String openai, String baidu) {
        deepseek_api_key = deepseek;
        openai_api_key = openai;
        baidu_api_key = baidu;
    }
    
    String chatWithAI(String user_message) {
        switch (current_provider) {
            case PROVIDER_DEEPSEEK:
                return chatDeepSeek(user_message);
            case PROVIDER_OPENAI:
                return chatOpenAI(user_message);
            case PROVIDER_BAIDU:
                return chatBaidu(user_message);
            default:
                return "AI service not available";
        }
    }
    
    void switchProvider(AIProvider provider) {
        current_provider = provider;
        Serial.printf("Switched to AI provider: %d\n", provider);
    }
    
    bool testAllProviders() {
        String test_message = "Hello, can you respond to this test?";
        
        Serial.println("Testing AI providers...");
        
        // Test DeepSeek
        switchProvider(PROVIDER_DEEPSEEK);
        String response1 = chatWithAI(test_message);
        bool deepseek_ok = response1.length() > 0 && !response1.startsWith("Error");
        
        // Test OpenAI
        switchProvider(PROVIDER_OPENAI);
        String response2 = chatWithAI(test_message);
        bool openai_ok = response2.length() > 0 && !response2.startsWith("Error");
        
        // Test Baidu
        switchProvider(PROVIDER_BAIDU);
        String response3 = chatWithAI(test_message);
        bool baidu_ok = response3.length() > 0 && !response3.startsWith("Error");
        
        Serial.printf("Provider test results - DeepSeek: %s, OpenAI: %s, Baidu: %s\n",
            deepseek_ok ? "OK" : "FAIL",
            openai_ok ? "OK" : "FAIL", 
            baidu_ok ? "OK" : "FAIL");
        
        // Use the first working provider
        if (deepseek_ok) switchProvider(PROVIDER_DEEPSEEK);
        else if (openai_ok) switchProvider(PROVIDER_OPENAI);
        else if (baidu_ok) switchProvider(PROVIDER_BAIDU);
        
        return deepseek_ok || openai_ok || baidu_ok;
    }
    
private:
    String chatDeepSeek(String message) {
        HTTPClient http;
        http.begin("https://api.deepseek.com/chat/completions");
        http.addHeader("Content-Type", "application/json");
        http.addHeader("Authorization", "Bearer " + deepseek_api_key);
        
        DynamicJsonDocument request(1024);
        request["model"] = "deepseek-chat";
        request["messages"][0]["role"] = "user";
        request["messages"][0]["content"] = message;
        request["max_tokens"] = 100;
        
        String requestString;
        serializeJson(request, requestString);
        
        int httpResponseCode = http.POST(requestString);
        String response = "";
        
        if (httpResponseCode == 200) {
            String responseString = http.getString();
            DynamicJsonDocument responseDoc(2048);
            deserializeJson(responseDoc, responseString);
            response = responseDoc["choices"][0]["message"]["content"];
        } else {
            response = "Error: HTTP " + String(httpResponseCode);
        }
        
        http.end();
        return response;
    }
    
    String chatOpenAI(String message) {
        // Similar implementation for OpenAI API
        return "OpenAI response placeholder";
    }
    
    String chatBaidu(String message) {
        // Similar implementation for Baidu ERNIE API
        return "Baidu response placeholder";
    }
};

Performance Tips:

  • Use PSRAM for large models and audio buffers
  • Pin audio tasks to different CPU cores for optimal performance
  • Implement watchdog timers for system reliability
  • Use hardware encryption for secure communication