ESP32-S3 Advanced Features Development
ESP32-S3 Advanced Features Development
This guide covers advanced features development for XiaoZhi ESP32-S3, including 4G communication, local AI inference, multimodal interaction, IoT device control, and cloud service integration.
I. 4G Communication Integration
1.1 ML307R Cat.1 Module Integration
4G Advantage: 4G Cat.1 provides stable network connection, suitable for scenarios where Wi-Fi is unavailable, with data rates up to 10Mbps
Hardware Connection
ML307R 4G Module Connection:
ESP32-S3 ML307R
GPIO11 → TX (Module receive)
GPIO12 → RX (Module transmit)
GPIO13 → RESET
GPIO14 → PWR_ON
3.3V → VCC
GND → GND
Antenna: Connect 4G antenna to ML307R MAIN antenna port
SIM Card: Insert standard SIM card (supports 2G/3G/4G)
4G Module Control Class
#include <HardwareSerial.h>
class ML307R_4G {
private:
HardwareSerial* modemSerial;
int resetPin, powerPin;
bool isConnected = false;
public:
ML307R_4G(HardwareSerial* serial, int reset, int power)
: modemSerial(serial), resetPin(reset), powerPin(power) {}
bool init() {
pinMode(resetPin, OUTPUT);
pinMode(powerPin, OUTPUT);
// Power on sequence
digitalWrite(powerPin, LOW);
delay(1000);
digitalWrite(powerPin, HIGH);
delay(3000);
// Reset module
digitalWrite(resetPin, LOW);
delay(500);
digitalWrite(resetPin, HIGH);
delay(5000);
modemSerial->begin(115200);
// Check module response
return sendATCommand("AT", "OK", 3000);
}
bool connectNetwork() {
Serial.println("Connecting to 4G network...");
// Set APN (adjust according to your carrier)
if (!sendATCommand("AT+CGDCONT=1,\"IP\",\"cmnet\"", "OK", 5000)) {
return false;
}
// Activate PDP context
if (!sendATCommand("AT+CGACT=1,1", "OK", 30000)) {
return false;
}
// Check network registration
if (sendATCommand("AT+CREG?", "+CREG: 0,1", 10000) ||
sendATCommand("AT+CREG?", "+CREG: 0,5", 10000)) {
isConnected = true;
Serial.println("4G network connected successfully");
return true;
}
return false;
}
String getNetworkInfo() {
String response = sendATCommandWithResponse("AT+CSQ", 3000);
int rssi = parseSignalStrength(response);
response = sendATCommandWithResponse("AT+COPS?", 3000);
String operator = parseOperator(response);
return "Operator: " + operator + ", Signal: " + String(rssi) + " dBm";
}
bool sendHTTPRequest(String url, String data, String& response) {
// Configure HTTP service
if (!sendATCommand("AT+HTTPINIT", "OK", 3000)) return false;
if (!sendATCommand("AT+HTTPPARA=\"CID\",1", "OK", 3000)) return false;
if (!sendATCommand("AT+HTTPPARA=\"URL\",\"" + url + "\"", "OK", 3000)) return false;
if (data.length() > 0) {
// POST request
if (!sendATCommand("AT+HTTPPARA=\"CONTENT\",\"application/json\"", "OK", 3000)) return false;
if (!sendATCommand("AT+HTTPDATA=" + String(data.length()) + ",10000", "DOWNLOAD", 3000)) return false;
modemSerial->print(data);
delay(1000);
if (!sendATCommand("AT+HTTPACTION=1", "+HTTPACTION: 1,200", 30000)) return false;
} else {
// GET request
if (!sendATCommand("AT+HTTPACTION=0", "+HTTPACTION: 0,200", 30000)) return false;
}
// Read response
if (sendATCommand("AT+HTTPREAD", "OK", 5000)) {
response = getLastATResponse();
}
sendATCommand("AT+HTTPTERM", "OK", 3000);
return true;
}
private:
bool sendATCommand(String command, String expected, int timeout) {
modemSerial->println(command);
return waitForResponse(expected, timeout);
}
String sendATCommandWithResponse(String command, int timeout) {
modemSerial->println(command);
return readResponse(timeout);
}
bool waitForResponse(String expected, int timeout) {
String response = readResponse(timeout);
return response.indexOf(expected) >= 0;
}
String readResponse(int timeout) {
String response = "";
unsigned long start = millis();
while (millis() - start < timeout) {
if (modemSerial->available()) {
response += modemSerial->readString();
break;
}
delay(10);
}
return response;
}
};
// Usage example
ML307R_4G modem(&Serial1, 13, 14);
void setup4G() {
if (modem.init()) {
Serial.println("4G module initialized");
if (modem.connectNetwork()) {
Serial.println("4G network connected");
Serial.println(modem.getNetworkInfo());
}
}
}
1.2 4G + Wi-Fi Hybrid Networking
Intelligent Network Switching
class NetworkManager {
private:
ML307R_4G* modem;
bool wifi4GEnabled = false;
bool preferWiFi = true;
public:
enum NetworkType {
NETWORK_NONE,
NETWORK_WIFI,
NETWORK_4G
};
NetworkType currentNetwork = NETWORK_NONE;
NetworkManager(ML307R_4G* m) : modem(m) {}
bool connectBestNetwork() {
if (preferWiFi && connectWiFi()) {
currentNetwork = NETWORK_WIFI;
Serial.println("Using Wi-Fi connection");
return true;
}
if (wifi4GEnabled && modem->connectNetwork()) {
currentNetwork = NETWORK_4G;
Serial.println("Using 4G connection");
return true;
}
currentNetwork = NETWORK_NONE;
return false;
}
bool sendDataToCloud(String data) {
if (currentNetwork == NETWORK_WIFI) {
return sendViaWiFi(data);
} else if (currentNetwork == NETWORK_4G) {
return sendVia4G(data);
}
return false;
}
void enable4GBackup(bool enable) {
wifi4GEnabled = enable;
}
void monitorConnection() {
static unsigned long lastCheck = 0;
if (millis() - lastCheck > 30000) { // Check every 30 seconds
if (currentNetwork == NETWORK_WIFI && WiFi.status() != WL_CONNECTED) {
Serial.println("Wi-Fi disconnected, switching to 4G...");
connectBestNetwork();
}
lastCheck = millis();
}
}
private:
bool connectWiFi() {
// Implement Wi-Fi connection logic
return WiFi.status() == WL_CONNECTED;
}
bool sendViaWiFi(String data) {
// Implement Wi-Fi data transmission
return true;
}
bool sendVia4G(String data) {
String response;
return modem->sendHTTPRequest("https://api.yourserver.com/data", data, response);
}
};
II. Local AI Inference
2.1 TensorFlow Lite Micro Integration
Model Deployment
#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"
// Include your model (converted to C array)
#include "voice_command_model.h"
class EdgeAIEngine {
private:
tflite::MicroErrorReporter micro_error_reporter;
tflite::AllOpsResolver resolver;
const tflite::Model* model;
tflite::MicroInterpreter* interpreter;
TfLiteTensor* input;
TfLiteTensor* output;
// Arena for model execution (in PSRAM)
constexpr static int kTensorArenaSize = 100 * 1024; // 100KB
uint8_t* tensor_arena;
public:
bool init() {
// Allocate memory in PSRAM
tensor_arena = (uint8_t*)ps_malloc(kTensorArenaSize);
if (tensor_arena == nullptr) {
Serial.println("Failed to allocate tensor arena");
return false;
}
// Load model
model = tflite::GetModel(voice_command_model);
if (model->version() != TFLITE_SCHEMA_VERSION) {
Serial.println("Model schema version mismatch");
return false;
}
// Create interpreter
static tflite::MicroInterpreter static_interpreter(
model, resolver, tensor_arena, kTensorArenaSize, µ_error_reporter);
interpreter = &static_interpreter;
// Allocate tensors
TfLiteStatus allocate_status = interpreter->AllocateTensors();
if (allocate_status != kTfLiteOk) {
Serial.println("AllocateTensors() failed");
return false;
}
// Get input and output tensors
input = interpreter->input(0);
output = interpreter->output(0);
Serial.printf("Model loaded successfully\n");
Serial.printf("Input shape: [%d, %d, %d, %d]\n",
input->dims->data[0], input->dims->data[1],
input->dims->data[2], input->dims->data[3]);
return true;
}
int classifyVoiceCommand(float* audio_features, int feature_length) {
// Copy features to input tensor
for (int i = 0; i < feature_length; i++) {
input->data.f[i] = audio_features[i];
}
// Run inference
TfLiteStatus invoke_status = interpreter->Invoke();
if (invoke_status != kTfLiteOk) {
Serial.println("Invoke failed");
return -1;
}
// Find prediction with highest confidence
float max_confidence = 0;
int predicted_class = -1;
for (int i = 0; i < output->dims->data[1]; i++) {
float confidence = output->data.f[i];
if (confidence > max_confidence) {
max_confidence = confidence;
predicted_class = i;
}
}
Serial.printf("Predicted class: %d, confidence: %.2f\n",
predicted_class, max_confidence);
return (max_confidence > 0.7) ? predicted_class : -1; // Threshold 0.7
}
void printModelInfo() {
Serial.println("=== Model Information ===");
Serial.printf("Model size: %d bytes\n", voice_command_model_len);
Serial.printf("Input type: %s\n", TfLiteTypeGetName(input->type));
Serial.printf("Output type: %s\n", TfLiteTypeGetName(output->type));
Serial.printf("Arena usage: %d bytes\n", interpreter->arena_used_bytes());
}
};
2.2 Audio Feature Extraction
MFCC Feature Extraction
#include <math.h>
class AudioFeatureExtractor {
private:
static const int SAMPLE_RATE = 16000;
static const int FRAME_SIZE = 512;
static const int HOP_SIZE = 256;
static const int NUM_MFCC = 13;
static const int NUM_FILTERS = 26;
float hamming_window[FRAME_SIZE];
float mel_filters[NUM_FILTERS][FRAME_SIZE/2 + 1];
float dct_matrix[NUM_MFCC][NUM_FILTERS];
public:
bool init() {
// Generate Hamming window
for (int i = 0; i < FRAME_SIZE; i++) {
hamming_window[i] = 0.54 - 0.46 * cos(2 * M_PI * i / (FRAME_SIZE - 1));
}
// Generate Mel filter bank
generateMelFilters();
// Generate DCT matrix
generateDCTMatrix();
return true;
}
void extractMFCC(int16_t* audio_samples, int num_samples, float* mfcc_features) {
int num_frames = (num_samples - FRAME_SIZE) / HOP_SIZE + 1;
for (int frame = 0; frame < num_frames; frame++) {
float frame_features[NUM_MFCC];
// Extract features for current frame
int frame_start = frame * HOP_SIZE;
extractFrameMFCC(&audio_samples[frame_start], frame_features);
// Copy to output buffer
for (int i = 0; i < NUM_MFCC; i++) {
mfcc_features[frame * NUM_MFCC + i] = frame_features[i];
}
}
}
private:
void extractFrameMFCC(int16_t* frame_samples, float* mfcc_out) {
float windowed_frame[FRAME_SIZE];
float fft_result[FRAME_SIZE/2 + 1];
float mel_energies[NUM_FILTERS];
// Apply window function
for (int i = 0; i < FRAME_SIZE; i++) {
windowed_frame[i] = frame_samples[i] * hamming_window[i];
}
// Compute FFT (simplified - use real FFT library)
computeFFT(windowed_frame, fft_result);
// Apply Mel filter bank
for (int i = 0; i < NUM_FILTERS; i++) {
mel_energies[i] = 0;
for (int j = 0; j < FRAME_SIZE/2 + 1; j++) {
mel_energies[i] += fft_result[j] * mel_filters[i][j];
}
mel_energies[i] = log(mel_energies[i] + 1e-10); // Log energy
}
// Apply DCT
for (int i = 0; i < NUM_MFCC; i++) {
mfcc_out[i] = 0;
for (int j = 0; j < NUM_FILTERS; j++) {
mfcc_out[i] += mel_energies[j] * dct_matrix[i][j];
}
}
}
void generateMelFilters() {
// Simplified Mel filter bank generation
float mel_low = 0;
float mel_high = 2595 * log10(1 + (SAMPLE_RATE/2) / 700.0);
float mel_step = (mel_high - mel_low) / (NUM_FILTERS + 1);
for (int i = 0; i < NUM_FILTERS; i++) {
float center_mel = mel_low + (i + 1) * mel_step;
float center_freq = 700 * (pow(10, center_mel / 2595) - 1);
int center_bin = (int)(center_freq * FRAME_SIZE / SAMPLE_RATE);
// Generate triangular filter
for (int j = 0; j < FRAME_SIZE/2 + 1; j++) {
if (abs(j - center_bin) < 20) { // Simplified triangular filter
mel_filters[i][j] = 1.0 - abs(j - center_bin) / 20.0;
} else {
mel_filters[i][j] = 0;
}
}
}
}
void generateDCTMatrix() {
for (int i = 0; i < NUM_MFCC; i++) {
for (int j = 0; j < NUM_FILTERS; j++) {
dct_matrix[i][j] = cos(M_PI * i * (j + 0.5) / NUM_FILTERS);
}
}
}
void computeFFT(float* input, float* output) {
// Placeholder - implement real FFT
// Use ESP-DSP library or other FFT implementation
for (int i = 0; i < FRAME_SIZE/2 + 1; i++) {
output[i] = input[i] * input[i]; // Simplified power spectrum
}
}
};
2.3 Voice Command Recognition System
Complete Voice Command Pipeline
class VoiceCommandSystem {
private:
EdgeAIEngine* aiEngine;
AudioFeatureExtractor* featureExtractor;
VoiceActivityDetector* vad;
// Command definitions
const char* commands[5] = {
"turn_on_light",
"turn_off_light",
"play_music",
"stop_music",
"unknown"
};
// Audio buffer for 2 seconds at 16kHz
static const int AUDIO_BUFFER_SIZE = 32000;
int16_t audio_buffer[AUDIO_BUFFER_SIZE];
int buffer_index = 0;
bool recording = false;
public:
VoiceCommandSystem(EdgeAIEngine* ai, AudioFeatureExtractor* fe, VoiceActivityDetector* v)
: aiEngine(ai), featureExtractor(fe), vad(v) {}
bool init() {
return aiEngine->init() && featureExtractor->init();
}
void processAudioFrame(int16_t* samples, int length) {
// Voice activity detection
bool voice_detected = vad->detectVoiceActivity(samples, length);
if (voice_detected && !recording) {
// Start recording
recording = true;
buffer_index = 0;
Serial.println("Voice detected - start recording");
}
if (recording) {
// Add samples to buffer
for (int i = 0; i < length && buffer_index < AUDIO_BUFFER_SIZE; i++) {
audio_buffer[buffer_index++] = samples[i];
}
// Check if recording should stop
if (!voice_detected || buffer_index >= AUDIO_BUFFER_SIZE) {
processVoiceCommand();
recording = false;
}
}
}
private:
void processVoiceCommand() {
Serial.printf("Processing voice command (%d samples)\n", buffer_index);
// Extract MFCC features
int num_frames = (buffer_index - 512) / 256 + 1;
float* mfcc_features = (float*)malloc(num_frames * 13 * sizeof(float));
featureExtractor->extractMFCC(audio_buffer, buffer_index, mfcc_features);
// Run AI inference
int predicted_command = aiEngine->classifyVoiceCommand(mfcc_features, num_frames * 13);
if (predicted_command >= 0 && predicted_command < 4) {
Serial.printf("Recognized command: %s\n", commands[predicted_command]);
executeCommand(predicted_command);
} else {
Serial.println("Unknown command or low confidence");
}
free(mfcc_features);
}
void executeCommand(int command_id) {
switch (command_id) {
case 0: // turn_on_light
digitalWrite(LED_PIN, HIGH);
Serial.println("Light turned ON");
break;
case 1: // turn_off_light
digitalWrite(LED_PIN, LOW);
Serial.println("Light turned OFF");
break;
case 2: // play_music
// Implement music playback
Serial.println("Playing music");
break;
case 3: // stop_music
// Implement music stop
Serial.println("Stopping music");
break;
}
}
};
III. Multimodal Interaction
3.1 Camera Integration
ESP32-CAM Module Integration
#include "esp_camera.h"
#include "esp_jpg_decode.h"
class CameraManager {
private:
camera_config_t config;
bool initialized = false;
public:
bool init() {
// Camera pin configuration for ESP32-CAM
config.ledc_channel = LEDC_CHANNEL_0;
config.ledc_timer = LEDC_TIMER_0;
config.pin_d0 = 5;
config.pin_d1 = 18;
config.pin_d2 = 19;
config.pin_d3 = 21;
config.pin_d4 = 36;
config.pin_d5 = 39;
config.pin_d6 = 34;
config.pin_d7 = 35;
config.pin_xclk = 0;
config.pin_pclk = 22;
config.pin_vsync = 25;
config.pin_href = 23;
config.pin_sscb_sda = 26;
config.pin_sscb_scl = 27;
config.pin_pwdn = 32;
config.pin_reset = -1;
config.xclk_freq_hz = 20000000;
config.pixel_format = PIXFORMAT_JPEG;
// Image quality settings
if (psramFound()) {
config.frame_size = FRAMESIZE_SVGA; // 800x600
config.jpeg_quality = 10;
config.fb_count = 2;
} else {
config.frame_size = FRAMESIZE_VGA; // 640x480
config.jpeg_quality = 12;
config.fb_count = 1;
}
esp_err_t err = esp_camera_init(&config);
if (err != ESP_OK) {
Serial.printf("Camera init failed with error 0x%x\n", err);
return false;
}
initialized = true;
Serial.println("Camera initialized successfully");
return true;
}
bool captureImage(uint8_t** image_buffer, size_t* image_size) {
if (!initialized) return false;
camera_fb_t* fb = esp_camera_fb_get();
if (!fb) {
Serial.println("Camera capture failed");
return false;
}
*image_buffer = fb->buf;
*image_size = fb->len;
// Note: Remember to call esp_camera_fb_return(fb) after using the image
return true;
}
void releaseImage(camera_fb_t* fb) {
esp_camera_fb_return(fb);
}
bool streamToWebSocket(WebSocketsServer& webSocket, int client_id) {
camera_fb_t* fb = esp_camera_fb_get();
if (!fb) return false;
// Convert to base64 for web transmission
String base64_image = base64::encode(fb->buf, fb->len);
String json_message = "{\"type\":\"image\",\"data\":\"" + base64_image + "\"}";
webSocket.sendTXT(client_id, json_message);
esp_camera_fb_return(fb);
return true;
}
};
3.2 Vision + Voice Multimodal AI
Scene Understanding with Voice Description
class MultimodalAI {
private:
CameraManager* camera;
VoiceCommandSystem* voice;
// Simple object detection results
struct DetectionResult {
String object_name;
float confidence;
int x, y, width, height;
};
public:
MultimodalAI(CameraManager* cam, VoiceCommandSystem* v)
: camera(cam), voice(v) {}
String analyzeScene() {
uint8_t* image_buffer;
size_t image_size;
if (!camera->captureImage(&image_buffer, &image_size)) {
return "Camera capture failed";
}
// Simple color-based object detection (placeholder)
std::vector<DetectionResult> objects = detectObjects(image_buffer, image_size);
// Generate scene description
String description = generateSceneDescription(objects);
// Release camera buffer
camera->releaseImage(nullptr); // Simplified
return description;
}
void handleVoiceVisualQuery(String query) {
if (query.indexOf("what do you see") >= 0) {
String scene = analyzeScene();
speakText("I can see " + scene);
} else if (query.indexOf("take a photo") >= 0) {
if (captureAndSavePhoto()) {
speakText("Photo captured successfully");
} else {
speakText("Failed to capture photo");
}
} else if (query.indexOf("count") >= 0) {
int object_count = countObjectsInScene();
speakText("I can see " + String(object_count) + " objects");
}
}
private:
std::vector<DetectionResult> detectObjects(uint8_t* image, size_t size) {
std::vector<DetectionResult> results;
// Placeholder: Simple color-based detection
// In real implementation, use TensorFlow Lite model for object detection
DetectionResult person;
person.object_name = "person";
person.confidence = 0.85;
person.x = 100; person.y = 50;
person.width = 200; person.height = 300;
results.push_back(person);
return results;
}
String generateSceneDescription(const std::vector<DetectionResult>& objects) {
if (objects.empty()) {
return "an empty scene";
}
String description = "";
for (size_t i = 0; i < objects.size(); i++) {
if (i > 0) description += ", ";
description += objects[i].object_name;
}
return description;
}
bool captureAndSavePhoto() {
// Implement photo capture and storage
return true;
}
int countObjectsInScene() {
uint8_t* image_buffer;
size_t image_size;
if (camera->captureImage(&image_buffer, &image_size)) {
auto objects = detectObjects(image_buffer, image_size);
camera->releaseImage(nullptr);
return objects.size();
}
return 0;
}
void speakText(String text) {
// Implement TTS (Text-to-Speech)
Serial.println("Speaking: " + text);
}
};
IV. IoT Device Control
4.1 MQTT Smart Home Integration
MQTT Device Controller
#include <PubSubClient.h>
#include <ArduinoJson.h>
class SmartHomeController {
private:
WiFiClient wifiClient;
PubSubClient mqttClient;
String deviceId;
// Device states
struct DeviceState {
bool light_on = false;
int brightness = 50;
float temperature = 25.0;
float humidity = 60.0;
bool fan_on = false;
} state;
public:
SmartHomeController(String id) : deviceId(id), mqttClient(wifiClient) {}
bool init(const char* mqtt_server, int mqtt_port) {
mqttClient.setServer(mqtt_server, mqtt_port);
mqttClient.setCallback([this](char* topic, byte* payload, unsigned int length) {
this->onMqttMessage(topic, payload, length);
});
return connectMQTT();
}
bool connectMQTT() {
Serial.println("Connecting to MQTT...");
if (mqttClient.connect(deviceId.c_str())) {
Serial.println("MQTT connected");
// Subscribe to control topics
mqttClient.subscribe(("xiaozhi/" + deviceId + "/control").c_str());
mqttClient.subscribe("xiaozhi/broadcast");
// Publish device online status
publishDeviceStatus("online");
return true;
}
return false;
}
void loop() {
if (!mqttClient.connected()) {
connectMQTT();
}
mqttClient.loop();
// Publish sensor data every 30 seconds
static unsigned long lastPublish = 0;
if (millis() - lastPublish > 30000) {
publishSensorData();
lastPublish = millis();
}
}
void handleVoiceCommand(String command) {
if (command == "turn_on_light") {
controlLight(true, state.brightness);
} else if (command == "turn_off_light") {
controlLight(false, 0);
} else if (command.startsWith("set_brightness_")) {
int brightness = command.substring(15).toInt();
controlLight(state.light_on, brightness);
} else if (command == "turn_on_fan") {
controlFan(true);
} else if (command == "turn_off_fan") {
controlFan(false);
}
}
private:
void onMqttMessage(char* topic, byte* payload, unsigned int length) {
String message = "";
for (unsigned int i = 0; i < length; i++) {
message += (char)payload[i];
}
Serial.printf("MQTT message [%s]: %s\n", topic, message.c_str());
DynamicJsonDocument doc(512);
deserializeJson(doc, message);
String device = doc["device"];
String action = doc["action"];
if (device == deviceId || device == "all") {
executeAction(action, doc);
}
}
void executeAction(String action, DynamicJsonDocument& params) {
if (action == "light_control") {
bool on = params["on"];
int brightness = params["brightness"];
controlLight(on, brightness);
} else if (action == "fan_control") {
bool on = params["on"];
controlFan(on);
} else if (action == "get_status") {
publishDeviceStatus("status_response");
}
}
void controlLight(bool on, int brightness) {
state.light_on = on;
state.brightness = brightness;
// Control actual hardware
if (on) {
analogWrite(LED_PIN, map(brightness, 0, 100, 0, 255));
} else {
digitalWrite(LED_PIN, LOW);
}
// Publish state change
DynamicJsonDocument doc(256);
doc["device"] = deviceId;
doc["type"] = "light_status";
doc["on"] = on;
doc["brightness"] = brightness;
String message;
serializeJson(doc, message);
mqttClient.publish(("xiaozhi/" + deviceId + "/status").c_str(), message.c_str());
Serial.printf("Light %s, brightness: %d\n", on ? "ON" : "OFF", brightness);
}
void controlFan(bool on) {
state.fan_on = on;
// Control actual hardware
digitalWrite(FAN_PIN, on ? HIGH : LOW);
// Publish state change
DynamicJsonDocument doc(256);
doc["device"] = deviceId;
doc["type"] = "fan_status";
doc["on"] = on;
String message;
serializeJson(doc, message);
mqttClient.publish(("xiaozhi/" + deviceId + "/status").c_str(), message.c_str());
Serial.printf("Fan %s\n", on ? "ON" : "OFF");
}
void publishSensorData() {
// Read actual sensors (placeholder)
state.temperature = 25.0 + random(-50, 50) / 10.0;
state.humidity = 60.0 + random(-100, 100) / 10.0;
DynamicJsonDocument doc(512);
doc["device"] = deviceId;
doc["type"] = "sensor_data";
doc["timestamp"] = millis();
doc["temperature"] = state.temperature;
doc["humidity"] = state.humidity;
doc["light_on"] = state.light_on;
doc["fan_on"] = state.fan_on;
String message;
serializeJson(doc, message);
mqttClient.publish(("xiaozhi/" + deviceId + "/sensors").c_str(), message.c_str());
}
void publishDeviceStatus(String status) {
DynamicJsonDocument doc(256);
doc["device"] = deviceId;
doc["status"] = status;
doc["timestamp"] = millis();
doc["ip"] = WiFi.localIP().toString();
String message;
serializeJson(doc, message);
mqttClient.publish(("xiaozhi/" + deviceId + "/device").c_str(), message.c_str());
}
};
4.2 HomeAssistant Integration
HomeAssistant Auto-Discovery
class HomeAssistantIntegration {
private:
SmartHomeController* controller;
String deviceId;
String deviceName;
public:
HomeAssistantIntegration(SmartHomeController* ctrl, String id, String name)
: controller(ctrl), deviceId(id), deviceName(name) {}
void publishAutoDiscovery() {
publishLightDiscovery();
publishSensorDiscovery();
publishSwitchDiscovery();
}
private:
void publishLightDiscovery() {
DynamicJsonDocument doc(1024);
doc["name"] = deviceName + " Light";
doc["unique_id"] = deviceId + "_light";
doc["state_topic"] = "xiaozhi/" + deviceId + "/status";
doc["command_topic"] = "xiaozhi/" + deviceId + "/control";
doc["brightness_state_topic"] = "xiaozhi/" + deviceId + "/status";
doc["brightness_command_topic"] = "xiaozhi/" + deviceId + "/control";
doc["state_value_template"] = "{{ value_json.on }}";
doc["brightness_value_template"] = "{{ value_json.brightness }}";
doc["on_command_type"] = "first";
doc["payload_on"] = "{\"action\":\"light_control\",\"on\":true}";
doc["payload_off"] = "{\"action\":\"light_control\",\"on\":false}";
// Device information
doc["device"]["identifiers"][0] = deviceId;
doc["device"]["name"] = deviceName;
doc["device"]["model"] = "XiaoZhi ESP32-S3";
doc["device"]["manufacturer"] = "XiaoZhi.Dev";
String topic = "homeassistant/light/" + deviceId + "_light/config";
String message;
serializeJson(doc, message);
controller->publishMessage(topic, message, true); // Retained message
}
void publishSensorDiscovery() {
// Temperature sensor
publishSensorConfig("temperature", "Temperature", "°C", "temperature");
// Humidity sensor
publishSensorConfig("humidity", "Humidity", "%", "humidity");
}
void publishSensorConfig(String sensor_type, String name, String unit, String device_class) {
DynamicJsonDocument doc(512);
doc["name"] = deviceName + " " + name;
doc["unique_id"] = deviceId + "_" + sensor_type;
doc["state_topic"] = "xiaozhi/" + deviceId + "/sensors";
doc["value_template"] = "{{ value_json." + sensor_type + " }}";
doc["unit_of_measurement"] = unit;
doc["device_class"] = device_class;
doc["device"]["identifiers"][0] = deviceId;
doc["device"]["name"] = deviceName;
String topic = "homeassistant/sensor/" + deviceId + "_" + sensor_type + "/config";
String message;
serializeJson(doc, message);
controller->publishMessage(topic, message, true);
}
void publishSwitchDiscovery() {
DynamicJsonDocument doc(512);
doc["name"] = deviceName + " Fan";
doc["unique_id"] = deviceId + "_fan";
doc["state_topic"] = "xiaozhi/" + deviceId + "/status";
doc["command_topic"] = "xiaozhi/" + deviceId + "/control";
doc["state_value_template"] = "{{ value_json.fan_on }}";
doc["payload_on"] = "{\"action\":\"fan_control\",\"on\":true}";
doc["payload_off"] = "{\"action\":\"fan_control\",\"on\":false}";
doc["device"]["identifiers"][0] = deviceId;
doc["device"]["name"] = deviceName;
String topic = "homeassistant/switch/" + deviceId + "_fan/config";
String message;
serializeJson(doc, message);
controller->publishMessage(topic, message, true);
}
};
V. Cloud Service Integration
5.1 Multi-Cloud AI Service Integration
AI Service Manager
class CloudAIManager {
private:
String deepseek_api_key;
String openai_api_key;
String baidu_api_key;
enum AIProvider {
PROVIDER_DEEPSEEK,
PROVIDER_OPENAI,
PROVIDER_BAIDU
};
AIProvider current_provider = PROVIDER_DEEPSEEK;
public:
void setAPIKeys(String deepseek, String openai, String baidu) {
deepseek_api_key = deepseek;
openai_api_key = openai;
baidu_api_key = baidu;
}
String chatWithAI(String user_message) {
switch (current_provider) {
case PROVIDER_DEEPSEEK:
return chatDeepSeek(user_message);
case PROVIDER_OPENAI:
return chatOpenAI(user_message);
case PROVIDER_BAIDU:
return chatBaidu(user_message);
default:
return "AI service not available";
}
}
void switchProvider(AIProvider provider) {
current_provider = provider;
Serial.printf("Switched to AI provider: %d\n", provider);
}
bool testAllProviders() {
String test_message = "Hello, can you respond to this test?";
Serial.println("Testing AI providers...");
// Test DeepSeek
switchProvider(PROVIDER_DEEPSEEK);
String response1 = chatWithAI(test_message);
bool deepseek_ok = response1.length() > 0 && !response1.startsWith("Error");
// Test OpenAI
switchProvider(PROVIDER_OPENAI);
String response2 = chatWithAI(test_message);
bool openai_ok = response2.length() > 0 && !response2.startsWith("Error");
// Test Baidu
switchProvider(PROVIDER_BAIDU);
String response3 = chatWithAI(test_message);
bool baidu_ok = response3.length() > 0 && !response3.startsWith("Error");
Serial.printf("Provider test results - DeepSeek: %s, OpenAI: %s, Baidu: %s\n",
deepseek_ok ? "OK" : "FAIL",
openai_ok ? "OK" : "FAIL",
baidu_ok ? "OK" : "FAIL");
// Use the first working provider
if (deepseek_ok) switchProvider(PROVIDER_DEEPSEEK);
else if (openai_ok) switchProvider(PROVIDER_OPENAI);
else if (baidu_ok) switchProvider(PROVIDER_BAIDU);
return deepseek_ok || openai_ok || baidu_ok;
}
private:
String chatDeepSeek(String message) {
HTTPClient http;
http.begin("https://api.deepseek.com/chat/completions");
http.addHeader("Content-Type", "application/json");
http.addHeader("Authorization", "Bearer " + deepseek_api_key);
DynamicJsonDocument request(1024);
request["model"] = "deepseek-chat";
request["messages"][0]["role"] = "user";
request["messages"][0]["content"] = message;
request["max_tokens"] = 100;
String requestString;
serializeJson(request, requestString);
int httpResponseCode = http.POST(requestString);
String response = "";
if (httpResponseCode == 200) {
String responseString = http.getString();
DynamicJsonDocument responseDoc(2048);
deserializeJson(responseDoc, responseString);
response = responseDoc["choices"][0]["message"]["content"];
} else {
response = "Error: HTTP " + String(httpResponseCode);
}
http.end();
return response;
}
String chatOpenAI(String message) {
// Similar implementation for OpenAI API
return "OpenAI response placeholder";
}
String chatBaidu(String message) {
// Similar implementation for Baidu ERNIE API
return "Baidu response placeholder";
}
};
Performance Tips:
- Use PSRAM for large models and audio buffers
- Pin audio tasks to different CPU cores for optimal performance
- Implement watchdog timers for system reliability
- Use hardware encryption for secure communication