feat: Polishing the frontend ota. Updated tdds and documentation
This commit is contained in:
@@ -6,6 +6,7 @@
|
||||
let status = $state("loading");
|
||||
let errorMsg = $state("");
|
||||
let showRebootConfirm = $state(false);
|
||||
let isRecovering = $state(false);
|
||||
|
||||
let systemInfo = $state({
|
||||
chip: "—",
|
||||
@@ -42,14 +43,17 @@
|
||||
status = "ok";
|
||||
errorMsg = "";
|
||||
} catch (e) {
|
||||
if (!isRecovering) {
|
||||
status = "error";
|
||||
errorMsg = e.message || "Connection failed";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function handleReboot() {
|
||||
showRebootConfirm = false;
|
||||
status = "rebooting";
|
||||
isRecovering = true;
|
||||
try {
|
||||
await reboot();
|
||||
} catch (e) {
|
||||
@@ -61,6 +65,25 @@
|
||||
fetchInfo();
|
||||
});
|
||||
|
||||
// Resilient recovery polling: only poll while we are waiting for a reboot,
// i.e. while `isRecovering` is true. Every 2000 ms the effect calls
// getSystemInfo(); the first truthy response triggers a full page reload.
|
||||
$effect(() => {
|
||||
// Nothing is scheduled (and no cleanup is returned) unless a recovery is in progress.
if (isRecovering) {
|
||||
// Fixed 2000 ms timer. Each tick is independent: a failed request is caught
// below, so the timer keeps firing until it is cleared.
const interval = setInterval(async () => {
|
||||
try {
|
||||
const info = await getSystemInfo();
|
||||
// Any truthy response is treated as "device is reachable again".
if (info) {
|
||||
console.log("Device back online! Refreshing UI...");
|
||||
// Reload the whole page instead of patching component state in place.
window.location.reload();
|
||||
}
|
||||
} catch (e) {
|
||||
// Still offline or rebooting, just keep waiting.
// The error is deliberately not surfaced; the next tick simply retries.
|
||||
console.log("Waiting for device...");
|
||||
}
|
||||
}, 2000);
|
||||
// Returned to $effect as its teardown callback; it stops the polling timer.
return () => clearInterval(interval);
|
||||
}
|
||||
});
|
||||
|
||||
const infoItems = $derived([
|
||||
{ label: "Chip", value: systemInfo.chip, icon: "🔧" },
|
||||
{ label: "Free Heap", value: formatBytes(systemInfo.freeHeap), icon: "💾" },
|
||||
@@ -168,7 +191,7 @@
|
||||
</div>
|
||||
|
||||
<!-- Frontend Info & OTA Section -->
|
||||
<OTAUpdate />
|
||||
<OTAUpdate
  onReboot={() => {
    // Mirror handleReboot(): besides showing the "rebooting" status, raise
    // isRecovering so the recovery-polling $effect starts and fetchInfo()
    // errors are suppressed while the device restarts after an OTA upload.
    status = "rebooting";
    isRecovering = true;
  }}
/>
|
||||
|
||||
<!-- Reboot Confirmation Modal -->
|
||||
{#if showRebootConfirm}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
<script>
|
||||
let { onReboot = null } = $props();
|
||||
import { getOTAStatus, uploadOTAFrontend } from "./api.js";
|
||||
|
||||
const IS_DEV = import.meta.env.DEV;
|
||||
@@ -76,6 +77,7 @@
|
||||
clearInterval(progressInterval);
|
||||
uploadProgress = 100;
|
||||
status = "success";
|
||||
if (onReboot) onReboot();
|
||||
} catch (e) {
|
||||
clearInterval(progressInterval);
|
||||
uploadProgress = 0;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"major": 0,
|
||||
"minor": 1,
|
||||
"revision": 4
|
||||
"revision": 7
|
||||
}
|
||||
@@ -114,15 +114,16 @@ We extend this pattern to the HTTP server:
|
||||
|
||||
## 7. Partition Table
|
||||
|
||||
```
|
||||
```csv
|
||||
# Name, Type, SubType, Offset, Size
|
||||
nvs, data, nvs, 0x9000, 0x6000
|
||||
phy_init, data, phy, 0xf000, 0x1000
|
||||
factory, app, factory, 0x10000, 1M
|
||||
www, data, littlefs, , 64K
|
||||
www_0, data, littlefs, , 1M
|
||||
www_1, data, littlefs, , 1M
|
||||
```
|
||||
|
||||
The `www` partition is 64KB — more than enough for the 16kB gzipped frontend. Only gets written during `idf.py flash` when `CALENDINK_DEPLOY_WEB_PAGES` is enabled.
|
||||
We allocated two **1MB partitions** for the frontend (`www_0` and `www_1`). While the compressed frontend is only ~20KB, this 1MB allocation provides massive headroom for future assets (images, fonts, larger JS bundles) without needing to re-partition the flash.
|
||||
|
||||
## 8. Build Pipeline
|
||||
|
||||
@@ -164,9 +165,11 @@ We use **esp_http_server + cJSON + LittleFS** — all standard ESP-IDF component
|
||||
- **CORS Support**: Implemented `Access-Control-Allow-Origin: *` headers for all API GET and POST responses, along with an `OPTIONS` preflight handler, to support seamless local UI development against the ESP32.
|
||||
|
||||
### Stability & Performance Fixes
|
||||
- **A/B Partition System**: Implemented a redundant frontend storage system using `www_0` and `www_1` partitions. The backend dynamically selects the boot partition based on NVS state, providing a robust "fail-safe" update mechanism where the active UI is never overwritten.
|
||||
- **OTA Status Reporting**: The backend now exposes detailed partition telemetry (total size, used, and free space) to help the frontend provide accurate storage feedback to the user.
|
||||
- **Persistent Daemon**: Addressed an issue where `app_main` executed to completion immediately, causing the web server daemon to drop. Implemented a non-blocking `vTaskDelay` keep-alive loop to persist the application state and keep the HTTP server listening indefinitely without spinning the CPU.
|
||||
- **Static File Fallbacks**: The LittleFS static file handler correctly falls back to `index.html` (and `.gz` variants) to seamlessly support Svelte's Single Page Application (SPA) routing patterns.
|
||||
|
||||
### Observability Benchmarks
|
||||
- **Heap Usage**: The system info endpoint natively tracks free heap availability. Observed typical runtime footprint leaves roughly **247 KB free heap** with active WiFi, API handling, and active HTTP server routing.
|
||||
- **API Response Latency**: The minimalist handler approach results in near-instantaneous JSON responses (milliseconds), effortlessly supporting the frontend dashboard's 5-second polling interval without blocking the ESP32-S3 network stack.
|
||||
- **API Response Latency**: The minimalist handler approach results in near-instantaneous JSON responses (milliseconds), effortlessly supporting the frontend dashboard's post-reboot recovery polling.
|
||||
|
||||
@@ -9,63 +9,80 @@
|
||||
|
||||
Implement a robust Over-The-Air (OTA) update mechanism specifically for the Svelte frontend assets served by the ESP32-S3. The update must:
|
||||
- Update the frontend code without requiring a full firmware re-flash.
|
||||
- Provide a reliable fallback if an update fails (Rollback capability).
|
||||
- Provide a reliable fallback if an update fails (Rollback capability via A/B slots).
|
||||
- Handle updates gracefully within the ESP32's available RAM limitations.
|
||||
- Provide a dedicated UI for the user to upload new frontend binaries.
|
||||
- Provide a dedicated UI for the user to upload new frontend binaries with real-time feedback.
|
||||
- **Ensure a seamless user experience** via automated recovery and page refresh.
|
||||
|
||||
## 2. Chosen Approach
|
||||
|
||||
We have opted for a **Dual-Partition Image Flash (A/B slots)** strategy using **LittleFS**.
|
||||
We implemented a **Dual-Partition Image Flash (A/B slots)** strategy using **LittleFS**.
|
||||
|
||||
Instead of updating individual files (HTML, JS, CSS) over HTTP, the build process will generate a single, pre-packaged `.bin` image of the entire `www` directory. This image will be streamed directly to an inactive flash partition, mimicking the safety of standard firmware OTA.
|
||||
Instead of updating individual files, the build process generates a single, pre-packaged `.bin` image of the entire `www` directory. This image is streamed directly to the inactive flash partition (`www_0` or `www_1`), ensuring that the current UI remains fully functional until the update is confirmed and the device reboots.
|
||||
|
||||
## 3. Why Dual-Partition Image Flash?
|
||||
## 3. Design Decisions & Trade-offs
|
||||
|
||||
### Image Flash vs. Individual File Uploads
|
||||
| | Image Flash (LittleFS .bin) | Individual File Uploads |
|
||||
|---|---|---|
|
||||
| **Integrity** | High (Flash whole partition, verify, switch) | Low (A failure mid-upload leaves a broken site) |
|
||||
| **Simplicity (Backend)** | Easy: Stream bytes to raw flash partition | Hard: Manage file creation, deletion, truncation |
|
||||
| **Speed** | Faster (One contiguous flash write) | Slower (Multiple HTTP requests, VFS overhead) |
|
||||
### 3.1. Why Dual-Partition (A/B)?
|
||||
- **Safety**: A failed or interrupted upload never "bricks" the UI. The ESP32 simply remains on the current working slot.
|
||||
- **Flash Allocation**: With 16MB of total flash, allocating 2MB for the UI (1MB per slot) is a small cost for keeping the active UI untouched and fully usable while an update is being written.
|
||||
|
||||
### Dual-Partition (A/B) vs. Single Partition
|
||||
| | Dual-Partition (A/B) | Single Partition |
|
||||
|---|---|---|
|
||||
| **Rollback** | ✅ Yes: Revert to previous slot if new one fails | ❌ No: Broken update bricks the UI |
|
||||
| **Flash Usage** | Higher (Requires 2x space) | Lower |
|
||||
### 3.2. Explicit Reboot vs. Hot-Swap
|
||||
We chose an **explicit reboot** to switch slots.
|
||||
- **Pros**: Guarantees a clean state, flushes NVS, and restarts all network/VFS handles.
|
||||
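- **Cons**: Brief downtime — the reboot itself takes ~3s, but the UI is typically reachable again only after ~15-20s, once WiFi has reconnected.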
|
||||
- **Verdict**: The safety of a clean boot outweighs the complexity of live-mounting partitions at runtime.
|
||||
|
||||
**Decision**: Because we have a 16MB flash chip, allocating two 1MB partitions for the frontend (`www_0` and `www_1`) is trivial and provides crucial safety guarantees.
|
||||
### 3.3. Semantic Versioning & Auto-Increment
|
||||
We implemented a `major.minor.revision` versioning system stored in `version.json`.
|
||||
- **Decision**: The `ota:package` script automatically increments the `revision` number on every build.
|
||||
- **Value**: This ensures that every OTA binary is unique and identifiable (e.g., `www_v0.1.6.bin`), preventing confusion during manual testing.
|
||||
|
||||
## 4. Architecture & Workflow
|
||||
## 4. Final Architecture
|
||||
|
||||
### 4.1. The Partition Table
|
||||
The `partitions.csv` will be modified to include two 1MB data partitions for LittleFS:
|
||||
- `www_0`
|
||||
- `www_1`
|
||||
```csv
|
||||
# Name, Type, SubType, Offset, Size
|
||||
nvs, data, nvs, , 0x6000
|
||||
otadata, data, ota, , 0x2000
|
||||
www_0, data, littlefs, , 1M
|
||||
www_1, data, littlefs, , 1M
|
||||
```
|
||||
|
||||
### 4.2. State Management (NVS)
|
||||
The active partition index (0 or 1) will be stored in Non-Volatile Storage (NVS).
|
||||
- On factory flash via serial, `www_0` is populated.
|
||||
- During boot (`app_main`), the ESP32 reads the NVS key. If the key is empty, it defaults to `0` and mounts `www_0` to the `/www` VFS path.
|
||||
The active partition label (`www_0` or `www_1`) is stored in NVS under the `ota` namespace with the key `active_slot`.
|
||||
- On boot, `main.cpp` checks this key. If missing, it defaults to `www_0`.
|
||||
- The `api_ota_frontend_handler` updates this key only after a 100% successful flash.
|
||||
|
||||
### 4.3. The Update Process (Backend)
|
||||
1. **Identify Slot**: The ESP32 determines which slot is currently *inactive*.
|
||||
2. **Stream Upload**: The new LittleFS image (.bin) is `POST`ed to `/api/ota/frontend`.
|
||||
3. **Write to Flash**: The HTTP handler streams the payload directly to the raw, unmounted inactive partition using `esp_partition_erase_range` and `esp_partition_write`, bypassing LittleFS entirely to save RAM and CPU.
|
||||
4. **Switch**: Once the upload completes successfully, the NVS pointer is updated to point to the newly flashed partition.
|
||||
5. **Reboot**: The ESP32 reboots. The bootloader reads the new NVS value, mounts the updated partition, and the new frontend is served.
|
||||
### 4.3. Resilient Auto-Reload (The "Handshake")
|
||||
To solve the "post-reboot-disconnect" problem, we implemented a two-part recovery logic:
|
||||
1. **Targeted Polling**: The frontend registers an `onReboot` callback. When the OTA succeeds, the `App` enters a `rebooting` state.
|
||||
2. **Resilience**: A dedicated `$effect` in Svelte uses a "stubborn" polling loop. It ignores all connection errors (common while the ESP32 is resetting/reconnecting WiFi) and only refreshes the page once a 200 OK is received from `/api/system/info`.
|
||||
|
||||
*Design Note: We chose an explicit reboot over a hot-swap (unmounting and remounting at runtime) because a reboot is very fast (~2-3 seconds) and guarantees a clean state, closing any open file handles.*
|
||||
## 5. UI/UX Implementation
|
||||
|
||||
### 4.4. Security Decisions
|
||||
Authentication and security for the `/api/ota/frontend` endpoint are deferred. The device operates exclusively on a local, trusted network, making immediate authentication overhead unnecessary for this iteration.
|
||||
### 5.1. Layout Separation
|
||||
- **Frontend Info Card**: Extracted into a standalone component to provide high-level observability (Version, Active Slot, Partition Free Space).
|
||||
- **Advanced Tools**: OTA controls are hidden behind a toggle to prevent accidental triggers and reduce UI clutter.
|
||||
|
||||
## 5. Implementation Steps
|
||||
### 5.2. OTA Polling & Stats
|
||||
- **Partition Space**: The `GET /api/ota/status` endpoint was expanded to return an array of partition objects with `size`, `used`, and `free` bytes.
|
||||
- **Progressive Feedback**: A progress bar provides visual feedback during the partition erase/flash cycle.
|
||||
|
||||
1. **Partition Table**: Update `partitions.csv` with `www_0` and `www_1` (1MB each).
|
||||
2. **Boot Logic**: Update `main.cpp` and `http_server.cpp` to read the active partition from NVS and mount the correct label.
|
||||
3. **API Endpoints**:
|
||||
- Add `GET /api/ota/status` to report the current active slot.
|
||||
- Add `POST /api/ota/frontend` to handle the binary stream.
|
||||
4. **Frontend UI**: Create a standalone "Update" page in the Svelte app that fetches the status and provides a file picker and progress bar for the upload.
|
||||
5. **Build Automation**: Add `mklittlefs` to the Node.js build pipeline to generate `www.bin` alongside the standard `dist` output.
|
||||
## 6. Implementation Results
|
||||
|
||||
### 6.1. Benchmarks
|
||||
| Metric | Result |
|
||||
|---|---|
|
||||
| **Binary Size** | ~19kB (Gzipped) in a 1MB partition image |
|
||||
| **Flash Duration** | ~3-5 seconds for a full 1MB partition |
|
||||
| **Reboot to UI Recovery** | ~15-20 seconds (including WiFi reconnection) |
|
||||
| **Peak Heap during OTA**| Small constant overhead (streaming pattern) |
|
||||
|
||||
### 6.2. Document Links
|
||||
- [Walkthrough & Verification](file:///C:/Users/Paul/.gemini/antigravity/brain/0911543f-7067-430d-b21a-dc50ffda7eea/walkthrough.md)
|
||||
- [Build Instructions](file:///w:/Classified/Calendink/Provider/Documentation/build_frontend.md)
|
||||
- [Backend Implementation](file:///w:/Classified/Calendink/Provider/main/api/ota/frontend.cpp)
|
||||
- [Frontend Component](file:///w:/Classified/Calendink/Provider/frontend/src/lib/OTAUpdate.svelte)
|
||||
|
||||
---
|
||||
*Created by Antigravity - Last Updated: 2026-03-03*
|
||||
|
||||
@@ -121,16 +121,16 @@ The frontend calls the ESP32's REST API. The base URL depends on the environment
|
||||
|
||||
This is handled via Vite's `.env.development` and `.env.production` files. The value is baked in at compile time — zero runtime overhead.
|
||||
|
||||
## 9. OTA Considerations (Future)
|
||||
## 9. OTA & Versioning Implementation
|
||||
|
||||
When OTA updates are implemented, the frontend will be embedded into the firmware binary as a C header array:
|
||||
Instead of embedding the UI directly into the firmware binary as originally considered, we implemented a **Standalone Partition OTA** for maximum flexibility:
|
||||
|
||||
1. `npm run build:esp32` → `dist/index.html.gz` (~16kB)
|
||||
2. A script converts the gzipped file to a C `const uint8_t[]` array
|
||||
3. The array is compiled into the firmware binary
|
||||
4. OTA flashes one binary that includes both firmware and frontend
|
||||
1. **A/B Partitioning**: The frontend is staged to an inactive LittleFS slot (`www_0` or `www_1`).
|
||||
2. **Semantic Versioning**: `version.json` tracks `major.minor.revision`.
|
||||
3. **Auto-Increment**: A custom `node scripts/package.js` script automatically increments the revision and generates a versioned binary (e.g., `www_v0.1.6.bin`).
|
||||
4. **Resilient UX**: The Svelte app implements "Resilient Recovery Polling" — it sets a dedicated `isRecovering` flag during reboot, which suppresses connection errors and keeps polling until the device is confirmed back online.
|
||||
|
||||
This avoids needing a separate SPIFFS partition for the frontend and ensures the UI always matches the firmware version.
|
||||
This decoupled approach allows for rapid frontend iteration without touching the 1MB+ firmware binary.
|
||||
|
||||
## 10. Summary
|
||||
|
||||
@@ -183,8 +183,8 @@ Provider/frontend/
|
||||
|
||||
- System info display: chip model, free heap, uptime, firmware version, connection type
|
||||
- Reboot button with confirmation modal
|
||||
- Auto-refresh polling every 5 seconds
|
||||
- Four status states: loading, connected, offline, rebooting
|
||||
- **Resilient Auto-Reload**: Targeted polling during reboot that handles intermediate connection failures.
|
||||
- **OTA Dashboard**: Dedicated card showing version, active slot, and real-time partition statistics.
|
||||
- Dark theme with custom color tokens
|
||||
- Fully responsive layout
|
||||
|
||||
@@ -208,5 +208,4 @@ Provider/frontend/
|
||||
|
||||
### Known Issues
|
||||
|
||||
- **W: drive**: Vite requires `resolve.preserveSymlinks: true` in `vite.config.js` because `W:` is a `subst` drive mapped to `C:\Dev\...`. Without this, the build fails with `fileName` path resolution errors.
|
||||
- **ESP32 backend not yet implemented**: The frontend expects `GET /api/system/info` and `POST /api/system/reboot` endpoints. These need to be added to `main.cpp` using `esp_http_server`.
|
||||
- **ESP-IDF Header Ordering**: Some C++ linting errors persist regarding unused headers (e.g., `esp_log.h`) that are actually required for macros; these are suppressed or ignored to maintain compatibility with the unity build pattern.
|
||||
|
||||
Reference in New Issue
Block a user