feat: implement backup jobs management UI and detail view templates

2026-06-22 21:29:06 +07:00 · 2026-06-22 21:29:06 +07:00 · 8851a1e0e7
commit 8851a1e0e7
parent 9d5dc0420d
4 changed files with 388 additions and 101 deletions
--- a/gui_app.py
+++ b/gui_app.py
@ -76,69 +76,79 @@ if HAS_SCHEDULER:
    scheduler = BackgroundScheduler(daemon=True)
    scheduler.start()
 def register_scheduler_job(info):
     """Register (or replace) the recurring scheduler job for a backup job.

     Builds a trigger from the schedule fields stored on the job record and
     adds it to the module-level ``scheduler`` under the id
     ``backup-<job id>``. A job already registered under that id is replaced.

     Args:
         info: Job record dict. Reads ``id`` (required), ``schedule_type``,
             ``schedule_time`` (an ``hour:minute`` string), ``weekly_day``,
             ``monthly_day``, ``interval_hours``, ``vm_name``, ``vm_names``
             and ``label``.

     Returns:
         The scheduler job id, or ``None`` when the scheduler is unavailable
         or ``schedule_type`` is not one of daily/weekly/monthly/interval.

     Raises:
         KeyError: If ``info`` has no ``id``.
         ValueError: If a schedule field cannot be parsed (or is rejected by
             the trigger constructor).
     """
     if not HAS_SCHEDULER or not scheduler:
         return None
     jid = info['id']
     trigger = _build_schedule_trigger(info)
     if trigger is None:
         return None

     label = info.get('label', '')
     vm_names = info.get('vm_names')
     if vm_names:
         sched_name = f"Backup {len(vm_names)} VMs ({label or jid[:8]})"
     else:
         sched_name = f"Backup {info.get('vm_name')} ({label or jid[:8]})"

     def _runner():
         # The scheduler invokes the job with no arguments; bind the job id.
         run_job_thread(jid)

     sched_job = scheduler.add_job(
         _runner,
         trigger=trigger,
         id=f'backup-{jid}',
         name=sched_name,
         misfire_grace_time=3600,
         max_instances=1,
         # Replaces a job already registered under this id, instead of a
         # separate remove_job() call whose errors had to be swallowed.
         replace_existing=True,
     )
     return sched_job.id


 def _split_schedule_time(schedule_time):
     """Parse an ``hour:minute`` string into ``(hour, minute)`` integers.

     A missing minute part defaults to 0; parts after the minute are ignored.

     Raises:
         ValueError: If the hour or minute is not an integer.
     """
     parts = str(schedule_time or '').split(':')
     hour, minute = (parts + ['00'])[:2]
     try:
         return int(hour), int(minute)
     except ValueError as err:
         raise ValueError(
             f"Invalid schedule_time {schedule_time!r}; expected 'HH:MM'"
         ) from err


 def _build_schedule_trigger(info):
     """Return the trigger described by ``info``, or ``None`` if it has none."""
     schedule_type = info.get('schedule_type')
     if schedule_type == 'interval':
         interval_hours = info.get('interval_hours', '24')
         # Never schedule more often than once per hour.
         return IntervalTrigger(hours=max(1, int(interval_hours or 24)))
     if schedule_type not in ('daily', 'weekly', 'monthly'):
         return None

     hour, minute = _split_schedule_time(info.get('schedule_time', ''))
     if schedule_type == 'daily':
         return CronTrigger(hour=hour, minute=minute)

     if schedule_type == 'weekly':
         weekly_day = info.get('weekly_day', '0')
         try:
             day_of_week = int(weekly_day)
         except ValueError:
             # Not numeric: pass a weekday name (e.g. 'mon') through to
             # CronTrigger, mirroring how non-numeric monthly days are kept.
             day_of_week = str(weekly_day).strip()
             if not day_of_week:
                 raise ValueError("weekly_day is empty") from None
         return CronTrigger(day_of_week=day_of_week, hour=hour, minute=minute)

     day_val = info.get('monthly_day', '1')
     if str(day_val).isdigit():
         # Clamp to 1..28 so the day exists in every month.
         day_val = max(1, min(28, int(day_val)))
     return CronTrigger(day=day_val, hour=hour, minute=minute)
 def reschedule_active_jobs():
    if not HAS_SCHEDULER or not scheduler:
        return
    rescheduled_count = 0
    for jid, info in list(jobs.items()):
-        if info.get('status') == 'scheduled' and info.get('schedule_id'):
+        if info.get('schedule_type') and info.get('schedule_type') != 'now' and info.get('schedule_id'):
            try:
-                # Remove first to prevent duplicates
+                sched_id = register_scheduler_job(info)
-                try:
+                if sched_id:
                    scheduler.remove_job(info['schedule_id'])
                except Exception:
                    pass
                trigger = None
                schedule_type = info.get('schedule_type')
                schedule_time = info.get('schedule_time', '')
                weekly_day = info.get('weekly_day', '0')
                monthly_day = info.get('monthly_day', '1')
                interval_hours = info.get('interval_hours', '24')
                vm_name = info.get('vm_name')
                label = info.get('label', '')
                if schedule_type == 'daily':
                    hour, minute = (schedule_time.split(':') + ['00'])[:2]
                    trigger = CronTrigger(hour=int(hour), minute=int(minute))
                elif schedule_type == 'weekly':
                    hour, minute = (schedule_time.split(':') + ['00'])[:2]
                    trigger = CronTrigger(
                        day_of_week=int(weekly_day),
                        hour=int(hour), minute=int(minute)
                    )
                elif schedule_type == 'monthly':
                    hour, minute = (schedule_time.split(':') + ['00'])[:2]
                    day_val = monthly_day
                    if str(day_val).isdigit():
                        day_val = max(1, min(28, int(day_val)))
                    trigger = CronTrigger(
                        day=day_val,
                        hour=int(hour), minute=int(minute)
                    )
                elif schedule_type == 'interval':
                    trigger = IntervalTrigger(hours=max(1, int(interval_hours or 24)))
                if trigger:
                    def make_runner(j):
                        def _runner():
                            run_job_thread(j)
                        return _runner
                    vm_names = info.get('vm_names')
                    if vm_names:
                        sched_name = f"Backup {len(vm_names)} VMs ({label or jid[:8]})"
                    else:
                        sched_name = f"Backup {vm_name} ({label or jid[:8]})"
                    scheduler.add_job(
                        make_runner(jid),
                        trigger=trigger,
                        id=info['schedule_id'],
                        name=sched_name,
                        misfire_grace_time=3600,
                        max_instances=1,
                    )
                    rescheduled_count += 1
            except Exception as e:
                print(f"ERROR: Failed to reschedule job {jid}: {e}", file=sys.stderr)
@ -605,50 +615,9 @@ def create_and_start_job(
        t = threading.Thread(target=run_job_thread, args=(jid,), daemon=True)
        t.start()
    else:
-        # Build APScheduler trigger
+        sched_id = register_scheduler_job(info)
-        trigger = None
+        if sched_id:
-        if schedule_type == 'daily':
+            info['schedule_id'] = sched_id
            hour, minute = (schedule_time.split(':') + ['00'])[:2]
            trigger = CronTrigger(hour=int(hour), minute=int(minute))
        elif schedule_type == 'weekly':
            hour, minute = (schedule_time.split(':') + ['00'])[:2]
            trigger = CronTrigger(
                day_of_week=int(weekly_day),
                hour=int(hour), minute=int(minute)
            )
        elif schedule_type == 'monthly':
            hour, minute = (schedule_time.split(':') + ['00'])[:2]
            day_val = monthly_day
            if str(day_val).isdigit():
                day_val = max(1, min(28, int(day_val)))
            trigger = CronTrigger(
                day=day_val,
                hour=int(hour), minute=int(minute)
            )
        elif schedule_type == 'interval':
            trigger = IntervalTrigger(hours=max(1, int(interval_hours or 24)))
        if trigger:
            # Capture jid in closure
            def make_runner(j):
                def _runner():
                    run_job_thread(j)
                return _runner
            if vm_names:
                sched_name = f"Backup {len(vm_names)} VMs ({label or jid[:8]})"
            else:
                sched_name = f"Backup {vm_name} ({label or jid[:8]})"
            sched_job = scheduler.add_job(
                make_runner(jid),
                trigger=trigger,
                id=f'backup-{jid}',
                name=sched_name,
                misfire_grace_time=3600,
                max_instances=1,
            )
            info['schedule_id'] = sched_job.id
            info['status'] = 'scheduled'
        else:
            # Fallback: run now
@ -1027,6 +996,30 @@ def cancel_schedule(jobid):
    return redirect(url_for('job_detail', jobid=jobid))
@app.route('/job/<jobid>/reactivate-schedule', methods=['POST'])
@login_required
def reactivate_schedule(jobid):
    """Re-register the recurring schedule of a job that has none active.

    Applies to jobs whose ``schedule_type`` is set (and is not ``'now'``) but
    whose ``schedule_id`` is empty. On success the new scheduler id is stored
    on the job, the status becomes ``'scheduled'`` unless the job is running
    or queued, and the jobs database is saved.

    Args:
        jobid: Id of the job, taken from the URL.

    Returns:
        A redirect to the job detail page; the outcome is reported through a
        flashed message. Aborts with 404 when the job does not exist.
    """
    info = jobs.get(jobid)
    if not info:
        abort(404)
    if not info.get('schedule_type') or info.get('schedule_type') == 'now':
        flash('This job does not have a recurring schedule configured.', 'danger')
        return redirect(url_for('job_detail', jobid=jobid))
    if info.get('schedule_id'):
        flash('Schedule is already active.', 'warning')
        return redirect(url_for('job_detail', jobid=jobid))
    try:
        sched_id = register_scheduler_job(info)
    except Exception as e:
        # Stored schedule fields that cannot be parsed make registration
        # raise; report that as a failed reactivation instead of an HTTP 500.
        print(
            f"ERROR: Failed to reactivate schedule for job {jobid}: {e}",
            file=sys.stderr,
        )
        sched_id = None
    if sched_id:
        info['schedule_id'] = sched_id
        # Do not overwrite the status of a job that is currently executing.
        if info.get('status') not in ('running', 'queued'):
            info['status'] = 'scheduled'
        save_jobs_db()
        flash('Recurring schedule reactivated successfully.', 'success')
    else:
        flash('Failed to reactivate schedule.', 'danger')
    return redirect(url_for('job_detail', jobid=jobid))
@app.route('/job/<jobid>/run', methods=['POST'])
@login_required
 def run_job_now(jobid):
--- a/templates/job_detail.html
+++ b/templates/job_detail.html
@ -209,6 +209,11 @@
            (Day: {{ ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'][job.weekly_day|int] if (job.weekly_day|string).isdigit() else job.weekly_day }})
          {% endif %}
          {% if job.schedule_time %}at {{ job.schedule_time }}{% endif %}
          {% if job.schedule_id %}
            <span class="badge badge-green" style="font-size: 10px; padding: 2px 6px; margin-left: 6px; display: inline-block;">Active</span>
          {% else %}
            <span class="badge badge-red" style="font-size: 10px; padding: 2px 6px; margin-left: 6px; display: inline-block;">Cancelled</span>
          {% endif %}
        {% else %}
          One-time (Run Now)
        {% endif %}
@ -280,6 +285,16 @@
      <button class="btn btn-danger btn-sm" type="submit">Cancel Schedule</button>
    </form>
  </div>
  {% elif job.schedule_type and job.schedule_type != 'now' %}
  <div class="alert alert-warning" style="margin-bottom:20px;">
    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 8px;"><path d="M10.29 3.86L1.82 18a2 2 0 0 0 1.71 3h16.94a2 2 0 0 0 1.71-3L13.71 3.86a2 2 0 0 0-3.42 0z"/><line x1="12" y1="9" x2="12" y2="13"/><line x1="12" y1="17" x2="12.01" y2="17"/></svg>
    The recurring schedule for this job is currently cancelled/inactive.
    <form method="post" action="/job/{{ job.id }}/reactivate-schedule"
          style="display:inline; margin-left:12px;"
          onsubmit="return confirm('Reactivate recurring schedule?')">
      <button class="btn btn-success btn-sm" type="submit">Reactivate Schedule</button>
    </form>
  </div>
  {% endif %}
  <!-- ── Progress card ── -->
--- a/templates/jobs.html
+++ b/templates/jobs.html
@ -150,6 +150,11 @@
                <svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" style="margin-right: 4px;"><path d="M23 4v6h-6"/><path d="M20.49 15a9 9 0 1 1-2.12-9.36L23 10"/></svg>
                {{ job.schedule_type|capitalize }}
              </span>
              {% if job.schedule_id %}
                <span class="badge badge-green" style="font-size: 10px; padding: 2px 6px; display: inline-block;">Active</span>
              {% else %}
                <span class="badge badge-red" style="font-size: 10px; padding: 2px 6px; display: inline-block;">Cancelled</span>
              {% endif %}
            {% else %}
              <span class="text-muted text-small" style="display: block; margin-bottom: 4px;">One-time</span>
            {% endif %}
@ -190,6 +195,12 @@
                    onsubmit="return confirm('Cancel this schedule?')">
                <button class="btn btn-secondary btn-sm" type="submit">Cancel Schedule</button>
              </form>
              {% elif job.schedule_type and job.schedule_type != 'now' %}
              <form method="post" action="/job/{{ job.id }}/reactivate-schedule"
                    style="margin: 0;"
                    onsubmit="return confirm('Reactivate this schedule?')">
                <button class="btn btn-success btn-sm" type="submit">Reactivate Schedule</button>
              </form>
              {% endif %}
              {% if job.status != 'running' and job.status != 'queued' %}
              <form method="post" action="/job/{{ job.id }}/delete"
--- a/todo.md
+++ b/todo.md
@ -0,0 +1,268 @@
 # vSphere Backup Manager — Enterprise Roadmap
 ## Current State ✅
 The backup engine is working. It connects to vCenter, creates crash-consistent snapshots, downloads full VMDK flat disk data and VMX configs, and runs scheduled recurring jobs — all accessible via a modern Flask web UI.
 ---
 ## Priority 1 — Core Reliability & Persistence
 These are **non-negotiable** for a production system. Without them, the tool is still a "hobby project."
 ### 1.1 — Persistent Job Store
 > **Why:** Currently everything is in RAM. A PM2 restart wipes all job history and kills all schedules.
 - Save `jobs` dict to `jobs.json` on every state change (create, status update, completion)
 - On app startup, load `jobs.json` and re-register all `scheduled` jobs into APScheduler
 - Impact: **Zero job loss across restarts**
 ### 1.2 — Backup Retention Policies
 > **Why:** Without retention, the backup disk fills up forever.
 - Per-job retention rules: keep last **N** full backups, or keep backups no older than **X days**
 - Auto-purge old backup directories after a new backup completes successfully
 - Show retention info and countdown on the Jobs dashboard
 - Impact: **Prevents disk exhaustion**, critical for unattended operation
 ### 1.3 — Email / Webhook Notifications
 > **Why:** Admins can't watch a dashboard 24/7.
 - Send email (SMTP) on: backup success, failure, or warning
 - Send webhook (Slack, Teams, generic HTTP) on job completion
 - Configurable per-job or globally
 - Impact: **Instant alerting** on failures
 ---
 ## Priority 2 — Backup Integrity & Verification
 A backup that can't be verified is a liability, not an asset.
 ### 2.1 — Checksum Verification
 > **Why:** Bit-rot, network corruption, or a partial write can silently corrupt a backup.
 - After each file download, compute **SHA-256** of the downloaded file
 - Store checksums in a `manifest.json` next to each backup
 - Optionally verify checksums before an upload or restore
 ### 2.2 — Backup Manifest & Catalog
 > **Why:** You need a machine-readable record of every backup for audit and restore.
 Each backup produces a `manifest.json`:
 ```json
 {
  "job_id": "...",
  "vm_name": "Nakivo",
  "started": "2026-06-22T01:52:00Z",
  "finished": "2026-06-22T03:10:44Z",
  "vcenter": "vcsa.noc.pens.ac.id",
  "snapshot": "backup-1782067446",
  "files": [
    { "path": "Nakivo/Nakivo.vmdk", "size_bytes": 491, "sha256": "..." },
    {
      "path": "Nakivo/Nakivo-flat.vmdk",
      "size_bytes": 17179869184,
      "sha256": "..."
    },
    { "path": "Nakivo/Nakivo.vmx", "size_bytes": 3065, "sha256": "..." }
  ]
 }
 ```
 ### 2.3 — Test Restore (Dry-Run)
 > **Why:** The only way to know a backup works is to try restoring it.
 - "Verify Backup" button in the UI
 - Checks: manifest exists, all files present, SHA-256 matches, disk size matches vCenter
 - Optionally: power on the VM in an isolated network (advanced)
 ---
 ## Priority 3 — Backup Strategies (Storage Efficiency)
 ### 3.1 — Incremental / Changed Block Tracking (CBT)
 > **Why:** Downloading a full 16 GB disk every night is inefficient. CBT lets you only transfer **changed blocks**.
 - Enable VMware CBT (`changeTrackingEnabled`) on the VM
 - Use `vim.VirtualMachine.QueryChangedDiskAreas()` to get only changed extents
 - Download only the changed byte ranges from the flat VMDK (HTTP Range requests)
 - Store deltas alongside the full base backup
 - Impact: **80–99% reduction** in daily backup transfer size
 > [!IMPORTANT]
 > This is the #1 differentiator between amateur and enterprise backup tools.
 ### 3.2 — Deduplication
 > **Why:** Multiple VMs often share identical OS blocks.
 - Block-level deduplication using content hashing (e.g., SHA-256 per 4 MB block)
 - Store a deduplicated block store; backups reference blocks by hash
 - Tools: integrate with `zfs send` (if on ZFS) or implement a simple local content-addressable store
 ### 3.3 — Compression
 > Already implemented (`zstd`), but it should be integrated more tightly with CBT deltas for per-block compression.
 ---
 ## Priority 4 — Security & Multi-User
 ### 4.1 — Encrypted Credential Storage
 > **Why:** Currently vCenter passwords are kept in Flask's signed cookies, which are tamper-evident but not encrypted, so anyone who obtains the cookie can read the password.
 - Store credentials in server-side encrypted store (e.g., using `cryptography.fernet`)
 - Never transmit plaintext passwords to frontend JavaScript
 - Support environment variable injection (`VCENTER_PASSWORD`)
 ### 4.2 — Role-Based Access Control (RBAC)
 > **Why:** In an enterprise, not everyone should have the same access.
 | Role     | Permissions                                                      |
 | -------- | ---------------------------------------------------------------- |
 | Admin    | Full access — create/delete jobs, manage schedules, view all VMs |
 | Operator | Start/stop jobs, view logs, cannot change schedules              |
 | Viewer   | Read-only dashboard access                                       |
 - Local user accounts stored in a SQLite database with bcrypt-hashed passwords
 - Simple session-based auth or JWT tokens
 ### 4.3 — Audit Log
 > **Why:** Who ran a backup? Who deleted a job? Essential for compliance.
 - Persistent append-only audit log
 - Records: user, action, VM, timestamp, result
 - Viewable in the UI with filtering
 ---
 ## Priority 5 — Operations & Monitoring
 ### 5.1 — REST API
 > **Why:** Integrate with Ansible, Terraform, CI/CD pipelines, or your own monitoring system.
 Expose a full REST API:
 ```
 GET  /api/v1/jobs              — list all jobs
 POST /api/v1/jobs              — create job
 GET  /api/v1/jobs/{id}         — job status + progress
 POST /api/v1/jobs/{id}/cancel  — cancel job
 GET  /api/v1/vms               — list VMs
 GET  /api/v1/backups           — list completed backups with manifests
 POST /api/v1/backups/{id}/verify — trigger checksum verify
 ```
 Include API key authentication (`X-API-Key` header).
 ### 5.2 — Metrics & Dashboard (Prometheus/Grafana)
 > **Why:** At-a-glance health visibility across all backup jobs.
 - Expose a `/metrics` endpoint (Prometheus format)
 - Metrics: `backup_duration_seconds`, `backup_size_bytes`, `backup_success_total`, `backup_failure_total`
 - Build a Grafana dashboard for the backup operations team
 ### 5.3 — Multi-vCenter Support
 > **Why:** Enterprises run multiple vCenter clusters.
 - Support multiple saved vCenter connections (not just session-based)
 - Jobs can target VMs across different vCenter instances
 - Unified jobs dashboard across all vCenters
 ### 5.4 — Storage Backend Plugins
 > **Why:** Not everyone stores backups on local NFS.
 | Backend          | Use Case                          |
 | ---------------- | --------------------------------- |
 | NFS (current)    | On-prem NAS                       |
 | S3 / MinIO       | Object storage (on-prem or cloud) |
 | Azure Blob       | Azure-hosted environments         |
 | Rclone (generic) | 60+ cloud providers               |
 ---
 ## Priority 6 — Disaster Recovery Features
 ### 6.1 — Instant VM Recovery
 > **Why:** RTO (Recovery Time Objective) of minutes, not hours.
 - Register the downloaded VMDK directly back to vCenter without full copy
 - Use `RegisterVM_Task` on the downloaded `.vmx` pointing to the backup directory
 - If backup is on NFS, this is near-instant (no copy needed)
 ### 6.2 — Restore Wizard
 > Add a "Restore" tab to the UI
 - Browse backup catalog → select VM → select restore point → choose target host/datastore
 - Options: restore in-place (overwrite) or restore as new VM (clone)
 - Track restore progress like backup progress
 ### 6.3 — Off-site Replication
 > **Why:** 3-2-1 backup rule: 3 copies, 2 different media, **1 offsite**.
 - After backup completes, replicate to a secondary NFS, S3, or SFTP target
 - Run replication in parallel or sequentially
 - Alert if replication fails even if backup succeeded
 ---
 ## Priority 7 — UI/UX Polish
 ### 7.1 — Backup Calendar View
 - Visual calendar showing which VMs were backed up on which days
 - Color-coded: green = success, red = failure, yellow = warning
 ### 7.2 — Storage Analytics
 - Pie chart / bar chart: backup size per VM, storage growth over time
 - Alert when NFS mount is above 80% full
 ### 7.3 — Live Progress Streaming (SSE/WebSocket)
 > **Why:** Currently the log page requires polling. Server-Sent Events provide true live streaming.
 - Replace AJAX polling with `EventSource` (SSE) for real-time log updates
 - Show a live progress bar with phase labels: Connecting → Snapshot → Downloading → Compressing → Done
 ---
 ## Recommended Implementation Order
 ```mermaid
 graph LR
    A["1.1 Persistent Jobs"] --> B["1.2 Retention Policies"]
    B --> C["1.3 Notifications"]
    C --> D["2.1 Checksums"]
    D --> E["3.1 CBT Incremental ⭐"]
    E --> F["4.1 Encrypted Creds"]
    F --> G["5.1 REST API"]
    G --> H["6.2 Restore Wizard"]
 ```
 | Phase       | Features        | Effort   | Impact                                |
 | ----------- | --------------- | -------- | ------------------------------------- |
 | **Phase 1** | 1.1 + 1.2 + 1.3 | ~2 days  | Survives restarts, alerts on failures |
 | **Phase 2** | 2.1 + 2.2 + 5.1 | ~3 days  | Trusted backups, API integration      |
 | **Phase 3** | 3.1 (CBT)       | ~1 week  | Game-changer: 90% less bandwidth      |
 | **Phase 4** | 4.1 + 4.2 + 4.3 | ~1 week  | Enterprise security & compliance      |
 | **Phase 5** | 6.2 + 5.4 + 5.2 | ~2 weeks | Full DR capability                    |