feat: implement backup jobs management UI and detail view templates
This commit is contained in:
parent
9d5dc0420d
commit
8851a1e0e7
195
gui_app.py
195
gui_app.py
@ -76,69 +76,79 @@ if HAS_SCHEDULER:
|
|||||||
scheduler = BackgroundScheduler(daemon=True)
|
scheduler = BackgroundScheduler(daemon=True)
|
||||||
scheduler.start()
|
scheduler.start()
|
||||||
|
|
||||||
|
def register_scheduler_job(info):
    """Register (or re-register) the recurring scheduler job for a backup job.

    Builds a trigger from the schedule fields stored in ``info`` and adds a
    scheduler job with id ``backup-<job id>`` that calls ``run_job_thread``
    with the job id. An existing scheduler job with the same id is replaced.

    Args:
        info: Job record (dict). Reads ``id`` (required), ``schedule_type``,
            ``schedule_time`` ("HH:MM", minute optional), ``weekly_day``,
            ``monthly_day``, ``interval_hours``, ``vm_name``, ``vm_names``
            and ``label``.

    Returns:
        The id of the scheduler job, or ``None`` when the scheduler is not
        available or ``schedule_type`` is not one of ``daily``, ``weekly``,
        ``monthly`` or ``interval``.

    Raises:
        ValueError: propagated from ``int()`` when an hour, minute, weekday
            or interval field is not numeric (for example an empty
            ``schedule_time`` on a cron-style schedule).
    """
    if not HAS_SCHEDULER or not scheduler:
        return None

    jid = info['id']
    schedule_type = info.get('schedule_type')

    trigger = None
    if schedule_type in ('daily', 'weekly', 'monthly'):
        # "HH:MM"; a missing minute part falls back to "00".
        hour, minute = (info.get('schedule_time', '').split(':') + ['00'])[:2]
        cron_fields = {'hour': int(hour), 'minute': int(minute)}
        if schedule_type == 'weekly':
            cron_fields['day_of_week'] = int(info.get('weekly_day', '0'))
        elif schedule_type == 'monthly':
            day_val = info.get('monthly_day', '1')
            # Numeric days are clamped to 1..28; non-numeric values are
            # handed to CronTrigger unchanged.
            if str(day_val).isdigit():
                day_val = max(1, min(28, int(day_val)))
            cron_fields['day'] = day_val
        trigger = CronTrigger(**cron_fields)
    elif schedule_type == 'interval':
        # Empty/zero interval falls back to 24h; never less than 1 hour.
        interval_hours = info.get('interval_hours', '24')
        trigger = IntervalTrigger(hours=max(1, int(interval_hours or 24)))

    if trigger is None:
        return None

    def _runner():
        # Look up run_job_thread when the job fires, not at registration.
        run_job_thread(jid)

    label = info.get('label', '')
    vm_names = info.get('vm_names')
    if vm_names:
        sched_name = f"Backup {len(vm_names)} VMs ({label or jid[:8]})"
    else:
        sched_name = f"Backup {info.get('vm_name')} ({label or jid[:8]})"

    # replace_existing swaps out any job already registered under this id,
    # so no separate remove_job() with a swallow-all except is needed.
    sched_job = scheduler.add_job(
        _runner,
        trigger=trigger,
        id=f'backup-{jid}',
        name=sched_name,
        misfire_grace_time=3600,
        max_instances=1,
        replace_existing=True,
    )
    return sched_job.id
|
||||||
|
|
||||||
def reschedule_active_jobs():
|
def reschedule_active_jobs():
|
||||||
if not HAS_SCHEDULER or not scheduler:
|
if not HAS_SCHEDULER or not scheduler:
|
||||||
return
|
return
|
||||||
rescheduled_count = 0
|
rescheduled_count = 0
|
||||||
for jid, info in list(jobs.items()):
|
for jid, info in list(jobs.items()):
|
||||||
if info.get('status') == 'scheduled' and info.get('schedule_id'):
|
if info.get('schedule_type') and info.get('schedule_type') != 'now' and info.get('schedule_id'):
|
||||||
try:
|
try:
|
||||||
# Remove first to prevent duplicates
|
sched_id = register_scheduler_job(info)
|
||||||
try:
|
if sched_id:
|
||||||
scheduler.remove_job(info['schedule_id'])
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
trigger = None
|
|
||||||
schedule_type = info.get('schedule_type')
|
|
||||||
schedule_time = info.get('schedule_time', '')
|
|
||||||
weekly_day = info.get('weekly_day', '0')
|
|
||||||
monthly_day = info.get('monthly_day', '1')
|
|
||||||
interval_hours = info.get('interval_hours', '24')
|
|
||||||
vm_name = info.get('vm_name')
|
|
||||||
label = info.get('label', '')
|
|
||||||
|
|
||||||
if schedule_type == 'daily':
|
|
||||||
hour, minute = (schedule_time.split(':') + ['00'])[:2]
|
|
||||||
trigger = CronTrigger(hour=int(hour), minute=int(minute))
|
|
||||||
elif schedule_type == 'weekly':
|
|
||||||
hour, minute = (schedule_time.split(':') + ['00'])[:2]
|
|
||||||
trigger = CronTrigger(
|
|
||||||
day_of_week=int(weekly_day),
|
|
||||||
hour=int(hour), minute=int(minute)
|
|
||||||
)
|
|
||||||
elif schedule_type == 'monthly':
|
|
||||||
hour, minute = (schedule_time.split(':') + ['00'])[:2]
|
|
||||||
day_val = monthly_day
|
|
||||||
if str(day_val).isdigit():
|
|
||||||
day_val = max(1, min(28, int(day_val)))
|
|
||||||
trigger = CronTrigger(
|
|
||||||
day=day_val,
|
|
||||||
hour=int(hour), minute=int(minute)
|
|
||||||
)
|
|
||||||
elif schedule_type == 'interval':
|
|
||||||
trigger = IntervalTrigger(hours=max(1, int(interval_hours or 24)))
|
|
||||||
|
|
||||||
if trigger:
|
|
||||||
def make_runner(j):
|
|
||||||
def _runner():
|
|
||||||
run_job_thread(j)
|
|
||||||
return _runner
|
|
||||||
|
|
||||||
vm_names = info.get('vm_names')
|
|
||||||
if vm_names:
|
|
||||||
sched_name = f"Backup {len(vm_names)} VMs ({label or jid[:8]})"
|
|
||||||
else:
|
|
||||||
sched_name = f"Backup {vm_name} ({label or jid[:8]})"
|
|
||||||
|
|
||||||
scheduler.add_job(
|
|
||||||
make_runner(jid),
|
|
||||||
trigger=trigger,
|
|
||||||
id=info['schedule_id'],
|
|
||||||
name=sched_name,
|
|
||||||
misfire_grace_time=3600,
|
|
||||||
max_instances=1,
|
|
||||||
)
|
|
||||||
rescheduled_count += 1
|
rescheduled_count += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"ERROR: Failed to reschedule job {jid}: {e}", file=sys.stderr)
|
print(f"ERROR: Failed to reschedule job {jid}: {e}", file=sys.stderr)
|
||||||
@ -605,50 +615,9 @@ def create_and_start_job(
|
|||||||
t = threading.Thread(target=run_job_thread, args=(jid,), daemon=True)
|
t = threading.Thread(target=run_job_thread, args=(jid,), daemon=True)
|
||||||
t.start()
|
t.start()
|
||||||
else:
|
else:
|
||||||
# Build APScheduler trigger
|
sched_id = register_scheduler_job(info)
|
||||||
trigger = None
|
if sched_id:
|
||||||
if schedule_type == 'daily':
|
info['schedule_id'] = sched_id
|
||||||
hour, minute = (schedule_time.split(':') + ['00'])[:2]
|
|
||||||
trigger = CronTrigger(hour=int(hour), minute=int(minute))
|
|
||||||
elif schedule_type == 'weekly':
|
|
||||||
hour, minute = (schedule_time.split(':') + ['00'])[:2]
|
|
||||||
trigger = CronTrigger(
|
|
||||||
day_of_week=int(weekly_day),
|
|
||||||
hour=int(hour), minute=int(minute)
|
|
||||||
)
|
|
||||||
elif schedule_type == 'monthly':
|
|
||||||
hour, minute = (schedule_time.split(':') + ['00'])[:2]
|
|
||||||
day_val = monthly_day
|
|
||||||
if str(day_val).isdigit():
|
|
||||||
day_val = max(1, min(28, int(day_val)))
|
|
||||||
trigger = CronTrigger(
|
|
||||||
day=day_val,
|
|
||||||
hour=int(hour), minute=int(minute)
|
|
||||||
)
|
|
||||||
elif schedule_type == 'interval':
|
|
||||||
trigger = IntervalTrigger(hours=max(1, int(interval_hours or 24)))
|
|
||||||
|
|
||||||
if trigger:
|
|
||||||
# Capture jid in closure
|
|
||||||
def make_runner(j):
|
|
||||||
def _runner():
|
|
||||||
run_job_thread(j)
|
|
||||||
return _runner
|
|
||||||
|
|
||||||
if vm_names:
|
|
||||||
sched_name = f"Backup {len(vm_names)} VMs ({label or jid[:8]})"
|
|
||||||
else:
|
|
||||||
sched_name = f"Backup {vm_name} ({label or jid[:8]})"
|
|
||||||
|
|
||||||
sched_job = scheduler.add_job(
|
|
||||||
make_runner(jid),
|
|
||||||
trigger=trigger,
|
|
||||||
id=f'backup-{jid}',
|
|
||||||
name=sched_name,
|
|
||||||
misfire_grace_time=3600,
|
|
||||||
max_instances=1,
|
|
||||||
)
|
|
||||||
info['schedule_id'] = sched_job.id
|
|
||||||
info['status'] = 'scheduled'
|
info['status'] = 'scheduled'
|
||||||
else:
|
else:
|
||||||
# Fallback: run now
|
# Fallback: run now
|
||||||
@ -1027,6 +996,30 @@ def cancel_schedule(jobid):
|
|||||||
return redirect(url_for('job_detail', jobid=jobid))
|
return redirect(url_for('job_detail', jobid=jobid))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/job/<jobid>/reactivate-schedule', methods=['POST'])
@login_required
def reactivate_schedule(jobid):
    """Re-register the recurring schedule of a job that has none active.

    Responds 404 for an unknown job id. Otherwise always redirects to the
    job detail page, flashing the outcome:

    * danger  - the job has no recurring ``schedule_type`` (missing or 'now')
    * warning - the job already has a ``schedule_id``
    * success - the schedule was registered; ``schedule_id`` is stored, the
      status becomes 'scheduled' unless the job is running or queued, and
      the jobs database is saved
    * danger  - registration returned nothing or the stored schedule fields
      could not be parsed
    """
    info = jobs.get(jobid)
    if not info:
        abort(404)

    if not info.get('schedule_type') or info.get('schedule_type') == 'now':
        flash('This job does not have a recurring schedule configured.', 'danger')
        return redirect(url_for('job_detail', jobid=jobid))

    if info.get('schedule_id'):
        flash('Schedule is already active.', 'warning')
        return redirect(url_for('job_detail', jobid=jobid))

    try:
        sched_id = register_scheduler_job(info)
    except (ValueError, TypeError):
        # Malformed stored schedule fields (e.g. a non-numeric hour) should
        # surface as the normal failure message, not an HTTP 500.
        app.logger.exception('Failed to reactivate schedule for job %s', jobid)
        sched_id = None

    if sched_id:
        info['schedule_id'] = sched_id
        # Do not clobber the status of a backup that is currently in flight.
        if info.get('status') not in ('running', 'queued'):
            info['status'] = 'scheduled'
        save_jobs_db()
        flash('Recurring schedule reactivated successfully.', 'success')
    else:
        flash('Failed to reactivate schedule.', 'danger')
    return redirect(url_for('job_detail', jobid=jobid))
|
||||||
|
|
||||||
|
|
||||||
@app.route('/job/<jobid>/run', methods=['POST'])
|
@app.route('/job/<jobid>/run', methods=['POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def run_job_now(jobid):
|
def run_job_now(jobid):
|
||||||
|
|||||||
@ -209,6 +209,11 @@
|
|||||||
(Day: {{ ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'][job.weekly_day|int] if (job.weekly_day|string).isdigit() else job.weekly_day }})
|
(Day: {{ ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'][job.weekly_day|int] if (job.weekly_day|string).isdigit() else job.weekly_day }})
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if job.schedule_time %}at {{ job.schedule_time }}{% endif %}
|
{% if job.schedule_time %}at {{ job.schedule_time }}{% endif %}
|
||||||
|
{% if job.schedule_id %}
|
||||||
|
<span class="badge badge-green" style="font-size: 10px; padding: 2px 6px; margin-left: 6px; display: inline-block;">Active</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge badge-red" style="font-size: 10px; padding: 2px 6px; margin-left: 6px; display: inline-block;">Cancelled</span>
|
||||||
|
{% endif %}
|
||||||
{% else %}
|
{% else %}
|
||||||
One-time (Run Now)
|
One-time (Run Now)
|
||||||
{% endif %}
|
{% endif %}
|
||||||
@ -280,6 +285,16 @@
|
|||||||
<button class="btn btn-danger btn-sm" type="submit">Cancel Schedule</button>
|
<button class="btn btn-danger btn-sm" type="submit">Cancel Schedule</button>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
{% elif job.schedule_type and job.schedule_type != 'now' %}
|
||||||
|
<div class="alert alert-warning" style="margin-bottom:20px;">
|
||||||
|
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 8px;"><path d="M10.29 3.86L1.82 18a2 2 0 0 0 1.71 3h16.94a2 2 0 0 0 1.71-3L13.71 3.86a2 2 0 0 0-3.42 0z"/><line x1="12" y1="9" x2="12" y2="13"/><line x1="12" y1="17" x2="12.01" y2="17"/></svg>
|
||||||
|
The recurring schedule for this job is currently cancelled/inactive.
|
||||||
|
<form method="post" action="/job/{{ job.id }}/reactivate-schedule"
|
||||||
|
style="display:inline; margin-left:12px;"
|
||||||
|
onsubmit="return confirm('Reactivate recurring schedule?')">
|
||||||
|
<button class="btn btn-success btn-sm" type="submit">Reactivate Schedule</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
<!-- ── Progress card ── -->
|
<!-- ── Progress card ── -->
|
||||||
|
|||||||
@ -150,6 +150,11 @@
|
|||||||
<svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" style="margin-right: 4px;"><path d="M23 4v6h-6"/><path d="M20.49 15a9 9 0 1 1-2.12-9.36L23 10"/></svg>
|
<svg width="11" height="11" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2.5" stroke-linecap="round" stroke-linejoin="round" style="margin-right: 4px;"><path d="M23 4v6h-6"/><path d="M20.49 15a9 9 0 1 1-2.12-9.36L23 10"/></svg>
|
||||||
{{ job.schedule_type|capitalize }}
|
{{ job.schedule_type|capitalize }}
|
||||||
</span>
|
</span>
|
||||||
|
{% if job.schedule_id %}
|
||||||
|
<span class="badge badge-green" style="font-size: 10px; padding: 2px 6px; display: inline-block;">Active</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge badge-red" style="font-size: 10px; padding: 2px 6px; display: inline-block;">Cancelled</span>
|
||||||
|
{% endif %}
|
||||||
{% else %}
|
{% else %}
|
||||||
<span class="text-muted text-small" style="display: block; margin-bottom: 4px;">One-time</span>
|
<span class="text-muted text-small" style="display: block; margin-bottom: 4px;">One-time</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
@ -190,6 +195,12 @@
|
|||||||
onsubmit="return confirm('Cancel this schedule?')">
|
onsubmit="return confirm('Cancel this schedule?')">
|
||||||
<button class="btn btn-secondary btn-sm" type="submit">Cancel Schedule</button>
|
<button class="btn btn-secondary btn-sm" type="submit">Cancel Schedule</button>
|
||||||
</form>
|
</form>
|
||||||
|
{% elif job.schedule_type and job.schedule_type != 'now' %}
|
||||||
|
<form method="post" action="/job/{{ job.id }}/reactivate-schedule"
|
||||||
|
style="margin: 0;"
|
||||||
|
onsubmit="return confirm('Reactivate this schedule?')">
|
||||||
|
<button class="btn btn-success btn-sm" type="submit">Reactivate Schedule</button>
|
||||||
|
</form>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if job.status != 'running' and job.status != 'queued' %}
|
{% if job.status != 'running' and job.status != 'queued' %}
|
||||||
<form method="post" action="/job/{{ job.id }}/delete"
|
<form method="post" action="/job/{{ job.id }}/delete"
|
||||||
|
|||||||
268
todo.md
Normal file
268
todo.md
Normal file
@ -0,0 +1,268 @@
|
|||||||
|
# vSphere Backup Manager — Enterprise Roadmap
|
||||||
|
|
||||||
|
## Current State ✅
|
||||||
|
|
||||||
|
The backup engine is working. It connects to vCenter, creates crash-consistent snapshots, downloads full VMDK flat disk data and VMX configs, and runs scheduled recurring jobs — all accessible via a modern Flask web UI.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 1 — Core Reliability & Persistence
|
||||||
|
|
||||||
|
These are **non-negotiable** for a production system. Without them, the tool is still a "hobby project."
|
||||||
|
|
||||||
|
### 1.1 — Persistent Job Store
|
||||||
|
|
||||||
|
> **Why:** Currently everything is in RAM. A PM2 restart wipes all job history and kills all schedules.
|
||||||
|
|
||||||
|
- Save `jobs` dict to `jobs.json` on every state change (create, status update, completion)
|
||||||
|
- On app startup, load `jobs.json` and re-register all `scheduled` jobs into APScheduler
|
||||||
|
- Impact: **Zero job loss across restarts**
|
||||||
|
|
||||||
|
### 1.2 — Backup Retention Policies
|
||||||
|
|
||||||
|
> **Why:** Without retention, the backup disk fills up forever.
|
||||||
|
|
||||||
|
- Per-job retention rules: keep last **N** full backups, or keep backups no older than **X days**
|
||||||
|
- Auto-purge old backup directories after a new backup completes successfully
|
||||||
|
- Show retention info and countdown on the Jobs dashboard
|
||||||
|
- Impact: **Prevents disk exhaustion**, critical for unattended operation
|
||||||
|
|
||||||
|
### 1.3 — Email / Webhook Notifications
|
||||||
|
|
||||||
|
> **Why:** Admins can't watch a dashboard 24/7.
|
||||||
|
|
||||||
|
- Send email (SMTP) on: backup success, failure, or warning
|
||||||
|
- Send webhook (Slack, Teams, generic HTTP) on job completion
|
||||||
|
- Configurable per-job or globally
|
||||||
|
- Impact: **Instant alerting** on failures
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 2 — Backup Integrity & Verification
|
||||||
|
|
||||||
|
A backup that can't be verified is a liability, not an asset.
|
||||||
|
|
||||||
|
### 2.1 — Checksum Verification
|
||||||
|
|
||||||
|
> **Why:** Bit-rot, network corruption, or a partial write can silently corrupt a backup.
|
||||||
|
|
||||||
|
- After each file download, compute **SHA-256** of the downloaded file
|
||||||
|
- Store checksums in a `manifest.json` next to each backup
|
||||||
|
- Optionally verify checksums before an upload or restore
|
||||||
|
|
||||||
|
### 2.2 — Backup Manifest & Catalog
|
||||||
|
|
||||||
|
> **Why:** You need a machine-readable record of every backup for audit and restore.
|
||||||
|
|
||||||
|
Each backup produces a `manifest.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"job_id": "...",
|
||||||
|
"vm_name": "Nakivo",
|
||||||
|
"started": "2026-06-22T01:52:00Z",
|
||||||
|
"finished": "2026-06-22T03:10:44Z",
|
||||||
|
"vcenter": "vcsa.noc.pens.ac.id",
|
||||||
|
"snapshot": "backup-1782067446",
|
||||||
|
"files": [
|
||||||
|
{ "path": "Nakivo/Nakivo.vmdk", "size_bytes": 491, "sha256": "..." },
|
||||||
|
{
|
||||||
|
"path": "Nakivo/Nakivo-flat.vmdk",
|
||||||
|
"size_bytes": 17179869184,
|
||||||
|
"sha256": "..."
|
||||||
|
},
|
||||||
|
{ "path": "Nakivo/Nakivo.vmx", "size_bytes": 3065, "sha256": "..." }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 — Test Restore (Dry-Run)
|
||||||
|
|
||||||
|
> **Why:** The only way to know a backup works is to try restoring it.
|
||||||
|
|
||||||
|
- "Verify Backup" button in the UI
|
||||||
|
- Checks: manifest exists, all files present, SHA-256 matches, disk size matches vCenter
|
||||||
|
- Optionally: power on the VM in an isolated network (advanced)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 3 — Backup Strategies (Storage Efficiency)
|
||||||
|
|
||||||
|
### 3.1 — Incremental / Changed Block Tracking (CBT)
|
||||||
|
|
||||||
|
> **Why:** Downloading a full 16 GB disk every night is inefficient. CBT lets you only transfer **changed blocks**.
|
||||||
|
|
||||||
|
- Enable VMware CBT (`changeTrackingEnabled`) on the VM
|
||||||
|
- Use `vim.VirtualMachine.QueryChangedDiskAreas()` (called per virtual disk, identified by its device key) to get only the changed extents
|
||||||
|
- Download only the changed byte ranges from the flat VMDK (HTTP Range requests)
|
||||||
|
- Store deltas alongside the full base backup
|
||||||
|
- Impact: **80–99% reduction** in daily backup transfer size
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
> This is the #1 differentiator between amateur and enterprise backup tools.
|
||||||
|
|
||||||
|
### 3.2 — Deduplication
|
||||||
|
|
||||||
|
> **Why:** Multiple VMs often share identical OS blocks.
|
||||||
|
|
||||||
|
- Block-level deduplication using content hashing (e.g., SHA-256 per 4 MB block)
|
||||||
|
- Store a deduplicated block store; backups reference blocks by hash
|
||||||
|
- Tools: integrate with `zfs send` (if on ZFS) or implement a simple local content-addressable store
|
||||||
|
|
||||||
|
### 3.3 — Compression
|
||||||
|
|
||||||
|
> Already implemented (`zstd`), but it should be integrated more tightly with CBT deltas to allow per-block compression.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 4 — Security & Multi-User
|
||||||
|
|
||||||
|
### 4.1 — Encrypted Credential Storage
|
||||||
|
|
||||||
|
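> **Why:** Currently vCenter passwords are stored in Flask's signed session cookies, which are tamper-evident but not encrypted, so the password is readable by anyone who obtains the cookie.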
|
||||||
|
|
||||||
|
- Store credentials in server-side encrypted store (e.g., using `cryptography.fernet`)
|
||||||
|
- Never transmit plaintext passwords to frontend JavaScript
|
||||||
|
- Support environment variable injection (`VCENTER_PASSWORD`)
|
||||||
|
|
||||||
|
### 4.2 — Role-Based Access Control (RBAC)
|
||||||
|
|
||||||
|
> **Why:** In an enterprise, not everyone should have the same access.
|
||||||
|
|
||||||
|
| Role | Permissions |
|
||||||
|
| -------- | ---------------------------------------------------------------- |
|
||||||
|
| Admin | Full access — create/delete jobs, manage schedules, view all VMs |
|
||||||
|
| Operator | Start/stop jobs, view logs, cannot change schedules |
|
||||||
|
| Viewer | Read-only dashboard access |
|
||||||
|
|
||||||
|
- Local user accounts stored in a SQLite database with bcrypt-hashed passwords
|
||||||
|
- Simple session-based auth or JWT tokens
|
||||||
|
|
||||||
|
### 4.3 — Audit Log
|
||||||
|
|
||||||
|
> **Why:** Who ran a backup? Who deleted a job? Essential for compliance.
|
||||||
|
|
||||||
|
- Persistent append-only audit log
|
||||||
|
- Records: user, action, VM, timestamp, result
|
||||||
|
- Viewable in the UI with filtering
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 5 — Operations & Monitoring
|
||||||
|
|
||||||
|
### 5.1 — REST API
|
||||||
|
|
||||||
|
> **Why:** Integrate with Ansible, Terraform, CI/CD pipelines, or your own monitoring system.
|
||||||
|
|
||||||
|
Expose a full REST API:
|
||||||
|
|
||||||
|
```
|
||||||
|
GET /api/v1/jobs — list all jobs
|
||||||
|
POST /api/v1/jobs — create job
|
||||||
|
GET /api/v1/jobs/{id} — job status + progress
|
||||||
|
POST /api/v1/jobs/{id}/cancel — cancel job
|
||||||
|
GET /api/v1/vms — list VMs
|
||||||
|
GET /api/v1/backups — list completed backups with manifests
|
||||||
|
POST /api/v1/backups/{id}/verify — trigger checksum verify
|
||||||
|
```
|
||||||
|
|
||||||
|
Include API key authentication (`X-API-Key` header).
|
||||||
|
|
||||||
|
### 5.2 — Metrics & Dashboard (Prometheus/Grafana)
|
||||||
|
|
||||||
|
> **Why:** At-a-glance health visibility across all backup jobs.
|
||||||
|
|
||||||
|
- Expose a `/metrics` endpoint (Prometheus format)
|
||||||
|
- Metrics: `backup_duration_seconds`, `backup_size_bytes`, `backup_success_total`, `backup_failure_total`
|
||||||
|
- Build a Grafana dashboard for the backup operations team
|
||||||
|
|
||||||
|
### 5.3 — Multi-vCenter Support
|
||||||
|
|
||||||
|
> **Why:** Enterprises run multiple vCenter clusters.
|
||||||
|
|
||||||
|
- Support multiple saved vCenter connections (not just session-based)
|
||||||
|
- Jobs can target VMs across different vCenter instances
|
||||||
|
- Unified jobs dashboard across all vCenters
|
||||||
|
|
||||||
|
### 5.4 — Storage Backend Plugins
|
||||||
|
|
||||||
|
> **Why:** Not everyone stores backups on local NFS.
|
||||||
|
|
||||||
|
| Backend | Use Case |
|
||||||
|
| ---------------- | --------------------------------- |
|
||||||
|
| NFS (current) | On-prem NAS |
|
||||||
|
| S3 / MinIO | Object storage (on-prem or cloud) |
|
||||||
|
| Azure Blob | Azure-hosted environments |
|
||||||
|
| Rclone (generic) | 60+ cloud providers |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 6 — Disaster Recovery Features
|
||||||
|
|
||||||
|
### 6.1 — Instant VM Recovery
|
||||||
|
|
||||||
|
> **Why:** RTO (Recovery Time Objective) of minutes, not hours.
|
||||||
|
|
||||||
|
- Register the downloaded VMDK directly back to vCenter without full copy
|
||||||
|
- Use `RegisterVM_Task` on the downloaded `.vmx` pointing to the backup directory
|
||||||
|
- If backup is on NFS, this is near-instant (no copy needed)
|
||||||
|
|
||||||
|
### 6.2 — Restore Wizard
|
||||||
|
|
||||||
|
> Add a "Restore" tab to the UI
|
||||||
|
|
||||||
|
- Browse backup catalog → select VM → select restore point → choose target host/datastore
|
||||||
|
- Options: restore in-place (overwrite) or restore as new VM (clone)
|
||||||
|
- Track restore progress like backup progress
|
||||||
|
|
||||||
|
### 6.3 — Off-site Replication
|
||||||
|
|
||||||
|
> **Why:** 3-2-1 backup rule: 3 copies, 2 different media, **1 offsite**.
|
||||||
|
|
||||||
|
- After backup completes, replicate to a secondary NFS, S3, or SFTP target
|
||||||
|
- Run replication to multiple targets in parallel or sequentially
|
||||||
|
- Alert if replication fails even if backup succeeded
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Priority 7 — UI/UX Polish
|
||||||
|
|
||||||
|
### 7.1 — Backup Calendar View
|
||||||
|
|
||||||
|
- Visual calendar showing which VMs were backed up on which days
|
||||||
|
- Color-coded: green = success, red = failure, yellow = warning
|
||||||
|
|
||||||
|
### 7.2 — Storage Analytics
|
||||||
|
|
||||||
|
- Pie chart / bar chart: backup size per VM, storage growth over time
|
||||||
|
- Alert when NFS mount is above 80% full
|
||||||
|
|
||||||
|
### 7.3 — Live Progress Streaming (SSE/WebSocket)
|
||||||
|
|
||||||
|
> **Why:** Currently the log page requires polling. Server-Sent Events provide true live streaming.
|
||||||
|
|
||||||
|
- Replace AJAX polling with `EventSource` (SSE) for real-time log updates
|
||||||
|
- Show a live progress bar with phase labels: Connecting → Snapshot → Downloading → Compressing → Done
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommended Implementation Order
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
graph LR
|
||||||
|
A["1.1 Persistent Jobs"] --> B["1.2 Retention Policies"]
|
||||||
|
B --> C["1.3 Notifications"]
|
||||||
|
C --> D["2.1 Checksums"]
|
||||||
|
D --> E["3.1 CBT Incremental ⭐"]
|
||||||
|
E --> F["4.1 Encrypted Creds"]
|
||||||
|
F --> G["5.1 REST API"]
|
||||||
|
G --> H["6.2 Restore Wizard"]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Phase | Features | Effort | Impact |
|
||||||
|
| ----------- | --------------- | -------- | ------------------------------------- |
|
||||||
|
| **Phase 1** | 1.1 + 1.2 + 1.3 | ~2 days | Survives restarts, alerts on failures |
|
||||||
|
| **Phase 2** | 2.1 + 2.2 + 5.1 | ~3 days | Trusted backups, API integration |
|
||||||
|
| **Phase 3** | 3.1 (CBT) | ~1 week | Game-changer: 90% less bandwidth |
|
||||||
|
| **Phase 4** | 4.1 + 4.2 + 4.3 | ~1 week | Enterprise security & compliance |
|
||||||
|
| **Phase 5** | 6.2 + 5.4 + 5.2 | ~2 weeks | Full DR capability |
|
||||||
Loading…
Reference in New Issue
Block a user