refactor: migrate jobs storage from JSON file to SQLite and add thread-safe access to job and cache states
This commit is contained in:
parent
046689bd09
commit
b99755771e
@ -115,14 +115,19 @@ def list_vms(host, user, password, no_verify_ssl=False):
|
||||
|
||||
|
||||
def wait_for_task(task, action_name='job'):
|
||||
while task.info.state not in (vim.TaskInfo.State.success, vim.TaskInfo.State.error):
|
||||
while True:
|
||||
info = getattr(task, 'info', None)
|
||||
if info and info.state in (vim.TaskInfo.State.success, vim.TaskInfo.State.error):
|
||||
break
|
||||
time.sleep(1)
|
||||
if task.info.state == vim.TaskInfo.State.success:
|
||||
return task.info.result
|
||||
info = task.info
|
||||
if info.state == vim.TaskInfo.State.success:
|
||||
return info.result
|
||||
else:
|
||||
err = task.info.error
|
||||
err = info.error
|
||||
fault_name = err.__class__.__name__ if err else "UnknownFault"
|
||||
err_msg = getattr(err, 'msg', None) or str(err)
|
||||
raise Exception(f"{action_name} did not complete successfully: {err_msg}")
|
||||
raise Exception(f"{action_name} did not complete successfully: {fault_name}: {err_msg}")
|
||||
|
||||
|
||||
def create_snapshot(vm, snap_name, desc="backup snapshot", memory=False, quiesce=False):
|
||||
@ -152,7 +157,7 @@ def download_datastore_file(host, dc_name, datastore_name, ds_path, local_path,
|
||||
print(f"Downloading {ds_path} from datastore {datastore_name} to {local_path}")
|
||||
print(f" URL: {url}")
|
||||
sha256 = hashlib.sha256()
|
||||
with requests.get(url, headers=headers, stream=True, verify=verify_ssl, proxies={"http": None, "https": None}) as r:
|
||||
with requests.get(url, headers=headers, stream=True, verify=verify_ssl, proxies={"http": None, "https": None}, timeout=30) as r:
|
||||
r.raise_for_status()
|
||||
total_bytes = int(r.headers.get('Content-Length', 0))
|
||||
print(f" HTTP {r.status_code}, Content-Length: {total_bytes} bytes")
|
||||
@ -370,7 +375,8 @@ def download_disk_changed_ranges(host, dc_name, ds_name, ds_path, extents,
|
||||
|
||||
with requests.get(url, headers=req_headers, stream=True,
|
||||
verify=verify_ssl,
|
||||
proxies={"http": None, "https": None}) as r:
|
||||
proxies={"http": None, "https": None},
|
||||
timeout=30) as r:
|
||||
if r.status_code not in (200, 206):
|
||||
raise Exception(f"HTTP {r.status_code} for Range {range_header}")
|
||||
|
||||
@ -636,7 +642,7 @@ def _run_backup_impl(host, user, password, vm_name, dest, compress, no_verify_ss
|
||||
# Get VMDK paths and normalize them (strip snapshot suffixes like -000001)
|
||||
# so we always request the base VMDKs which vCenter streams as the full data disk
|
||||
raw_vmdk_refs = vm_disk_vmdk_paths(vm)
|
||||
vmdk_refs = [re.sub(r'-\d+\.vmdk$', '.vmdk', r, flags=re.IGNORECASE) for r in raw_vmdk_refs]
|
||||
vmdk_refs = [re.sub(r'-\d{6}\.vmdk$', '.vmdk', r, flags=re.IGNORECASE) for r in raw_vmdk_refs]
|
||||
vmx_ref = vm_config_vmx_path(vm)
|
||||
|
||||
# Build a map of normalized vmdk_ref -> VirtualDisk device for CBT
|
||||
@ -645,7 +651,7 @@ def _run_backup_impl(host, user, password, vm_name, dest, compress, no_verify_ss
|
||||
if isinstance(dev, vim.vm.device.VirtualDisk):
|
||||
fn = getattr(dev.backing, 'fileName', None)
|
||||
if fn:
|
||||
norm = re.sub(r'-\d+\.vmdk$', '.vmdk', fn, flags=re.IGNORECASE)
|
||||
norm = re.sub(r'-\d{6}\.vmdk$', '.vmdk', fn, flags=re.IGNORECASE)
|
||||
disk_devices[norm] = dev
|
||||
|
||||
# Locate the backup snapshot object for CBT queries
|
||||
@ -664,7 +670,7 @@ def _run_backup_impl(host, user, password, vm_name, dest, compress, no_verify_ss
|
||||
|
||||
# Apply disk filter — only download selected VMDKs
|
||||
if disk_filter is not None:
|
||||
disk_filter_set = {re.sub(r'-\d+\.vmdk$', '.vmdk', f, flags=re.IGNORECASE) for f in disk_filter}
|
||||
disk_filter_set = {re.sub(r'-\d{6}\.vmdk$', '.vmdk', f, flags=re.IGNORECASE) for f in disk_filter}
|
||||
skipped = []
|
||||
filtered_vmdk_refs = []
|
||||
for raw_ref, norm_ref in zip(raw_vmdk_refs, vmdk_refs):
|
||||
|
||||
169
gui_app.py
169
gui_app.py
@ -10,6 +10,7 @@ import time
|
||||
import platform
|
||||
import subprocess
|
||||
import json
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
from pathlib import Path
|
||||
@ -43,32 +44,129 @@ JOBS_DIR = BASE_DIR / 'jobs'
|
||||
JOBS_DIR.mkdir(exist_ok=True)
|
||||
|
||||
JOBS_DB_PATH = BASE_DIR / 'jobs.json'
|
||||
jobs_db_lock = threading.Lock()
|
||||
DB_PATH = BASE_DIR / 'jobs.db'
|
||||
jobs_db_lock = threading.RLock()
|
||||
|
||||
# In-memory job store: {job_id: job_dict}
|
||||
jobs: dict = {}
|
||||
|
||||
def load_jobs_db():
|
||||
global jobs
|
||||
def init_db():
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS jobs (
|
||||
id TEXT PRIMARY KEY,
|
||||
started REAL,
|
||||
status TEXT,
|
||||
data TEXT
|
||||
)
|
||||
''')
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def migrate_old_json_db():
|
||||
if JOBS_DB_PATH.exists():
|
||||
try:
|
||||
with open(JOBS_DB_PATH, 'r', encoding='utf-8') as f:
|
||||
with jobs_db_lock:
|
||||
jobs.clear()
|
||||
jobs.update(json.load(f))
|
||||
old_jobs = json.load(f)
|
||||
if old_jobs and isinstance(old_jobs, dict):
|
||||
init_db()
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
for jid, info in old_jobs.items():
|
||||
cursor.execute("SELECT 1 FROM jobs WHERE id = ?", (jid,))
|
||||
if not cursor.fetchone():
|
||||
cursor.execute(
|
||||
"INSERT INTO jobs (id, started, status, data) VALUES (?, ?, ?, ?)",
|
||||
(jid, info.get('started', 0), info.get('status', ''), json.dumps(info, ensure_ascii=False))
|
||||
)
|
||||
conn.commit()
|
||||
print(f"MIGRATION: Successfully migrated jobs from jobs.json to SQLite database.")
|
||||
finally:
|
||||
conn.close()
|
||||
try:
|
||||
bak_path = BASE_DIR / 'jobs.json.bak'
|
||||
if bak_path.exists():
|
||||
bak_path.unlink()
|
||||
JOBS_DB_PATH.rename(bak_path)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(f"ERROR: Failed to load jobs database: {e}", file=sys.stderr)
|
||||
else:
|
||||
print(f"WARNING: Migration of jobs.json failed: {e}", file=sys.stderr)
|
||||
|
||||
def load_jobs_db():
|
||||
global jobs
|
||||
init_db()
|
||||
migrate_old_json_db()
|
||||
try:
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT id, data FROM jobs")
|
||||
rows = cursor.fetchall()
|
||||
|
||||
with jobs_db_lock:
|
||||
jobs.clear()
|
||||
for jid, data_str in rows:
|
||||
try:
|
||||
jobs[jid] = json.loads(data_str)
|
||||
except Exception as e:
|
||||
print(f"ERROR: Failed to parse job data for {jid}: {e}", file=sys.stderr)
|
||||
|
||||
# Clean up any jobs left in running/queued state across restart
|
||||
updated_jobs = []
|
||||
with jobs_db_lock:
|
||||
for jid, info in jobs.items():
|
||||
if info.get('status') in ('running', 'queued'):
|
||||
info['status'] = 'failed (Interrupted by restart)'
|
||||
info['progress'] = {
|
||||
'pct': 100,
|
||||
'phase': 'failed',
|
||||
'detail': 'Job was interrupted by server restart.'
|
||||
}
|
||||
updated_jobs.append((jid, info))
|
||||
|
||||
if updated_jobs:
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
for jid, info in updated_jobs:
|
||||
cursor.execute(
|
||||
"INSERT OR REPLACE INTO jobs (id, started, status, data) VALUES (?, ?, ?, ?)",
|
||||
(jid, info.get('started', 0), info.get('status', ''), json.dumps(info, ensure_ascii=False))
|
||||
)
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
print(f"ERROR: Failed to update interrupted jobs in SQLite: {e}", file=sys.stderr)
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"ERROR: Failed to load SQLite database: {e}", file=sys.stderr)
|
||||
|
||||
def save_jobs_db():
|
||||
with jobs_db_lock:
|
||||
try:
|
||||
with open(JOBS_DB_PATH, 'w', encoding='utf-8') as f:
|
||||
json.dump(jobs, f, indent=2, ensure_ascii=False)
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
# Remove deleted jobs from SQLite
|
||||
if jobs:
|
||||
placeholders = ','.join('?' for _ in jobs)
|
||||
cursor.execute(f"DELETE FROM jobs WHERE id NOT IN ({placeholders})", list(jobs.keys()))
|
||||
else:
|
||||
cursor.execute("DELETE FROM jobs")
|
||||
|
||||
# Insert or replace active jobs
|
||||
for jid, info in jobs.items():
|
||||
cursor.execute(
|
||||
"INSERT OR REPLACE INTO jobs (id, started, status, data) VALUES (?, ?, ?, ?)",
|
||||
(jid, info.get('started', 0), info.get('status', ''), json.dumps(info, ensure_ascii=False))
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
print(f"ERROR: Failed to save jobs database: {e}", file=sys.stderr)
|
||||
print(f"ERROR: Failed to save jobs database to SQLite: {e}", file=sys.stderr)
|
||||
|
||||
# APScheduler instance
|
||||
scheduler = None
|
||||
@ -216,6 +314,7 @@ _bg_refresh_running: set = set()
|
||||
def _start_bg_refresh(host, user, password, no_verify_ssl):
|
||||
"""Kick off a background thread to refresh the cache if not already running."""
|
||||
key = _cache_key(host, user)
|
||||
with _vm_cache_lock:
|
||||
if key in _bg_refresh_running:
|
||||
return
|
||||
_bg_refresh_running.add(key)
|
||||
@ -224,6 +323,7 @@ def _start_bg_refresh(host, user, password, no_verify_ssl):
|
||||
try:
|
||||
_fetch_and_cache(host, user, password, no_verify_ssl)
|
||||
finally:
|
||||
with _vm_cache_lock:
|
||||
_bg_refresh_running.discard(key)
|
||||
|
||||
t = threading.Thread(target=_worker, daemon=True)
|
||||
@ -431,14 +531,18 @@ def enforce_retention_policy(info, log_path=None):
|
||||
|
||||
def run_job_thread(jid):
|
||||
"""Worker executed in a thread (and by APScheduler)."""
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jid)
|
||||
if not info:
|
||||
return
|
||||
info['status'] = 'running'
|
||||
info['started'] = time.time()
|
||||
info['progress'] = {'pct': 0, 'phase': 'starting', 'detail': 'Initializing…'}
|
||||
save_jobs_db()
|
||||
|
||||
is_cancelled = lambda: jobs.get(jid, {}).get('status') == 'cancelling'
|
||||
def is_cancelled():
|
||||
with jobs_db_lock:
|
||||
return jobs.get(jid, {}).get('status') == 'cancelling'
|
||||
|
||||
vm_names = info.get('vm_names')
|
||||
log_path = str(JOBS_DIR / jid / 'backup.log')
|
||||
@ -446,6 +550,7 @@ def run_job_thread(jid):
|
||||
if vm_names:
|
||||
# Grouped/Batch VM backup run
|
||||
total_vms = len(vm_names)
|
||||
with jobs_db_lock:
|
||||
info['run_dest'] = os.path.join(info['dest'], f"batch-{datetime.fromtimestamp(info['started']).strftime('%Y%m%d%H%M%S')}")
|
||||
save_jobs_db()
|
||||
|
||||
@ -454,6 +559,7 @@ def run_job_thread(jid):
|
||||
|
||||
for idx, vm in enumerate(vm_names):
|
||||
if is_cancelled():
|
||||
with jobs_db_lock:
|
||||
failed_vms.append((vm, "Cancelled by user"))
|
||||
with open(log_path, 'a', encoding='utf-8') as f:
|
||||
f.write(f"\nSkipping VM {idx+1}/{total_vms} ({vm}): Backup cancelled by user\n")
|
||||
@ -466,6 +572,7 @@ def run_job_thread(jid):
|
||||
def _cb(prog):
|
||||
prog_pct = prog.get('pct', 0)
|
||||
overall_pct = start_p + int((prog_pct / 100) * (end_p - start_p))
|
||||
with jobs_db_lock:
|
||||
info['progress'] = {
|
||||
'pct': overall_pct,
|
||||
'phase': f'vm {vm_idx+1}/{total} ({vm_n})',
|
||||
@ -506,15 +613,19 @@ def run_job_thread(jid):
|
||||
is_cancelled_cb=is_cancelled,
|
||||
use_cbt=info.get('use_cbt', False),
|
||||
)
|
||||
with jobs_db_lock:
|
||||
success_vms.append(vm)
|
||||
except Exception as e:
|
||||
if "cancelled by user" in str(e).lower():
|
||||
is_cancel_err = "cancelled by user" in str(e).lower()
|
||||
if is_cancel_err:
|
||||
with jobs_db_lock:
|
||||
failed_vms.append((vm, "Cancelled by user"))
|
||||
info['status'] = 'failed (Cancelled)'
|
||||
info['progress'] = {'pct': 100, 'phase': 'failed', 'detail': 'Backup cancelled by user'}
|
||||
save_jobs_db()
|
||||
break
|
||||
else:
|
||||
with jobs_db_lock:
|
||||
failed_vms.append((vm, str(e)))
|
||||
with open(log_path, 'a', encoding='utf-8') as f:
|
||||
f.write(f"\nERROR backing up VM {vm}: {e}\n\n")
|
||||
@ -528,6 +639,7 @@ def run_job_thread(jid):
|
||||
}
|
||||
enforce_retention_policy(vm_info, log_path=log_path)
|
||||
|
||||
with jobs_db_lock:
|
||||
if failed_vms:
|
||||
if success_vms:
|
||||
info['status'] = f"finished with errors (Failed: {', '.join([f[0] for f in failed_vms])})"
|
||||
@ -547,10 +659,12 @@ def run_job_thread(jid):
|
||||
# Single VM backup run (original behavior)
|
||||
run_timestamp = datetime.fromtimestamp(info['started']).strftime('%Y%m%d%H%M%S')
|
||||
run_dest = os.path.join(info['dest'], info['vm_name'], f"backup-{run_timestamp}")
|
||||
with jobs_db_lock:
|
||||
info['run_dest'] = run_dest
|
||||
save_jobs_db()
|
||||
|
||||
def progress_cb(prog):
|
||||
with jobs_db_lock:
|
||||
info['progress'] = prog
|
||||
|
||||
try:
|
||||
@ -573,10 +687,12 @@ def run_job_thread(jid):
|
||||
is_cancelled_cb=is_cancelled,
|
||||
use_cbt=info.get('use_cbt', False),
|
||||
)
|
||||
with jobs_db_lock:
|
||||
info['status'] = 'finished'
|
||||
info['progress'] = {'pct': 100, 'phase': 'done', 'detail': 'Backup completed successfully'}
|
||||
save_jobs_db()
|
||||
except Exception as e:
|
||||
with jobs_db_lock:
|
||||
if "cancelled by user" in str(e).lower():
|
||||
info['status'] = 'failed (Cancelled)'
|
||||
info['progress'] = {'pct': 100, 'phase': 'failed', 'detail': 'Backup cancelled by user'}
|
||||
@ -633,6 +749,7 @@ def create_and_start_job(
|
||||
'retention_value': retention_value,
|
||||
'use_cbt': use_cbt,
|
||||
}
|
||||
with jobs_db_lock:
|
||||
jobs[jid] = info
|
||||
|
||||
if schedule_type == 'now' or not HAS_SCHEDULER:
|
||||
@ -961,9 +1078,11 @@ def batch_jobs():
|
||||
@app.route('/jobs')
|
||||
@login_required
|
||||
def list_jobs():
|
||||
with jobs_db_lock:
|
||||
sorted_items = sorted(jobs.items(), key=lambda x: x[1].get('started', 0), reverse=True)
|
||||
job_list = [
|
||||
job_to_display(jid, info)
|
||||
for jid, info in sorted(jobs.items(), key=lambda x: x[1].get('started', 0), reverse=True)
|
||||
for jid, info in sorted_items
|
||||
]
|
||||
scheduled_count = sum(1 for j in job_list if j['schedule_id'])
|
||||
return render_template('jobs.html', jobs=job_list, scheduled_count=scheduled_count)
|
||||
@ -974,15 +1093,18 @@ def list_jobs():
|
||||
@app.route('/job/<jobid>')
|
||||
@login_required
|
||||
def job_detail(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
return render_template('job_detail.html', job=job_to_display(jobid, info))
|
||||
job_disp = job_to_display(jobid, info)
|
||||
return render_template('job_detail.html', job=job_disp)
|
||||
|
||||
|
||||
@app.route('/job/<jobid>/log')
|
||||
@login_required
|
||||
def job_log(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
@ -997,19 +1119,23 @@ def job_log(jobid):
|
||||
@app.route('/api/job/<jobid>/status')
|
||||
@login_required
|
||||
def api_job_status(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
return jsonify({'error': 'not found'}), 404
|
||||
status = info.get('status', 'unknown')
|
||||
progress = info.get('progress', {'pct': 0, 'phase': '', 'detail': ''})
|
||||
return jsonify({
|
||||
'status': info.get('status', 'unknown'),
|
||||
'status': status,
|
||||
'id': jobid,
|
||||
'progress': info.get('progress', {'pct': 0, 'phase': '', 'detail': ''}),
|
||||
'progress': progress,
|
||||
})
|
||||
|
||||
|
||||
@app.route('/job/<jobid>/cancel-schedule', methods=['POST'])
|
||||
@login_required
|
||||
def cancel_schedule(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
@ -1029,6 +1155,7 @@ def cancel_schedule(jobid):
|
||||
@app.route('/job/<jobid>/reactivate-schedule', methods=['POST'])
|
||||
@login_required
|
||||
def reactivate_schedule(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
@ -1053,6 +1180,7 @@ def reactivate_schedule(jobid):
|
||||
@app.route('/job/<jobid>/run', methods=['POST'])
|
||||
@login_required
|
||||
def run_job_now(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
@ -1060,6 +1188,10 @@ def run_job_now(jobid):
|
||||
flash('Backup is already running or queued.', 'warning')
|
||||
return redirect(url_for('job_detail', jobid=jobid))
|
||||
|
||||
# Mark status as queued atomically to prevent double run race condition
|
||||
info['status'] = 'queued'
|
||||
save_jobs_db()
|
||||
|
||||
# Start backup execution in a background thread
|
||||
t = threading.Thread(target=run_job_thread, args=(jobid,), daemon=True)
|
||||
t.start()
|
||||
@ -1070,6 +1202,7 @@ def run_job_now(jobid):
|
||||
@app.route('/job/<jobid>/stop', methods=['POST'])
|
||||
@login_required
|
||||
def stop_job(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
@ -1086,6 +1219,7 @@ def stop_job(jobid):
|
||||
@app.route('/job/<jobid>/delete', methods=['POST'])
|
||||
@login_required
|
||||
def delete_job(jobid):
|
||||
with jobs_db_lock:
|
||||
info = jobs.get(jobid)
|
||||
if not info:
|
||||
abort(404)
|
||||
@ -1099,7 +1233,6 @@ def delete_job(jobid):
|
||||
pass
|
||||
|
||||
# Remove from jobs dict
|
||||
with jobs_db_lock:
|
||||
jobs.pop(jobid, None)
|
||||
save_jobs_db()
|
||||
|
||||
|
||||
@ -70,14 +70,19 @@ def find_vm_by_name(content, vm_name):
|
||||
|
||||
|
||||
def wait_for_task(task, action_name='job'):
|
||||
while task.info.state not in (vim.TaskInfo.State.success, vim.TaskInfo.State.error):
|
||||
while True:
|
||||
info = getattr(task, 'info', None)
|
||||
if info and info.state in (vim.TaskInfo.State.success, vim.TaskInfo.State.error):
|
||||
break
|
||||
time.sleep(1)
|
||||
if task.info.state == vim.TaskInfo.State.success:
|
||||
return task.info.result
|
||||
info = task.info
|
||||
if info.state == vim.TaskInfo.State.success:
|
||||
return info.result
|
||||
else:
|
||||
err = task.info.error
|
||||
err = info.error
|
||||
fault_name = err.__class__.__name__ if err else "UnknownFault"
|
||||
err_msg = getattr(err, 'msg', None) or str(err)
|
||||
raise Exception(f"{action_name} did not complete successfully: {err_msg}")
|
||||
raise Exception(f"{action_name} did not complete successfully: {fault_name}: {err_msg}")
|
||||
|
||||
|
||||
def create_snapshot(vm, snap_name, desc="backup snapshot", memory=False, quiesce=False):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user