fix: comprehensive reliability and robustness improvements
Some checks failed
release / build-and-release (push) Failing after 32s

Critical fixes:
- Fix resume mode: StepsByIDs returned Enabled=false, all resume steps
  would be SKIPPED (deployment could never resume after reboot)
- Add reboot loop protection: per-step retry counter (max 5) prevents
  infinite reboot cycles when a step always exits with code 9
- Block reboot when state.Save() fails in resumePhase (prevents state
  loss leading to full restart from scratch)
- Atomic state file write (write-to-tmp + rename) prevents JSON
  corruption on BSOD/power loss mid-write
- Script watchdog: kills scripts after 30 min of no output (resets on
  each line, so active long-running scripts are never killed)
- Fix copyFile: check Close() error explicitly instead of deferred
  close that silently drops flush errors (e.g. disk full)

High severity:
- Cleanup() now logs errors instead of silently ignoring them
- Email report: up to 3 send attempts with backoff + always saves C:\X9\report.html
- Winget parallel jobs: 10 min timeout, kill hung jobs
- UCPD stop verification: 2s wait + state check before PDF association
- Atera installer: /qn -> /qb so MFA window can appear
- GVLK activation: match by EditionID (registry, not localized) instead
  of fragile OS caption string matching

Medium severity:
- Default profile hive unload: retry loop (5 attempts, increasing delay)
- LayoutModification.xml: UTF-8 without BOM (PS 5.1 Set-Content adds BOM)
- Set-Reg SYSTEM task: try/finally ensures temp file + task cleanup
- Windows Update: @($available).Count for PS 5.1 single-result edge case
- config.json: add missing kmsServer field in activation section

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
X9 Dev 2026-04-28 11:49:43 +02:00
parent 0cfe7510aa
commit d30767ef8b
11 changed files with 192 additions and 60 deletions

View file

@ -9,7 +9,8 @@
"username": "adminx9"
},
"activation": {
"productKey": ""
"productKey": "",
"kmsServer": ""
},
"software": {
"install": [

View file

@ -70,16 +70,50 @@ func Run(cfg config.Config, runCfg runner.RunConfig, cfgPath string) {
// Resume mode
// --------------------------------------------------------------------------
// maxStepRetries is the maximum number of reboot cycles a single step is
// allowed before it is marked as ERROR and skipped.
const maxStepRetries = 5
func resumePhase(st *state.State, runCfg runner.RunConfig) {
runCfg.LogFile = st.LogFile
// Initialise retry counts (backward-compat with older state files)
if st.RetryCounts == nil {
st.RetryCounts = make(map[string]int)
}
steps := runner.StepsByIDs(st.PendingSteps)
// StepsByIDs returns Enabled=false; resume steps must be enabled.
for i := range steps {
steps[i].Enabled = true
}
// Check retry limits — skip steps that have been retried too many times
for i := range steps {
if st.RetryCounts[steps[i].ID] >= maxStepRetries {
steps[i].Enabled = false
st.Results = append(st.Results, state.StepResult{
StepID: steps[i].ID,
Num: steps[i].Num,
Name: steps[i].Name,
Status: "ERROR",
})
}
}
results, needsReboot := runPhase(runCfg, steps, true)
// Accumulate completed results (the NeedsReboot step is excluded because it runs again)
newResults := append(st.Results, toStateResults(results)...) //nolint:gocritic
if needsReboot {
// Increment retry counts for steps that requested reboot
for _, r := range results {
if r.NeedsReboot {
st.RetryCounts[r.Step.ID]++
}
}
// Update state and reboot; infrastructure is already in place (running as adminx9)
pending := pendingStepIDs(steps, results)
newSt := &state.State{
@ -88,9 +122,11 @@ func resumePhase(st *state.State, runCfg runner.RunConfig) {
LogFile: st.LogFile,
PendingSteps: pending,
Results: newResults,
RetryCounts: st.RetryCounts,
}
if err := state.Save(newSt); err != nil {
walk.MsgBox(nil, "Chyba", "Nelze ulozit stav: "+err.Error(), walk.MsgBoxIconError)
return // do NOT reboot if state was not persisted
}
reboot()
return
@ -636,12 +672,9 @@ func donePhase(currentResults []runner.Result, prevResults []state.StepResult) {
summaryText := fmt.Sprintf("OK: %d CHYBY: %d PRESKOCENO: %d", ok, errs, skipped)
// Send email report (non-blocking, best-effort)
// Send email report (non-blocking; report.Send retries and saves local copy)
go func() {
if err := report.Send(emailRows); err != nil {
// Log but don't block - deployment is done
_ = err
}
_ = report.Send(emailRows)
}()
cancelReboot := make(chan struct{})

View file

@ -61,9 +61,15 @@ func Prepare() error {
// Cleanup disables autologon and removes the X9-Resume scheduled task.
// Called when all deployment steps have completed successfully.
// Errors are logged to stderr (visible in Deploy.log via runner) but do not
// block completion — deployment is already done at this point.
func Cleanup() {
_ = disableAutologon()
_ = unregisterResumeTask()
if err := disableAutologon(); err != nil {
fmt.Fprintf(os.Stderr, "[WARN] Cleanup: disableAutologon failed: %v\n", err)
}
if err := unregisterResumeTask(); err != nil {
fmt.Fprintf(os.Stderr, "[WARN] Cleanup: unregisterResumeTask failed: %v\n", err)
}
}
// ensureAdminx9User creates the adminx9 local account if absent.
@ -120,9 +126,13 @@ func copyFile(src, dst string) error {
if err != nil {
return err
}
defer w.Close()
_, err = io.Copy(w, r)
if _, err = io.Copy(w, r); err != nil {
w.Close()
return err
}
// Explicit Close: on Windows the actual flush happens here.
// A deferred close would silently drop write errors (e.g. disk full).
return w.Close()
}
func setAutologon() error {

View file

@ -5,6 +5,7 @@ import (
"fmt"
"net/smtp"
"os"
"path/filepath"
"strings"
"time"
)
@ -19,6 +20,9 @@ const (
mailTo = "net@x9.cz"
)
// localReportPath is where a local HTML copy of the report is always saved.
const localReportPath = `C:\X9\report.html`
// StepResult holds one row of the deployment report.
type StepResult struct {
Num string
@ -27,8 +31,9 @@ type StepResult struct {
Elapsed time.Duration
}
// Send emails the deployment report. Non-fatal: returns error but caller
// should log it and continue (deployment is already done).
// Send builds the deployment report, saves a local HTML copy to C:\X9\,
// and emails it via SMTP, making up to 3 attempts. Returns the last SMTP
// error if all attempts fail. Writing the local copy is always attempted
// first; a failure to write it is only logged and does not stop the email.
func Send(results []StepResult) error {
hostname, _ := os.Hostname()
now := time.Now().Format("2006-01-02 15:04")
@ -36,6 +41,31 @@ func Send(results []StepResult) error {
subject := fmt.Sprintf("xetup report %s", hostname)
body := buildHTML(results, hostname, now)
// Always save local copy so technician has a record even if SMTP fails
_ = os.MkdirAll(filepath.Dir(localReportPath), 0755)
if err := os.WriteFile(localReportPath, []byte(body), 0644); err != nil {
fmt.Fprintf(os.Stderr, "[WARN] Failed to save local report: %v\n", err)
}
// Try SMTP up to 3 times, waiting 1s and then 5s before the second and third attempts
delays := []time.Duration{0, 1 * time.Second, 5 * time.Second}
var lastErr error
for attempt, delay := range delays {
if delay > 0 {
time.Sleep(delay)
}
if err := sendMail(subject, body); err != nil {
lastErr = err
fmt.Fprintf(os.Stderr, "[WARN] Email attempt %d/3 failed: %v\n", attempt+1, err)
continue
}
return nil
}
fmt.Fprintf(os.Stderr, "[ERROR] All email attempts failed. Local copy saved: %s\n", localReportPath)
return lastErr
}
func sendMail(subject, body string) error {
msg := strings.Join([]string{
"From: " + mailFrom,
"To: " + mailTo,

View file

@ -217,6 +217,10 @@ func (r *Runner) Stop() {
}
}
// silenceTimeout is how long a script may produce no output before the
// watchdog kills it. Active scripts (producing output) are never killed.
const silenceTimeout = 30 * time.Minute
func (r *Runner) runScript(ctx context.Context, step Step, cfgArg string) error {
scriptPath := filepath.Join(r.cfg.ScriptsDir, step.ScriptName)
@ -251,8 +255,24 @@ func (r *Runner) runScript(ctx context.Context, step Step, cfgArg string) error
return err
}
// Watchdog: kill script if it produces no output for silenceTimeout.
// The timer is reset on every output line, so active scripts run
// indefinitely (e.g. Dell BIOS download producing progress dots).
watchdog := time.AfterFunc(silenceTimeout, func() {
r.onLog(LogLine{
StepID: step.ID,
Text: fmt.Sprintf("[WATCHDOG] No output for %v - killing script", silenceTimeout),
Level: "ERROR",
})
if cmd.Process != nil {
cmd.Process.Kill()
}
})
defer watchdog.Stop()
scanner := bufio.NewScanner(stdout)
for scanner.Scan() {
watchdog.Reset(silenceTimeout)
line := scanner.Text()
if skipPSNoiseLine(line) {
continue

View file

@ -30,6 +30,7 @@ type State struct {
LogFile string `json:"logFile"`
PendingSteps []string `json:"pendingSteps"` // step IDs to run next, in canonical order
Results []StepResult `json:"results"` // accumulated across all rounds
RetryCounts map[string]int `json:"retryCounts,omitempty"` // per-step reboot retry counter
}
// Load reads the state file. Returns a non-nil error when the file is absent.
@ -45,17 +46,24 @@ func Load() (*State, error) {
return &s, nil
}
// Save writes the state file, creating parent directories as needed.
// Save writes the state file atomically (write-to-temp + rename), creating
// parent directories as needed. This prevents corruption if the system
// crashes mid-write (e.g. BSOD, power loss).
func Save(s *State) error {
p := statePath()
if err := os.MkdirAll(filepath.Dir(p), 0755); err != nil {
dir := filepath.Dir(p)
if err := os.MkdirAll(dir, 0755); err != nil {
return err
}
data, err := json.MarshalIndent(s, "", " ")
if err != nil {
return err
}
return os.WriteFile(p, data, 0644)
tmp := p + ".tmp"
if err := os.WriteFile(tmp, data, 0644); err != nil {
return err
}
return os.Rename(tmp, p)
}
// Delete removes the state file. Silently ignores not-found.

View file

@ -14,7 +14,7 @@
7-zip-7zip-7zip: Installs 7-Zip (winget ID: 7zip.7zip). Used for archive management. Silent install with --accept-package-agreements --accept-source-agreements flags required for unattended deployment.
adobe-acrobat-reader-64-bit-adobe-acroba: Installs Adobe Acrobat Reader DC 64-bit (Adobe.Acrobat.Reader.64-bit). Required as the default PDF viewer to prevent Edge from handling PDFs in browser mode, which limits functionality.
openvpn-connect-openvpntechnologies-open: Installs OpenVPN Connect client. Used for client VPN access when the client network requires a VPN. The ovpn profile and credentials are configured separately per client.
atera-agent-install: Atera RMM agent downloaded from x9.servicedesk.atera.com and installed via msiexec /qn. During install, Atera MSI shows an interactive MFA window - technician enters the code to complete registration. Agent enables MSP monitoring, remote access, and ticketing integration.
atera-agent-install: Atera RMM agent downloaded from x9.servicedesk.atera.com and installed via msiexec /qb. During install, Atera MSI shows an interactive MFA window - technician enters the code to complete registration. Agent enables MSP monitoring, remote access, and ticketing integration.
adobe-pdf-default-pdf-acrord32-po-instal: Sets .pdf -> AcroRd32 file association after Acrobat install via HKCR (system-wide, no UserChoice hash issue). UCPD driver is stopped immediately before the write and restarted after to ensure the association persists across Edge updates.
ucpd-sys-kernel-driver-od-feb-2024-bloku: UCPD.sys (User Choice Protection Driver) is stopped before the PDF association write and restarted after. Pattern: Stop-Service ucpd -> set HKCR\.pdf -> Start-Service ucpd. Implemented in this script.
#>
@ -90,7 +90,15 @@ if (Get-Feature $Config "software" "wingetInstalls") {
# Wait for all jobs and collect results
Write-Log " Waiting for $($jobs.Count) installs to complete..." -Level INFO
$jobs | Wait-Job | Out-Null
$jobs | Wait-Job -Timeout 600 | Out-Null
# Kill any jobs that are still running after timeout
foreach ($job in $jobs) {
if ($job.State -eq "Running") {
Write-Log " Timeout: $($job.Name) - killing" -Level ERROR
Stop-Job -Job $job
}
}
foreach ($job in $jobs) {
$r = Receive-Job -Job $job
@ -124,8 +132,15 @@ if (Get-Feature $Config "software" "pdfDefault") {
if ($ucpdSvc) {
try {
Stop-Service -Name "ucpd" -Force -ErrorAction Stop
# Wait for the service to fully stop before writing association
Start-Sleep -Seconds 2
$svcState = (Get-Service -Name "ucpd" -ErrorAction SilentlyContinue).Status
if ($svcState -eq "Stopped") {
$ucpdStopped = $true
Write-Log " UCPD driver stopped" -Level OK
} else {
Write-Log " UCPD still in state '$svcState' - association may not persist" -Level WARN
}
}
catch {
Write-Log " Could not stop UCPD: $_ (association may not persist on some builds)" -Level WARN
@ -188,7 +203,7 @@ if (Get-Feature $Config "software" "pdfDefault") {
# -----------------------------------------------------------------------
# Install Atera RMM Agent
# Download MSI from Atera dashboard API, install via msiexec /qn.
# Download MSI from Atera dashboard API, install via msiexec /qb.
# During install, the Atera MSI shows an interactive MFA window -
# the technician enters the code to complete agent registration.
# -----------------------------------------------------------------------
@ -204,7 +219,7 @@ if (Get-Feature $Config "software" "ateraAgent") {
Write-Log " Download complete" -Level OK
Write-Log " Running installer (MFA window will appear)..." -Level INFO
$msiProc = Start-Process msiexec -ArgumentList "/i `"$ateraMsi`" /qn" -Wait -PassThru
$msiProc = Start-Process msiexec -ArgumentList "/i `"$ateraMsi`" /qb" -Wait -PassThru
if ($msiProc.ExitCode -eq 0) {
Write-Log " Atera agent installed (msiexec exit 0)" -Level OK
} else {

View file

@ -141,6 +141,8 @@ function Set-Reg {
# Retry 2: write via scheduled task running as SYSTEM
# SYSTEM has full registry access regardless of key ACL
$tempScript = $null
$taskName = $null
try {
$regType = switch ($Type) {
"DWord" { "REG_DWORD" }
@ -168,8 +170,6 @@ function Set-Reg {
Register-ScheduledTask -TaskName $taskName -InputObject $task -Force | Out-Null
Start-ScheduledTask -TaskName $taskName
Start-Sleep -Seconds 2
Unregister-ScheduledTask -TaskName $taskName -Confirm:$false -ErrorAction SilentlyContinue
Remove-Item $tempScript -Force -ErrorAction SilentlyContinue
# Verify it was written
$written = (Get-ItemProperty -Path $Path -Name $Name -ErrorAction SilentlyContinue).$Name
@ -182,6 +182,14 @@ function Set-Reg {
catch {
Write-Log " FAILED $Path\$Name - $_" -Level ERROR
}
finally {
if ($taskName) {
Unregister-ScheduledTask -TaskName $taskName -Confirm:$false -ErrorAction SilentlyContinue
}
if ($tempScript) {
Remove-Item $tempScript -Force -ErrorAction SilentlyContinue
}
}
}
}

View file

@ -236,7 +236,8 @@ $pinList
</CustomTaskbarLayoutCollection>
</LayoutModificationTemplate>
"@
$taskbarLayoutXml | Set-Content -Path "$taskbarLayoutDir\LayoutModification.xml" -Encoding UTF8 -Force
$utf8NoBom = New-Object System.Text.UTF8Encoding $false
[System.IO.File]::WriteAllText("$taskbarLayoutDir\LayoutModification.xml", $taskbarLayoutXml, $utf8NoBom)
Write-Log " Taskbar LayoutModification.xml written (profile: $ProfileType)" -Level OK
# NumLock on startup
@ -346,18 +347,26 @@ $pinList
}
finally {
# -----------------------------------------------------------------------
# Unload Default hive - always, even on error
# Unload Default hive - always, even on error. Retry because GC and
# other processes (antivirus) may hold handles briefly.
# -----------------------------------------------------------------------
Write-Log "Unloading Default hive" -Level INFO
$unloaded = $false
for ($attempt = 1; $attempt -le 5; $attempt++) {
[GC]::Collect()
[GC]::WaitForPendingFinalizers()
Start-Sleep -Milliseconds 500
Start-Sleep -Milliseconds ($attempt * 500)
$unloadResult = & reg unload "HKU\$hiveKey" 2>&1
if ($LASTEXITCODE -eq 0) {
Write-Log "Default hive unloaded" -Level OK
} else {
Write-Log "Failed to unload Default hive: $unloadResult" -Level ERROR
Write-Log "Default hive unloaded (attempt $attempt)" -Level OK
$unloaded = $true
break
}
}
if (-not $unloaded) {
Write-Log "Failed to unload Default hive after 5 attempts: $unloadResult" -Level ERROR
Write-Log " New user profiles may not inherit all settings until next reboot" -Level WARN
}
}

View file

@ -11,7 +11,7 @@
.ITEMS
oa3-bios-uefi-klic-kontrola-embedded-ke: Checks for OA3 embedded product key in BIOS/UEFI firmware via SoftwareLicensingService.OA3xOriginalProductKey WMI query. If a key is found, it is installed via slmgr /ipk and activation is attempted. Most OEM machines (since Win8 OA3) have a digital entitlement key in firmware - this path handles them without requiring a key in config.json.
klic-z-config-json-activation-productkey: Reads activation.productKey from config.json. Installs via slmgr.vbs /ipk <key> and activates via slmgr.vbs /ato. Supports MAK (Multiple Activation Key) for volume licensing without KMS, and retail keys. Takes priority over GVLK fallback.
fallback-na-gvlk-kms-client-key-dle-edic: When no key is in config, detects Windows edition via (Get-WmiObject SoftwareLicensingProduct).Name and maps to Microsoft's published GVLK table. Pro: W269N-WFGWX-YVC9B-4J6C9-T83GX, Enterprise: NPPR9-FWDCX-D2C8J-H872K-2YT43, Home: TX9XD-98N7V-6WMQ6-BX7FG-H8Q99.
fallback-na-gvlk-kms-client-key-dle-edic: When no key is in config, detects Windows edition via EditionID registry value (HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion\EditionID) and maps to Microsoft's published GVLK table. EditionID is not localized, unlike Win32_OperatingSystem.Caption. Professional: W269N-WFGWX-YVC9B-4J6C9-T83GX, Enterprise: NPPR9-FWDCX-D2C8J-H872K-2YT43, Core (Home): TX9XD-98N7V-6WMQ6-BX7FG-H8Q99.
volitelny-kms-server-activation-kmsserve: If activation.kmsServer is in config.json, runs slmgr.vbs /skms <server>:<port> before /ato. Used for clients with on-premises KMS infrastructure (common in larger organizations with volume licensing).
preskocit-pokud-jiz-aktivovano: Queries Win32_WindowsLicenseStatus or SoftwareLicensingProduct to check LicenseStatus. Value 1 = Licensed (fully activated). Script skips activation attempt and logs "Windows already activated" to avoid unnecessary slmgr calls.
typ-klice-mak-vs-kms-vs-retail: Key type selection depends on client's Microsoft licensing: MAK = volume license key activates online against Microsoft (limited activations), KMS = requires KMS server on network (VLSC subscription), Retail = individual license from Microsoft Store or OEM.
@ -31,18 +31,15 @@ $Config = Load-Config $ConfigPath
# Replace with your MAK/retail key for standalone activation.
# -----------------------------------------------------------------------
$KmsKeys = @{
# Windows 11
"Windows 11 Pro" = "W269N-WFGWX-YVC9B-4J6C9-T83GX"
"Windows 11 Pro N" = "MH37W-N47XK-V7XM9-C7227-GCQG9"
"Windows 11 Pro Education" = "6TP4R-GNPTD-KYYHQ-7B7DP-J447Y"
"Windows 11 Education" = "NW6C2-QMPVW-D7KKK-3GKT6-VCFB2"
"Windows 11 Enterprise" = "NPPR9-FWDCX-D2C8J-H872K-2YT43"
# Windows 10
"Windows 10 Pro" = "W269N-WFGWX-YVC9B-4J6C9-T83GX"
"Windows 10 Pro N" = "MH37W-N47XK-V7XM9-C7227-GCQG9"
"Windows 10 Education" = "NW6C2-QMPVW-D7KKK-3GKT6-VCFB2"
"Windows 10 Enterprise" = "NPPR9-FWDCX-D2C8J-H872K-2YT43"
"Windows 10 Home" = "TX9XD-98N7V-6WMQ6-BX7FG-H8Q99"
# EditionID -> GVLK (source: docs.microsoft.com/windows-server/get-started/kms-client-activation-keys)
# Same keys work for both Windows 10 and Windows 11
"Professional" = "W269N-WFGWX-YVC9B-4J6C9-T83GX"
"ProfessionalN" = "MH37W-N47XK-V7XM9-C7227-GCQG9"
"ProfessionalEducation" = "6TP4R-GNPTD-KYYHQ-7B7DP-J447Y"
"Education" = "NW6C2-QMPVW-D7KKK-3GKT6-VCFB2"
"Enterprise" = "NPPR9-FWDCX-D2C8J-H872K-2YT43"
"Core" = "TX9XD-98N7V-6WMQ6-BX7FG-H8Q99"
"ProfessionalWorkstation" = "NRG8B-VKK3Q-CXVCJ-9G2XF-6Q84J"
}
# -----------------------------------------------------------------------
@ -93,14 +90,15 @@ if ($licenseStatus -eq 1) {
$keyToUse = $oa3Key
Write-Log " Using OA3 key from firmware" -Level INFO
} else {
# Find matching GVLK key by OS name
# Find matching GVLK key by EditionID (registry value, not localized)
$editionId = (Get-ItemProperty "HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion" -Name EditionID -ErrorAction SilentlyContinue).EditionID
Write-Log " EditionID: $editionId" -Level INFO
$keyToUse = $null
foreach ($entry in $KmsKeys.GetEnumerator()) {
if ($osCaption -like "*$($entry.Key)*") {
$keyToUse = $entry.Value
Write-Log " Matched GVLK key for: $($entry.Key)" -Level INFO
break
}
if ($editionId -and $KmsKeys.ContainsKey($editionId)) {
$keyToUse = $KmsKeys[$editionId]
Write-Log " Matched GVLK key for edition: $editionId" -Level INFO
} else {
Write-Log " No GVLK key for edition: $editionId" -Level WARN
}
}

View file

@ -57,7 +57,7 @@ try {
exit 1
}
if (-not $available -or $available.Count -eq 0) {
if (-not $available -or @($available).Count -eq 0) {
Write-Log " System is fully up to date" -Level OK
Write-Log "Step 12 complete" -Level OK
exit 0