diff --git a/.woodpecker/_retry_failed.sh b/.woodpecker/_retry_failed.sh
new file mode 100644
index 0000000..c357ffd
--- /dev/null
+++ b/.woodpecker/_retry_failed.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Usage: bash .woodpecker/_retry_failed.sh [manage.py test args...]
+#
+# Runs `python manage.py test "$@"`. If any tests fail/error, parses the
+# failure labels out of the combined stdout+stderr and re-runs ONLY those
+# tests — so a single Selenium flake at test 90/93 costs ~22s on retry
+# instead of the full 35-minute step.
+#
+# Django's unittest-based runner prints failures in a predictable shape:
+#
+#     ERROR: test_method (full.dotted.path.TestClass.test_method)
+#     FAIL: test_method (full.dotted.path.TestClass.test_method)
+#
+# The dotted path inside the FIRST pair of parens is what `manage.py test`
+# accepts as a label. A failing subTest appends a second paren group
+# (e.g. `(i=1)`), so the parser must not take the last group. Lines that
+# start with "ERROR: " but carry no `(dotted.path)` — e.g. application
+# log output captured via 2>&1 — are skipped instead of being passed to
+# the retry as bogus labels.
+#
+# Exit semantics:
+#   - First run green → exit 0, no retry.
+#   - First run failed AND label parse found nothing (crashed before any
+#     test reported) → propagate first-run exit code, no retry. Genuine
+#     infra problems shouldn't be silently re-run.
+#   - A module failed to import (unittest reports it under the synthetic
+#     class `unittest.loader._FailedTest`) → propagate first-run exit
+#     code, no retry. That label does not name a real test, so a retry
+#     could run nothing and turn the step green.
+#   - Otherwise → retry just the failed labels; exit with the retry's
+#     exit code. A real (not-flaky) regression fails twice → step still
+#     red, with the focused retry log as the authoritative report (no
+#     need to scroll past the noisy first-run output).
+#
+# Self-contained `--name=value` options from "$@" (--exclude-tag=…,
+# --settings=…) are forwarded to the retry so it runs under the same
+# configuration as the first run. Bare flags and space-separated option
+# values are NOT forwarded: they cannot be told apart from labels here.
+#
+# Run from inside `src/` (Woodpecker preserves cwd across `commands:`,
+# so the upstream `cd ./src` carries through).
+
+set +e # do NOT bail on first failure; we WANT to handle it
+
+# Template ends in the X's: not every mktemp (e.g. BusyBox) accepts a suffix.
+log_file=$(mktemp "${TMPDIR:-/tmp}/ft-retry.XXXXXX") || {
+  echo "_retry_failed.sh: could not create temp log file" >&2
+  exit 1
+}
+trap 'rm -f -- "$log_file"' EXIT
+
+echo "──── First run ────"
+python manage.py test "$@" 2>&1 | tee "$log_file"
+first_rc=${PIPESTATUS[0]} # status of manage.py, not of tee
+
+if [ "$first_rc" -eq 0 ]; then
+  exit 0
+fi
+
+# Parse failure labels from FAIL:/ERROR: header lines. `sed -n … p` emits
+# only lines that really match; `sort -u` dedupes when one test yields
+# several header lines (e.g. multiple failing subTests).
+failed=()
+while IFS= read -r label; do
+  failed+=("$label")
+done < <(
+  sed -nE 's/^(FAIL|ERROR): [^ ]+ \(([[:alnum:]_.]+)\).*$/\2/p' "$log_file" \
+    | LC_ALL=C sort -u
+)
+
+if [ "${#failed[@]}" -eq 0 ]; then
+  echo "──── First run failed, but no FAIL/ERROR labels parseable ────"
+  echo "──── Not retrying — likely an infra problem, not a test flake ────"
+  exit "$first_rc"
+fi
+
+for label in "${failed[@]}"; do
+  case "$label" in
+    unittest.loader._FailedTest*)
+      echo "──── First run hit a test-module import failure ────"
+      echo "──── Not retrying — likely an infra problem, not a test flake ────"
+      exit "$first_rc"
+      ;;
+  esac
+done
+
+# Carry over only self-contained `--name=value` options (see header).
+retry_opts=()
+for arg in "$@"; do
+  case "$arg" in
+    --*=*) retry_opts+=("$arg") ;;
+  esac
+done
+
+echo ""
+echo "──── Retry (${#failed[@]} failed test(s) from first run) ────"
+printf '  %s\n' "${failed[@]}"
+echo "─────────────────────────────────────────────────────"
+echo ""
+
+python manage.py test "${retry_opts[@]}" "${failed[@]}"
+exit "$?"
diff --git a/.woodpecker/main.yaml b/.woodpecker/main.yaml
index 3878936..39caefd 100644
--- a/.woodpecker/main.yaml
+++ b/.woodpecker/main.yaml
@@ -111,7 +111,12 @@ steps:
       # clusters run in test-FTs-room. Channels + two-browser tags already
       # covered upstream. `ls | grep -v | sed` enumerates module dotted-paths
       # from filenames.
-      - python manage.py test --exclude-tag=channels --exclude-tag=two-browser $(ls functional_tests/test_*.py | grep -vE 'test_(game_room|trinket)_' | sed 's|/|.|g;s|\.py||')
+      #
+      # Wrapped in `_retry_failed.sh` so a single Selenium flake (browser
+      # hang, gecko-perms blip, login race) at test N/M doesn't cost the
+      # full step wall-clock on retry — the script parses Django's
+      # FAIL:/ERROR: lines from stdout + re-runs only those labels.
+      - bash ../.woodpecker/_retry_failed.sh --exclude-tag=channels --exclude-tag=two-browser $(ls functional_tests/test_*.py | grep -vE 'test_(game_room|trinket)_' | sed 's|/|.|g;s|\.py||')
     when:
       - event: push
         path:
@@ -145,7 +150,14 @@ steps:
       # table hex SCSS + chair geometry live), so they exercise the same
       # surface as test_game_room_*. Runs in parallel w. test-FTs-non-room
       # (distinct DATABASE_URL paths under /tmp; see split-rationale).
-      - python manage.py test --exclude-tag=channels --exclude-tag=two-browser $(ls functional_tests/test_game_room_*.py functional_tests/test_trinket_*.py | sed 's|/|.|g;s|\.py||')
+      #
+      # `_retry_failed.sh` parses Django FAIL:/ERROR: lines from the first
+      # run's stdout + re-runs just those labels — single-flake retries
+      # cost ~22s instead of the full ~35-min step wall-clock. Genuine
+      # regressions still fail (second run output is the authoritative
+      # report); first-run crashes w. no parseable labels propagate
+      # the original exit code (don't silently mask infra problems).
+      - bash ../.woodpecker/_retry_failed.sh --exclude-tag=channels --exclude-tag=two-browser $(ls functional_tests/test_game_room_*.py functional_tests/test_trinket_*.py | sed 's|/|.|g;s|\.py||')
     when:
       - event: push
         path: