fix: simplify e2e test wait loops to prevent flaky failures (#1425)

## Summary

- Replace all fixed-timeout wait loops with simple indefinite `while`
loops
- Add 5-second stabilization delay after VPN connection before running
ping tests
- Add server-side tun0 interface verification before signaling client
- Add wait for OpenVPN restart after server certificate renewal

## Problem

Tests fail randomly with errors like:
```
Test 2: Pinging VPN gateway (10.9.0.1)...
10 packets transmitted, 0 received, 100% packet loss
FAIL: Cannot ping VPN gateway
```

Example:
https://github.com/angristan/openvpn-install/actions/runs/20230801728/job/58072998112

## Solution

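Instead of guessing timeout values that may be too short for slow CI
runners, the pre-existing wait loops (signal files, client tun0, ping)
now run indefinitely and rely on the job-level timeout to catch actual
failures. The newly added server-side checks are the exception: the
OpenVPN process check and the tun0 readiness checks each poll up to 30
times, one second apart.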

**Before:**
```bash
MAX_WAIT=60
WAITED=0
while [ condition ] && [ $WAITED -lt $MAX_WAIT ]; do
    sleep 2
    WAITED=$((WAITED + 2))
done
if [ condition ]; then exit 1; fi
```

**After:**
```bash
while [ condition ]; do
    sleep 2
done
```

This is a net reduction of 83 lines (115 added, 198 deleted), most of it
boilerplate timeout logic.
This commit is contained in:
Stanislas
2025-12-15 21:22:22 +01:00
committed by GitHub
parent 0473a35b97
commit 61bd345014
2 changed files with 115 additions and 198 deletions

View File

@@ -14,19 +14,11 @@ echo "TUN device ready"
# Wait for client config to be available
echo "Waiting for client config..."
MAX_WAIT=120
WAITED=0
while [ ! -f /shared/client.ovpn ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/client.ovpn ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for client config..."
done
if [ ! -f /shared/client.ovpn ]; then
echo "ERROR: Client config not found after ${MAX_WAIT}s"
exit 1
fi
echo "Client config found!"
cat /shared/client.ovpn
@@ -47,29 +39,23 @@ openvpn --config /shared/client.ovpn --daemon --log /var/log/openvpn.log
# Wait for connection
echo "Waiting for VPN connection..."
MAX_WAIT=60
WAITED=0
while ! ip addr show tun0 2>/dev/null | grep -q "inet " && [ $WAITED -lt $MAX_WAIT ]; do
while ! ip addr show tun0 2>/dev/null | grep -q "inet "; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for tun0... ($WAITED/$MAX_WAIT seconds)"
# Check for errors
echo "Waiting for tun0..."
# Show recent log for debugging
if [ -f /var/log/openvpn.log ]; then
tail -5 /var/log/openvpn.log
tail -3 /var/log/openvpn.log
fi
done
if ! ip addr show tun0 2>/dev/null | grep -q "inet "; then
echo "ERROR: VPN connection failed"
echo "=== OpenVPN log ==="
cat /var/log/openvpn.log || true
exit 1
fi
echo "=== VPN Connected! ==="
ip addr show tun0
# Allow routing tables to stabilize before running tests
# This prevents race conditions where tun0 is up but routing isn't ready
echo "Waiting for routing to stabilize..."
sleep 5
# Run connectivity tests
echo ""
echo "=== Running connectivity tests ==="
@@ -85,14 +71,13 @@ else
exit 1
fi
# Test 2: Ping VPN gateway
# Test 2: Ping VPN gateway (retries indefinitely, relies on job timeout)
echo "Test 2: Pinging VPN gateway ($VPN_GATEWAY)..."
if ping -c 10 "$VPN_GATEWAY"; then
while ! ping -c 3 -W 2 "$VPN_GATEWAY" >/dev/null 2>&1; do
echo "Ping failed, retrying..."
sleep 3
done
echo "PASS: Can ping VPN gateway"
else
echo "FAIL: Cannot ping VPN gateway"
exit 1
fi
# Test 3: DNS resolution through Unbound
echo "Test 3: Testing DNS resolution via Unbound ($VPN_GATEWAY)..."
@@ -133,19 +118,11 @@ REVOKE_CLIENT="revoketest"
# Wait for revoke test client config
echo "Waiting for revoke test client config..."
MAX_WAIT=120
WAITED=0
while [ ! -f /shared/revoke-client-config-ready ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/revoke-client-config-ready ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for revoke test config... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for revoke test config..."
done
if [ ! -f /shared/revoke-client-config-ready ]; then
echo "FAIL: Revoke test client config not ready in time"
exit 1
fi
if [ ! -f "/shared/$REVOKE_CLIENT.ovpn" ]; then
echo "FAIL: Revoke test client config file not found"
exit 1
@@ -164,51 +141,33 @@ openvpn --config "/shared/$REVOKE_CLIENT.ovpn" --daemon --log /var/log/openvpn-r
# Wait for connection
echo "Waiting for VPN connection with revoke test client..."
MAX_WAIT=60
WAITED=0
while ! ip addr show tun0 2>/dev/null | grep -q "inet " && [ $WAITED -lt $MAX_WAIT ]; do
while ! ip addr show tun0 2>/dev/null | grep -q "inet "; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for tun0... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for tun0..."
if [ -f /var/log/openvpn-revoke.log ]; then
tail -3 /var/log/openvpn-revoke.log
fi
done
if ! ip addr show tun0 2>/dev/null | grep -q "inet "; then
echo "FAIL: VPN connection with revoke test client failed"
cat /var/log/openvpn-revoke.log || true
exit 1
fi
echo "PASS: Connected with '$REVOKE_CLIENT' certificate"
ip addr show tun0
# Verify connectivity
if ping -c 2 "$VPN_GATEWAY" >/dev/null 2>&1; then
# Verify connectivity (retries indefinitely, relies on job timeout)
while ! ping -c 3 -W 2 "$VPN_GATEWAY" >/dev/null 2>&1; do
echo "Ping failed, retrying..."
sleep 3
done
echo "PASS: Can ping VPN gateway with revoke test client"
else
echo "FAIL: Cannot ping VPN gateway with revoke test client"
exit 1
fi
# Signal server that we're connected with revoke test client
touch /shared/revoke-client-connected
# Wait for server to signal us to disconnect
echo "Waiting for server to signal disconnect..."
MAX_WAIT=60
WAITED=0
while [ ! -f /shared/revoke-client-disconnect ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/revoke-client-disconnect ]; do
sleep 2
WAITED=$((WAITED + 2))
done
if [ ! -f /shared/revoke-client-disconnect ]; then
echo "FAIL: Server did not signal disconnect"
exit 1
fi
# Disconnect
echo "Disconnecting revoke test client..."
pkill openvpn || true
@@ -233,18 +192,10 @@ touch /shared/revoke-client-disconnected
# Wait for server to revoke the certificate and signal us to reconnect
echo "Waiting for server to revoke certificate and signal reconnect..."
MAX_WAIT=60
WAITED=0
while [ ! -f /shared/revoke-try-reconnect ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/revoke-try-reconnect ]; do
sleep 2
WAITED=$((WAITED + 2))
done
if [ ! -f /shared/revoke-try-reconnect ]; then
echo "FAIL: Server did not signal to try reconnect"
exit 1
fi
# Try to reconnect with the now-revoked certificate (should fail)
echo "Attempting to reconnect with revoked certificate (should fail)..."
rm -f /var/log/openvpn-revoke-fail.log
@@ -254,11 +205,8 @@ openvpn --config "/shared/$REVOKE_CLIENT.ovpn" --daemon --log /var/log/openvpn-r
# The connection should fail due to certificate being revoked
echo "Waiting to verify connection is rejected..."
CONNECT_FAILED=false
MAX_WAIT=30
WAITED=0
while [ $WAITED -lt $MAX_WAIT ]; do
while true; do
sleep 2
WAITED=$((WAITED + 2))
# Check if tun0 came up (would mean revocation didn't work)
if ip addr show tun0 2>/dev/null | grep -q "inet "; then
@@ -276,7 +224,7 @@ while [ $WAITED -lt $MAX_WAIT ]; do
fi
fi
echo "Checking connection status... ($WAITED/$MAX_WAIT seconds)"
echo "Checking connection status..."
if [ -f /var/log/openvpn-revoke-fail.log ]; then
tail -3 /var/log/openvpn-revoke-fail.log
fi
@@ -311,19 +259,11 @@ echo "=== Testing connection with recreated certificate ==="
# Wait for server to create new cert and signal us
echo "Waiting for new client config with same name..."
MAX_WAIT=120
WAITED=0
while [ ! -f /shared/new-client-config-ready ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/new-client-config-ready ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for new config... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for new config..."
done
if [ ! -f /shared/new-client-config-ready ]; then
echo "FAIL: New client config not ready in time"
exit 1
fi
if [ ! -f "/shared/$REVOKE_CLIENT-new.ovpn" ]; then
echo "FAIL: New client config file not found"
exit 1
@@ -338,33 +278,23 @@ openvpn --config "/shared/$REVOKE_CLIENT-new.ovpn" --daemon --log /var/log/openv
# Wait for connection
echo "Waiting for VPN connection with new certificate..."
MAX_WAIT=60
WAITED=0
while ! ip addr show tun0 2>/dev/null | grep -q "inet " && [ $WAITED -lt $MAX_WAIT ]; do
while ! ip addr show tun0 2>/dev/null | grep -q "inet "; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for tun0... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for tun0..."
if [ -f /var/log/openvpn-new.log ]; then
tail -3 /var/log/openvpn-new.log
fi
done
if ! ip addr show tun0 2>/dev/null | grep -q "inet "; then
echo "FAIL: VPN connection with new certificate failed"
cat /var/log/openvpn-new.log || true
exit 1
fi
echo "PASS: Connected with new '$REVOKE_CLIENT' certificate"
ip addr show tun0
# Verify connectivity
if ping -c 2 "$VPN_GATEWAY" >/dev/null 2>&1; then
# Verify connectivity (retries indefinitely, relies on job timeout)
while ! ping -c 3 -W 2 "$VPN_GATEWAY" >/dev/null 2>&1; do
echo "Ping failed, retrying..."
sleep 3
done
echo "PASS: Can ping VPN gateway with new certificate"
else
echo "FAIL: Cannot ping VPN gateway with new certificate"
exit 1
fi
# Signal server that we connected with new cert
touch /shared/new-client-connected
@@ -382,19 +312,11 @@ PASSPHRASE_CLIENT="passphrasetest"
# Wait for passphrase test client config
echo "Waiting for passphrase test client config..."
MAX_WAIT=120
WAITED=0
while [ ! -f /shared/passphrase-client-config-ready ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/passphrase-client-config-ready ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for passphrase test config... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for passphrase test config..."
done
if [ ! -f /shared/passphrase-client-config-ready ]; then
echo "FAIL: Passphrase test client config not ready in time"
exit 1
fi
if [ ! -f "/shared/$PASSPHRASE_CLIENT.ovpn" ]; then
echo "FAIL: Passphrase test client config file not found"
exit 1
@@ -418,33 +340,23 @@ openvpn --config "/shared/$PASSPHRASE_CLIENT.ovpn" --askpass "/shared/$PASSPHRAS
# Wait for connection
echo "Waiting for VPN connection with passphrase-protected client..."
MAX_WAIT=60
WAITED=0
while ! ip addr show tun0 2>/dev/null | grep -q "inet " && [ $WAITED -lt $MAX_WAIT ]; do
while ! ip addr show tun0 2>/dev/null | grep -q "inet "; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for tun0... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for tun0..."
if [ -f /var/log/openvpn-passphrase.log ]; then
tail -3 /var/log/openvpn-passphrase.log
fi
done
if ! ip addr show tun0 2>/dev/null | grep -q "inet "; then
echo "FAIL: VPN connection with passphrase-protected client failed"
cat /var/log/openvpn-passphrase.log || true
exit 1
fi
echo "PASS: Connected with passphrase-protected '$PASSPHRASE_CLIENT' certificate"
ip addr show tun0
# Verify connectivity
if ping -c 2 "$VPN_GATEWAY" >/dev/null 2>&1; then
# Verify connectivity (retries indefinitely, relies on job timeout)
while ! ping -c 3 -W 2 "$VPN_GATEWAY" >/dev/null 2>&1; do
echo "Ping failed, retrying..."
sleep 3
done
echo "PASS: Can ping VPN gateway with passphrase-protected client"
else
echo "FAIL: Cannot ping VPN gateway with passphrase-protected client"
exit 1
fi
# Signal server that we connected with passphrase client
touch /shared/passphrase-client-connected

View File

@@ -429,6 +429,35 @@ echo ""
echo "=== All Certificate Renewal Tests PASSED ==="
echo ""
# Wait for OpenVPN to be fully ready after server certificate renewal
# The renewal process restarts OpenVPN, so we need to verify it's back up
echo "Verifying OpenVPN is running after certificate renewal..."
for _ in $(seq 1 30); do
if pgrep -f "openvpn.*server.conf" >/dev/null; then
break
fi
sleep 1
done
if ! pgrep -f "openvpn.*server.conf" >/dev/null; then
echo "FAIL: OpenVPN not running after server certificate renewal"
systemctl status openvpn-server@server 2>&1 || true
exit 1
fi
# Wait for tun0 to be ready after restart
echo "Waiting for tun0 to be ready after certificate renewal..."
for i in $(seq 1 30); do
if ip addr show tun0 2>/dev/null | grep -q "inet $VPN_GATEWAY"; then
echo "OpenVPN tun0 interface ready after renewal"
break
fi
sleep 1
done
# Allow routing to stabilize after renewal restart
sleep 3
# =====================================================
# Verify Unbound DNS resolver (started by systemd via install script)
# =====================================================
@@ -611,23 +640,39 @@ if ! pgrep -f "openvpn.*server.conf" >/dev/null; then
exit 1
fi
# Wait for server tun interface to be ready with correct IP
# This prevents race conditions where OpenVPN is running but tun0 isn't configured
echo "Waiting for server tun0 interface to be ready..."
TUN_READY=false
for i in $(seq 1 30); do
if ip addr show tun0 2>/dev/null | grep -q "inet $VPN_GATEWAY"; then
echo "PASS: Server tun0 interface ready with $VPN_GATEWAY"
TUN_READY=true
break
fi
echo "Waiting for tun0... ($i/30)"
sleep 1
done
if [ "$TUN_READY" = false ]; then
echo "FAIL: Server tun0 interface not ready after 30 seconds"
ip addr show 2>&1 || true
exit 1
fi
# Allow routing tables to stabilize
echo "Allowing routing to stabilize..."
sleep 3
# =====================================================
# Wait for initial client tests to complete
# =====================================================
echo ""
echo "=== Waiting for initial client connectivity tests ==="
MAX_WAIT=120
WAITED=0
while [ ! -f /shared/initial-tests-passed ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/initial-tests-passed ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for initial tests... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for initial tests..."
done
if [ ! -f /shared/initial-tests-passed ]; then
echo "ERROR: Initial client tests did not complete in time"
exit 1
fi
echo "Initial client tests passed, proceeding with revocation tests"
# =====================================================
@@ -660,18 +705,10 @@ touch /shared/revoke-client-config-ready
# Wait for client to confirm connection with revoke test client
echo "Waiting for client to connect with '$REVOKE_CLIENT' certificate..."
MAX_WAIT=60
WAITED=0
while [ ! -f /shared/revoke-client-connected ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/revoke-client-connected ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for revoke test connection... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for revoke test connection..."
done
if [ ! -f /shared/revoke-client-connected ]; then
echo "ERROR: Client did not connect with revoke test certificate"
exit 1
fi
echo "PASS: Client connected with '$REVOKE_CLIENT' certificate"
# =====================================================
@@ -715,17 +752,9 @@ touch /shared/revoke-client-disconnect
# Wait for client to disconnect
echo "Waiting for client to disconnect..."
MAX_WAIT=30
WAITED=0
while [ ! -f /shared/revoke-client-disconnected ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/revoke-client-disconnected ]; do
sleep 2
WAITED=$((WAITED + 2))
done
if [ ! -f /shared/revoke-client-disconnected ]; then
echo "ERROR: Client did not signal disconnect"
exit 1
fi
echo "Client disconnected"
# Now revoke the certificate
@@ -755,18 +784,10 @@ touch /shared/revoke-try-reconnect
# Wait for client to confirm that connection with revoked cert failed
echo "Waiting for client to confirm revoked cert connection failure..."
MAX_WAIT=60
WAITED=0
while [ ! -f /shared/revoke-reconnect-failed ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/revoke-reconnect-failed ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for reconnect failure confirmation... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for reconnect failure confirmation..."
done
if [ ! -f /shared/revoke-reconnect-failed ]; then
echo "ERROR: Client did not confirm that revoked cert connection failed"
exit 1
fi
echo "PASS: Connection with revoked certificate correctly rejected"
echo "=== Certificate Revocation Tests PASSED ==="
@@ -904,18 +925,10 @@ touch /shared/new-client-config-ready
# Wait for client to confirm successful connection with new cert
echo "Waiting for client to connect with new '$REVOKE_CLIENT' certificate..."
MAX_WAIT=60
WAITED=0
while [ ! -f /shared/new-client-connected ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/new-client-connected ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for new cert connection... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for new cert connection..."
done
if [ ! -f /shared/new-client-connected ]; then
echo "ERROR: Client did not connect with new certificate"
exit 1
fi
echo "PASS: Client connected with new '$REVOKE_CLIENT' certificate"
echo "=== Reuse of Revoked Client Name Tests PASSED ==="
@@ -983,18 +996,10 @@ touch /shared/passphrase-client-config-ready
# Wait for client to confirm connection with passphrase client
echo "Waiting for client to connect with '$PASSPHRASE_CLIENT' certificate..."
MAX_WAIT=60
WAITED=0
while [ ! -f /shared/passphrase-client-connected ] && [ $WAITED -lt $MAX_WAIT ]; do
while [ ! -f /shared/passphrase-client-connected ]; do
sleep 2
WAITED=$((WAITED + 2))
echo "Waiting for passphrase client connection... ($WAITED/$MAX_WAIT seconds)"
echo "Waiting for passphrase client connection..."
done
if [ ! -f /shared/passphrase-client-connected ]; then
echo "FAIL: Client did not connect with passphrase-protected certificate"
exit 1
fi
echo "PASS: Client connected with passphrase-protected certificate"
echo "=== PASSPHRASE Support Tests PASSED ==="