MINOR: mux-h2: perform a graceful close at 75% glitches threshold

This avoids hitting the hard close for connections with non-compliant
peers that keep accumulating errors over long-lived connections: the
connection can now be recycled early enough to effectively reset the
glitch counter.
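
For reference, the graceful-close point is derived from the configured
threshold with the integer expression (thres * 3 + 1) / 4 that appears in
the patch below. The standalone sketch hereafter only illustrates that
computation; the helper name glitch_goaway_point is hypothetical and not
part of the patch:

  #include <stdio.h>

  /* illustrative helper mirroring the ~75% computation performed in
   * _h2c_report_glitch(); plain integer arithmetic.
   */
  static int glitch_goaway_point(int thres)
  {
  	return (thres * 3 + 1) / 4;
  }

  int main(void)
  {
  	/* the hard ENHANCE_YOUR_CALM close remains gated on the full
  	 * threshold and on the CPU usage condition.
  	 */
  	printf("GOAWAY advertised at %d glitches\n", glitch_goaway_point(1000));
  	return 0;
  }

With the threshold of 1000 used in the test below this yields 750, which
is consistent with the 750 successful requests reported by h2load.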

This was tested artificially by adding this to h2c_frt_handle_headers():

  h2c_report_glitch(h2c, 1, "new stream");

or this to h2_detach():

  h2c_report_glitch(h2c, 1, "detaching");

and injecting traffic with h2load -c 1 -n 1000 0:4445 on a config featuring
tune.h2.fe.glitches-threshold 1000:

  finished in 8.74ms, 85802.54 req/s, 686.62MB/s
  requests: 1000 total, 751 started, 751 done, 750 succeeded, 250 failed, 250 errored, 0 timeout
  status codes: 750 2xx, 0 3xx, 0 4xx, 0 5xx
  traffic: 6.00MB (6293303) total, 132.57KB (135750) headers (space savings 29.84%), 5.86MB (6144000) data
                       min         max         mean         sd        +/- sd
  time for request:        9us       178us        10us         6us    99.47%
  time for connect:      139us       139us       139us         0us   100.00%
  time to 1st byte:      339us       339us       339us         0us   100.00%
  req/s           :   87477.70    87477.70    87477.70        0.00   100.00%

The failures are due to h2load not supporting reconnection.
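
The exact configuration used for the test is not part of the commit; a
minimal sketch that should exercise the same path (the listener name, the
"proto h2" clear-text bind and the static response are assumptions; the
port and the tuning directive come from the description above) could look
like this:

  global
      tune.h2.fe.glitches-threshold 1000

  defaults
      mode http
      timeout client 10s
      timeout connect 5s
      timeout server 10s

  frontend h2-test
      # force the HTTP/2 mux on the clear listener so that h2load can
      # talk prior-knowledge HTTP/2 to it
      bind :4445 proto h2
      # return a small static response so that no backend is needed
      http-request return status 200 content-type "text/plain" string "hello"
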
Willy Tarreau 2025-12-20 18:53:08 +01:00
parent 52adeef7e1
commit 0901f60cef
2 changed files with 29 additions and 6 deletions

doc/configuration.txt

@@ -4219,7 +4219,10 @@ tune.h2.be.glitches-threshold <number>
   zero value here should probably be in the hundreds or thousands to be
   effective without affecting slightly bogus servers. It is also possible to
   only kill connections when the CPU usage crosses a certain level, by using
-  "tune.glitches.kill.cpu-usage".
+  "tune.glitches.kill.cpu-usage". Note that a graceful close is attempted at
+  75% of the configured threshold by advertising a GOAWAY for a future stream.
+  This ensures that a slightly faulty connection will stop being used after
+  some time, without risking the interruption of ongoing transfers.
   See also: tune.h2.fe.glitches-threshold, bc_glitches, and
             tune.glitches.kill.cpu-usage
 
@@ -4276,7 +4279,11 @@ tune.h2.fe.glitches-threshold <number>
   zero value here should probably be in the hundreds or thousands to be
   effective without affecting slightly bogus clients. It is also possible to
   only kill connections when the CPU usage crosses a certain level, by using
-  "tune.glitches.kill.cpu-usage".
+  "tune.glitches.kill.cpu-usage". Note that a graceful close is attempted at
+  75% of the configured threshold by advertising a GOAWAY for a future stream.
+  This ensures that a slightly non-compliant client will have the opportunity
+  to create a new connection and continue to work unaffected, without ever
+  triggering the hard close that could interrupt ongoing transfers.
   See also: tune.h2.be.glitches-threshold, fc_glitches, and
             tune.glitches.kill.cpu-usage
 

src/mux_h2.c

@@ -533,6 +533,7 @@ struct task *h2_timeout_task(struct task *t, void *context, unsigned int state);
 static int h2_send(struct h2c *h2c);
 static int h2_recv(struct h2c *h2c);
 static int h2_process(struct h2c *h2c);
+static int h2c_send_goaway_error(struct h2c *h2c, struct h2s *h2s);
 /* h2_io_cb is exported to see it resolved in "show fd" */
 struct task *h2_io_cb(struct task *t, void *ctx, unsigned int state);
 static inline struct h2s *h2c_st_by_id(struct h2c *h2c, int id);
@@ -1709,10 +1710,25 @@ static inline int _h2c_report_glitch(struct h2c *h2c, int increment)
 		h2_be_glitches_threshold : h2_fe_glitches_threshold;
 
 	h2c->glitches += increment;
-	if (thres && h2c->glitches >= thres &&
-	    (th_ctx->idle_pct <= global.tune.glitch_kill_maxidle)) {
-		h2c_error(h2c, H2_ERR_ENHANCE_YOUR_CALM);
-		return 1;
+	if (unlikely(thres && h2c->glitches >= (thres * 3 + 1) / 4)) {
+		/* at 75% of the threshold, we switch to close mode
+		 * to force clients to periodically reconnect.
+		 */
+		if (h2c->last_sid <= 0 ||
+		    h2c->last_sid > h2c->max_id + 2 * h2c_max_concurrent_streams(h2c)) {
+			/* not set yet or was too high */
+			h2c->last_sid = h2c->max_id + 2 * h2c_max_concurrent_streams(h2c);
+			h2c_send_goaway_error(h2c, NULL);
+		}
+
+		/* at 100% of the threshold and excess of CPU usage we also
+		 * actively kill the connection.
+		 */
+		if (h2c->glitches >= thres &&
+		    (th_ctx->idle_pct <= global.tune.glitch_kill_maxidle)) {
+			h2c_error(h2c, H2_ERR_ENHANCE_YOUR_CALM);
+			return 1;
+		}
 	}
 	return 0;
 }