From 8b3d2c19d3691f29d4e86c73bc11491ae3fbfaec Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Sun, 23 Feb 2025 16:20:12 +0000 Subject: [PATCH] inpcb: Fix reuseport lbgroup array resizing in_pcblisten() moves an inpcb from the per-group list into the array, at which point it becomes visible to inpcb lookups in the datapath. It assumes that there is space in the array for this, but that's not guaranteed, since in_pcbinslbgrouphash() doesn't reserve space in the array if the inpcb isn't associated with a listening socket. We could resize the array in in_pcblisten(), but that would introduce a failure case where there currently is none. Instead, keep track of the number of pending inpcbs as well, and modify in_pcbinslbgrouphash() to reserve space for each pending (i.e., not-yet-listening) inpcb. Add a regression test. Reviewed by: glebius Reported by: netchild Fixes: 7cbb6b6e28db ("inpcb: Close some SO_REUSEPORT_LB races, part 2") Differential Revision: https://reviews.freebsd.org/D49100 --- sys/netinet/in_pcb.c | 7 +++- sys/netinet/in_pcb_var.h | 1 + tests/sys/netinet/so_reuseport_lb_test.c | 46 ++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 9d174dce902..1d9cc1866e1 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -339,6 +339,7 @@ in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) * lookups until listen() has been called. 
*/ LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list); + grp->il_pendcnt++; } else { grp->il_inp[grp->il_inpcnt] = inp; @@ -375,6 +376,8 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, CK_LIST_INSERT_HEAD(hdr, grp, il_list); LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb, inp_lbgroup_list); + grp->il_pendcnt = old_grp->il_pendcnt; + old_grp->il_pendcnt = 0; in_pcblbgroup_free(old_grp); return (grp); } @@ -435,7 +438,7 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) return (ENOBUFS); in_pcblbgroup_insert(grp, inp); CK_LIST_INSERT_HEAD(hdr, grp, il_list); - } else if (grp->il_inpcnt == grp->il_inpsiz) { + } else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", @@ -499,6 +502,7 @@ in_pcbremlbgrouphash(struct inpcb *inp) LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { if (inp == inp1) { LIST_REMOVE(inp, inp_lbgroup_list); + grp->il_pendcnt--; inp->inp_flags &= ~INP_INLBGROUP; return; } @@ -1503,6 +1507,7 @@ in_pcblisten(struct inpcb *inp) INP_HASH_WLOCK(pcbinfo); grp = in_pcblbgroup_find(inp); LIST_REMOVE(inp, inp_lbgroup_list); + grp->il_pendcnt--; in_pcblbgroup_insert(grp, inp); INP_HASH_WUNLOCK(pcbinfo); } diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h index e2b0ca386e7..32fdbced175 100644 --- a/sys/netinet/in_pcb_var.h +++ b/sys/netinet/in_pcb_var.h @@ -82,6 +82,7 @@ struct inpcblbgroup { #define il6_laddr il_dependladdr.id6_addr uint32_t il_inpsiz; /* max count in il_inp[] (h) */ uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ + uint32_t il_pendcnt; /* cur count in il_pending (h) */ struct inpcb *il_inp[]; /* (h) */ }; diff --git a/tests/sys/netinet/so_reuseport_lb_test.c b/tests/sys/netinet/so_reuseport_lb_test.c index 09d8e0ce8f8..aaadaead5e2 100644 --- a/tests/sys/netinet/so_reuseport_lb_test.c +++ b/tests/sys/netinet/so_reuseport_lb_test.c @@ -433,6 +433,51 
@@ ATF_TC_BODY(double_listen_ipv6, tc) ATF_REQUIRE_MSG(error == 0, "close() failed: %s", strerror(errno)); } +/* + * Try binding many sockets to the same lbgroup without calling listen(2) on + * them, then call listen(2) on each of them afterwards. + */ +ATF_TC_WITHOUT_HEAD(bind_without_listen); +ATF_TC_BODY(bind_without_listen, tc) +{ + const int nsockets = 100; /* sockets bound before any listen(2) */ + struct sockaddr_in sin; + socklen_t socklen; + int error, s, s2[nsockets]; + + s = lb_listen_socket(PF_INET, 0); /* never passed to listen(2) here */ + + memset(&sin, 0, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_port = htons(0); /* bound port is read back below */ + sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + error = bind(s, (struct sockaddr *)&sin, sizeof(sin)); + ATF_REQUIRE_MSG(error == 0, "bind() failed: %s", strerror(errno)); + + socklen = sizeof(sin); /* in/out length for getsockname() */ + error = getsockname(s, (struct sockaddr *)&sin, &socklen); + ATF_REQUIRE_MSG(error == 0, "getsockname() failed: %s", + strerror(errno)); + + for (int i = 0; i < nsockets; i++) { /* bind only, no listen(2) */ + s2[i] = lb_listen_socket(PF_INET, 0); + error = bind(s2[i], (struct sockaddr *)&sin, sizeof(sin)); + ATF_REQUIRE_MSG(error == 0, "bind() failed: %s", strerror(errno)); + } + for (int i = 0; i < nsockets; i++) { /* now listen on each one */ + error = listen(s2[i], 1); + ATF_REQUIRE_MSG(error == 0, "listen() failed: %s", strerror(errno)); + } + for (int i = 0; i < nsockets; i++) { /* release the extras */ + error = close(s2[i]); + ATF_REQUIRE_MSG(error == 0, "close() failed: %s", strerror(errno)); + } + + error = close(s); + ATF_REQUIRE_MSG(error == 0, "close() failed: %s", strerror(errno)); +} + ATF_TP_ADD_TCS(tp) { ATF_TP_ADD_TC(tp, basic_ipv4); @@ -440,6 +485,7 @@ ATF_TP_ADD_TCS(tp) ATF_TP_ADD_TC(tp, concurrent_add); ATF_TP_ADD_TC(tp, double_listen_ipv4); ATF_TP_ADD_TC(tp, double_listen_ipv6); + ATF_TP_ADD_TC(tp, bind_without_listen); return (atf_no_error()); }