fix: dev: fix EDE 22 time out detection

Extended DNS error 22 (No Reachable Authority) was previously raised when `fctx_expired` fired. It turns out this function is used as a "safety net" and the timeout should be detected earlier.

It was working, though, because of another issue that has since been fixed by !9927. With that fix in place, however, the recursive request timeout is detected before `fctx_expired` fires, making it impossible to raise the EDE 22 error.

This fixes the problem by triggering EDE 22 in the part of the code that detects the (TCP or UDP) timeout and takes the decision to cancel the whole fetch (i.e. there is no other server to attempt to contact).

Note that this is not targeting users (no release note) because there are no released versions of BIND between !9927 and this change. Thus a release note would be confusing.

Closes #5137

Merge branch '5137-ede22' into 'main'

See merge request isc-projects/bind9!9985
This commit is contained in:
Colin Vidal 2025-01-27 11:48:49 +00:00
commit dc3c3efdbf
5 changed files with 94 additions and 28 deletions

View file

@ -110,9 +110,14 @@ for (;;) {
} elsif ($qname eq "net" && $qtype eq "NS") {
$packet->header->aa(1);
$packet->push("answer", new Net::DNS::RR("net 300 NS a.root-servers.nil."));
} elsif ($qname eq "noresponse.exampleudp.net") {
next;
} elsif ($qname =~ /example\.net/) {
$packet->push("authority", new Net::DNS::RR("example.net 300 NS ns.example.net"));
$packet->push("additional", new Net::DNS::RR("ns.example.net 300 A 10.53.0.3"));
} elsif ($qname =~ /exampleudp\.net/) {
$packet->push("authority", new Net::DNS::RR("exampleudp.net 300 NS ns.exampleudp.net"));
$packet->push("additional", new Net::DNS::RR("ns.exampleudp.net 300 A 10.53.0.2"));
} elsif ($qname =~ /lame\.example\.org/) {
$packet->header->ad(0);
$packet->header->aa(0);

View file

@ -50,6 +50,7 @@ echo_i "checking no response handling with a shorter than resolver-query-timeout
ret=0
dig_with_opts +tcp +tries=1 +timeout=3 noresponse.example.net @10.53.0.1 a >dig.out.ns1.test${n} && ret=1
grep -F "no servers could be reached" dig.out.ns1.test${n} >/dev/null || ret=1
grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null && ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -61,6 +62,20 @@ echo_i "checking no response handling with a longer than resolver-query-timeout
ret=0
dig_with_opts +tcp +tries=1 +timeout=7 noresponse.example.net @10.53.0.1 a >dig.out.ns1.test${n} || ret=1
grep -F "status: SERVFAIL" dig.out.ns1.test${n} >/dev/null || ret=1
grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
# 'resolver-query-timeout' is set to 5 seconds in ns1, so named should
# interrupt the non-responsive query and send a SERVFAIL answer before dig's
# own timeout fires, which is set to 7 seconds. This time, exampleudp.net is
# contacted using UDP transport by the resolver.
n=$((n + 1))
echo_i "checking no response handling with a longer than resolver-query-timeout timeout (UDP recursion) ($n)"
ret=0
dig_with_opts +tcp +tries=1 +timeout=7 noresponse.exampleudp.net @10.53.0.1 a >dig.out.ns1.test${n} || ret=1
grep -F "status: SERVFAIL" dig.out.ns1.test${n} >/dev/null || ret=1
grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))

View file

@ -340,11 +340,15 @@ $DIG -p ${PORT} @10.53.0.1 nxdomain.example TXT >dig.out.test$((n + 4)) &
wait
# no stale answers are used and the authoritative queries timed out. So EDE 3
# is not sent but EDE 22 is sent.
n=$((n + 1))
echo_i "check stale data.example TXT (serve-stale off) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -352,7 +356,8 @@ n=$((n + 1))
echo_i "check stale othertype.example CAA (serve-stale off) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -360,7 +365,8 @@ n=$((n + 1))
echo_i "check stale nodata.example TXT (serve-stale off) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -368,7 +374,8 @@ n=$((n + 1))
echo_i "check stale nxdomain.example TXT (serve-stale off) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -739,11 +746,15 @@ $DIG -p ${PORT} @10.53.0.1 nxdomain.example TXT >dig.out.test$((n + 4)) &
wait
# stale-answer is enabled, but with a very low TTL so the following answers have
# been removed from the stale cache. Hence, no EDE 3 anymore, but EDE 22.
n=$((n + 1))
echo_i "check ancient data.example TXT (low max-stale-ttl) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -752,7 +763,8 @@ n=$((n + 1))
echo_i "check ancient othertype.example CAA (low max-stale-ttl) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -761,7 +773,8 @@ n=$((n + 1))
echo_i "check ancient nodata.example TXT (low max-stale-ttl) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -770,7 +783,8 @@ n=$((n + 1))
echo_i "check ancient nxdomain.example TXT (low max-stale-ttl) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1093,11 +1107,15 @@ $DIG -p ${PORT} @10.53.0.3 nxdomain.example TXT >dig.out.test$((n + 4)) &
wait
# no stale answers are used and the authoritative queries timed out. So EDE 3
# is not sent but EDE 22 is sent.
n=$((n + 1))
echo_i "check fail of data.example TXT (max-stale-ttl default) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1106,7 +1124,8 @@ n=$((n + 1))
echo_i "check fail of othertype.example CAA (max-stale-ttl default) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1115,7 +1134,8 @@ n=$((n + 1))
echo_i "check fail of nodata.example TXT (max-stale-ttl default) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1124,7 +1144,8 @@ n=$((n + 1))
echo_i "check fail of nxdomain.example TXT (max-stale-ttl default) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1225,11 +1246,13 @@ status=$((status + ret))
# The notfound.example check is different than nxdomain.example because
# we didn't send a prime query to add notfound.example to the cache.
# Independently, EDE 22 is sent as the authoritative server doesn't respond.
n=$((n + 1))
echo_i "check notfound.example TXT (max-stale-ttl default) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1341,11 +1364,15 @@ $DIG -p ${PORT} @10.53.0.4 nxdomain.example TXT >dig.out.test$((n + 4)) &
wait
# no stale answers are used and the authoritative queries timed out. So EDE 3
# is not sent but EDE 22 is sent.
n=$((n + 1))
echo_i "check fail of data.example TXT (serve-stale answers disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1354,7 +1381,8 @@ n=$((n + 1))
echo_i "check fail of othertype.example TXT (serve-stale answers disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1363,7 +1391,8 @@ n=$((n + 1))
echo_i "check fail of nodata.example TXT (serve-stale answers disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1372,7 +1401,8 @@ n=$((n + 1))
echo_i "check fail of nxdomain.example TXT (serve-stale answers disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1549,11 +1579,15 @@ $DIG -p ${PORT} @10.53.0.5 nxdomain.example TXT >dig.out.test$((n + 4)) &
wait
# no stale answers are used and the authoritative queries timed out. So EDE 3
# is not sent but EDE 22 is sent.
n=$((n + 1))
echo_i "check fail of data.example TXT (serve-stale cache disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1562,7 +1596,8 @@ n=$((n + 1))
echo_i "check fail of othertype.example CAA (serve-stale cache disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1571,7 +1606,8 @@ n=$((n + 1))
echo_i "check fail of nodata.example TXT (serve-stale cache disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))
@ -1580,7 +1616,8 @@ n=$((n + 1))
echo_i "check fail of nxdomain.example TXT (serve-stale cache disabled) ($n)"
ret=0
grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1
grep "EDE" dig.out.test$n >/dev/null && ret=1
grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1
grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1
grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status + ret))

View file

@ -1987,6 +1987,10 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo,
fctx_setretryinterval(fctx, srtt);
if (isc_interval_iszero(&fctx->interval)) {
FCTXTRACE("fetch expired");
LOCK(&fctx->lock);
dns_ede_append(fctx->mctx, &fctx->edelist,
DNS_EDE_NOREACHABLEAUTH, NULL);
UNLOCK(&fctx->lock);
return ISC_R_TIMEDOUT;
}
@ -7948,6 +7952,10 @@ rctx_timedout(respctx_t *rctx) {
if (isc_time_microdiff(&fctx->expires, &now) < US_PER_MS) {
FCTXTRACE("query timed out; stopped trying to make "
"fetch happen");
LOCK(&fctx->lock);
dns_ede_append(fctx->mctx, &fctx->edelist,
DNS_EDE_NOREACHABLEAUTH, NULL);
UNLOCK(&fctx->lock);
} else {
FCTXTRACE("query timed out; trying next server");
/* try next server */

View file

@ -234,7 +234,8 @@ client_extendederror_reset(ns_client_t *client) {
void
ns_client_extendederror(ns_client_t *client, uint16_t code, const char *text) {
const uint16_t codelen = sizeof(code);
uint16_t becode;
const uint16_t becodelen = sizeof(becode);
uint16_t textlen = 0;
size_t pos = 0;
unsigned char *ede = NULL;
@ -246,7 +247,7 @@ ns_client_extendederror(ns_client_t *client, uint16_t code, const char *text) {
* As ede will be directly put in the DNS message we need to make sure
* the code is in big-endian format
*/
code = htobe16(code);
becode = htobe16(code);
for (pos = 0; pos < DNS_EDE_MAX_ERRORS; pos++) {
edns = client->ede[pos];
@ -255,7 +256,7 @@ ns_client_extendederror(ns_client_t *client, uint16_t code, const char *text) {
break;
}
if (memcmp(&code, edns->value, sizeof(code)) == 0) {
if (memcmp(&becode, edns->value, becodelen) == 0) {
ns_client_log(client, NS_LOGCATEGORY_CLIENT,
NS_LOGMODULE_CLIENT, ISC_LOG_DEBUG(1),
"ignoring duplicate ede %u %s", code,
@ -288,16 +289,16 @@ ns_client_extendederror(ns_client_t *client, uint16_t code, const char *text) {
}
}
ede = isc_mem_get(client->manager->mctx, codelen + textlen);
ede = isc_mem_get(client->manager->mctx, becodelen + textlen);
memcpy(ede, &code, sizeof(code));
memcpy(ede, &becode, sizeof(code));
if (textlen > 0) {
memcpy(ede + codelen, text, textlen);
memcpy(ede + becodelen, text, textlen);
}
edns = isc_mem_get(client->manager->mctx, sizeof(*edns));
*edns = (dns_ednsopt_t){ .code = DNS_OPT_EDE,
.length = codelen + textlen,
.length = becodelen + textlen,
.value = ede };
client->ede[pos] = edns;