2019-06-17 10:59:45 -04:00
|
|
|
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2019 Conrad Meyer <cem@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
|
|
|
|
|
|
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
|
#include <sys/random.h>
|
|
|
|
|
|
|
|
|
|
#include <errno.h>
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
|
|
|
|
|
|
#include <crypto/chacha20/chacha.h>
|
|
|
|
|
#include <crypto/rijndael/rijndael-api-fst.h>
|
|
|
|
|
#include <crypto/sha2/sha256.h>
|
|
|
|
|
|
|
|
|
|
#include <dev/random/hash.h>
|
|
|
|
|
#include <dev/random/uint128.h>
|
|
|
|
|
|
|
|
|
|
#include <atf-c.h>
|
|
|
|
|
|
|
|
|
|
/*
 * Serialize four 32-bit words into a 16-byte buffer.  Word N is written with
 * le32enc() at byte offset 4 * N, so src[0] occupies the first four bytes of
 * dst and src[3] the last four.
 */
static void
vec_u32_tole128(uint8_t dst[static 16], const uint32_t src[static 4])
{
	size_t word;

	for (word = 0; word < 4; word++)
		le32enc(&dst[word * sizeof(uint32_t)], src[word]);
}
|
|
|
|
|
|
|
|
|
|
/*
 * Inverse of vec_u32_tole128(): split a 16-byte buffer into four 32-bit
 * words.  Word N is read with le32dec() from byte offset 4 * N.
 */
static void
le128_to_vec_u32(uint32_t dst[static 4], const uint8_t src[static 16])
{
	size_t word;

	for (word = 0; word < 4; word++)
		dst[word] = le32dec(&src[word * sizeof(uint32_t)]);
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
formatu128(char buf[static 52], uint128_t x)
|
|
|
|
|
{
|
|
|
|
|
uint8_t le128x[16];
|
|
|
|
|
uint32_t vx[4];
|
|
|
|
|
size_t sz, i;
|
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
|
|
le128enc(le128x, x);
|
|
|
|
|
le128_to_vec_u32(vx, le128x);
|
|
|
|
|
|
|
|
|
|
sz = 52;
|
|
|
|
|
for (i = 0; i < 4; i++) {
|
|
|
|
|
rc = snprintf(buf, sz, "0x%x ", vx[i]);
|
|
|
|
|
ATF_REQUIRE(rc > 0 && (size_t)rc < sz);
|
|
|
|
|
|
|
|
|
|
buf += rc;
|
|
|
|
|
sz -= rc;
|
|
|
|
|
}
|
|
|
|
|
/* Delete last trailing space */
|
|
|
|
|
buf[-1] = '\0';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
u128_check_equality(uint128_t a, uint128_t b, const char *descr)
|
|
|
|
|
{
|
|
|
|
|
char fmtbufa[52], fmtbufb[52];
|
|
|
|
|
|
|
|
|
|
formatu128(fmtbufa, a);
|
|
|
|
|
formatu128(fmtbufb, b);
|
|
|
|
|
|
|
|
|
|
ATF_CHECK_MSG(uint128_equals(a, b),
|
|
|
|
|
"Expected: [%s] != Actual: [%s]: %s", fmtbufa, fmtbufb, descr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ATF_TC_WITHOUT_HEAD(uint128_inc);
|
|
|
|
|
ATF_TC_BODY(uint128_inc, tc)
|
|
|
|
|
{
|
|
|
|
|
static const struct u128_inc_tc {
|
|
|
|
|
uint32_t input[4];
|
|
|
|
|
uint32_t expected[4];
|
|
|
|
|
const char *descr;
|
|
|
|
|
} tests[] = {
|
|
|
|
|
{
|
|
|
|
|
.input = { 0, 0, 0, 0 },
|
|
|
|
|
.expected = { 1, 0, 0, 0 },
|
|
|
|
|
.descr = "0 -> 1",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 1, 0, 0, 0 },
|
|
|
|
|
.expected = { 2, 0, 0, 0 },
|
|
|
|
|
.descr = "0 -> 2",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 0xff, 0, 0, 0 },
|
|
|
|
|
.expected = { 0x100, 0, 0, 0 },
|
|
|
|
|
.descr = "0xff -> 0x100 (byte carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { UINT32_MAX, 0, 0, 0 },
|
|
|
|
|
.expected = { 0, 1, 0, 0 },
|
|
|
|
|
.descr = "2^32 - 1 -> 2^32 (word carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { UINT32_MAX, UINT32_MAX, 0, 0 },
|
|
|
|
|
.expected = { 0, 0, 1, 0 },
|
|
|
|
|
.descr = "2^64 - 1 -> 2^64 (u128t_word0 carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { UINT32_MAX, UINT32_MAX, UINT32_MAX, 0 },
|
|
|
|
|
.expected = { 0, 0, 0, 1 },
|
|
|
|
|
.descr = "2^96 - 1 -> 2^96 (word carry)",
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
uint8_t inputle[16], expectedle[16];
|
|
|
|
|
uint128_t a;
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < nitems(tests); i++) {
|
|
|
|
|
vec_u32_tole128(inputle, tests[i].input);
|
|
|
|
|
vec_u32_tole128(expectedle, tests[i].expected);
|
|
|
|
|
|
|
|
|
|
a = le128dec(inputle);
|
|
|
|
|
uint128_increment(&a);
|
|
|
|
|
u128_check_equality(le128dec(expectedle), a, tests[i].descr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
random(4): Fortuna: allow increased concurrency
Add experimental feature to increase concurrency in Fortuna. As this
diverges slightly from canonical Fortuna, and due to the security
sensitivity of random(4), it is off by default. To enable it, set the
tunable kern.random.fortuna.concurrent_read="1". The rest of this commit
message describes the behavior when enabled.
Readers continue to update shared Fortuna state under global mutex, as they
do in the status quo implementation of the algorithm, but shift the actual
PRF generation out from under the global lock. This massively reduces the
CPU time readers spend holding the global lock, allowing for increased
concurrency on SMP systems and less bullying of the harvestq kthread.
It is somewhat of a deviation from FS&K. I think the primary difference is
that the specific sequence of AES keys will differ if READ_RANDOM_UIO is
accessed concurrently (as the 2nd thread to take the mutex will no longer
receive a key derived from rekeying the first thread). However, I believe
the goals of rekeying AES are maintained: trivially, we continue to rekey
every 1MB for the statistical property; and each consumer gets a
forward-secret, independent AES key for their PRF.
Since Chacha doesn't need to rekey for sequences of any length, this change
makes no difference to the sequence of Chacha keys and PRF generated when
Chacha is used in place of AES.
On a GENERIC 4-thread VM (so, INVARIANTS/WITNESS, numbers not necessarily
representative), 3x concurrent AES performance jumped from ~55 MiB/s per
thread to ~197 MB/s per thread. Concurrent Chacha20 at 3 threads went from
roughly ~113 MB/s per thread to ~430 MB/s per thread.
Prior to this change, the system was extremely unresponsive with 3-4
concurrent random readers; each thread had high variance in latency and
throughput, depending on who got lucky and won the lock. "rand_harvestq"
thread CPU use was high (double digits), seemingly due to spinning on the
global lock.
After the change, concurrent random readers and the system in general are
much more responsive, and rand_harvestq CPU use dropped to basically zero.
Tests are added to the devrandom suite to ensure the uint128_add64 primitive
utilized by unlocked read functions conforms to specification.
Reviewed by: markm
Approved by: secteam(delphij)
Relnotes: yes
Differential Revision: https://reviews.freebsd.org/D20313
2019-06-17 16:29:13 -04:00
|
|
|
ATF_TC_WITHOUT_HEAD(uint128_add64);
|
|
|
|
|
ATF_TC_BODY(uint128_add64, tc)
|
|
|
|
|
{
|
|
|
|
|
static const struct u128_add64_tc {
|
|
|
|
|
uint32_t input[4];
|
|
|
|
|
uint64_t addend;
|
|
|
|
|
uint32_t expected[4];
|
|
|
|
|
const char *descr;
|
|
|
|
|
} tests[] = {
|
|
|
|
|
{
|
|
|
|
|
.input = { 0, 0, 0, 0 },
|
|
|
|
|
.addend = 1,
|
|
|
|
|
.expected = { 1, 0, 0, 0 },
|
|
|
|
|
.descr = "0 + 1 -> 1",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 1, 0, 0, 0 },
|
|
|
|
|
.addend = UINT32_MAX,
|
|
|
|
|
.expected = { 0, 1, 0, 0 },
|
|
|
|
|
.descr = "1 + (2^32 - 1) -> 2^32 (word carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 1, 0, 0, 0 },
|
|
|
|
|
.addend = UINT64_MAX,
|
|
|
|
|
.expected = { 0, 0, 1, 0 },
|
|
|
|
|
.descr = "1 + (2^64 - 1) -> 2^64 (u128t_word0 carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 0x11111111, 0x11111111, 0, 0 },
|
|
|
|
|
.addend = 0xf0123456789abcdeULL,
|
|
|
|
|
.expected = { 0x89abcdef, 0x01234567, 1, 0 },
|
|
|
|
|
.descr = "0x1111_1111_1111_1111 +"
|
|
|
|
|
"0xf012_3456_789a_bcde ->"
|
|
|
|
|
"0x1_0123_4567_89ab_cdef",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 1, 0, UINT32_MAX, 0 },
|
|
|
|
|
.addend = UINT64_MAX,
|
|
|
|
|
.expected = { 0, 0, 0, 1 },
|
|
|
|
|
.descr = "Carry ~2^96",
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
uint8_t inputle[16], expectedle[16];
|
|
|
|
|
uint128_t a;
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < nitems(tests); i++) {
|
|
|
|
|
vec_u32_tole128(inputle, tests[i].input);
|
|
|
|
|
vec_u32_tole128(expectedle, tests[i].expected);
|
|
|
|
|
|
|
|
|
|
a = le128dec(inputle);
|
|
|
|
|
uint128_add64(&a, tests[i].addend);
|
|
|
|
|
u128_check_equality(le128dec(expectedle), a, tests[i].descr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-06-17 10:59:45 -04:00
|
|
|
/*
|
|
|
|
|
* Test assumptions about Chacha incrementing counter in the same way as
|
|
|
|
|
* uint128.h
|
|
|
|
|
*/
|
|
|
|
|
ATF_TC_WITHOUT_HEAD(uint128_chacha_ctr);
|
|
|
|
|
ATF_TC_BODY(uint128_chacha_ctr, tc)
|
|
|
|
|
{
|
|
|
|
|
static const struct u128_chacha_tc {
|
|
|
|
|
uint32_t input[4];
|
|
|
|
|
uint32_t expected[4];
|
|
|
|
|
const char *descr;
|
|
|
|
|
} tests[] = {
|
|
|
|
|
{
|
|
|
|
|
.input = { 0, 0, 0, 0 },
|
|
|
|
|
.expected = { 1, 0, 0, 0 },
|
|
|
|
|
.descr = "Single block",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 1, 0, 0, 0 },
|
|
|
|
|
.expected = { 2, 0, 0, 0 },
|
|
|
|
|
.descr = "0 -> 2",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { 0xff, 0, 0, 0 },
|
|
|
|
|
.expected = { 0x100, 0, 0, 0 },
|
|
|
|
|
.descr = "0xff -> 0x100 (byte carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { UINT32_MAX, 0, 0, 0 },
|
|
|
|
|
.expected = { 0, 1, 0, 0 },
|
|
|
|
|
.descr = "2^32 - 1 -> 2^32 (word carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { UINT32_MAX, UINT32_MAX, 0, 0 },
|
|
|
|
|
.expected = { 0, 0, 1, 0 },
|
|
|
|
|
.descr = "2^64 - 1 -> 2^64 (u128t_word0 carry)",
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
.input = { UINT32_MAX, UINT32_MAX, UINT32_MAX, 0 },
|
|
|
|
|
.expected = { 0, 0, 0, 1 },
|
|
|
|
|
.descr = "2^96 - 1 -> 2^96 (word carry)",
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
union randomdev_key context;
|
|
|
|
|
uint8_t inputle[16], expectedle[16], trash[CHACHA_BLOCKLEN];
|
|
|
|
|
uint8_t notrandomkey[RANDOM_KEYSIZE] = { 0 };
|
|
|
|
|
uint128_t a;
|
|
|
|
|
size_t i;
|
|
|
|
|
|
|
|
|
|
random_chachamode = true;
|
|
|
|
|
randomdev_encrypt_init(&context, notrandomkey);
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < nitems(tests); i++) {
|
|
|
|
|
vec_u32_tole128(inputle, tests[i].input);
|
|
|
|
|
vec_u32_tole128(expectedle, tests[i].expected);
|
|
|
|
|
|
|
|
|
|
a = le128dec(inputle);
|
random(4): Generalize algorithm-independent APIs
At a basic level, remove assumptions about the underlying algorithm (such as
output block size and reseeding requirements) from the algorithm-independent
logic in randomdev.c. Chacha20 does not have many of the restrictions that
AES-ICM does as a PRF (Pseudo-Random Function), because it has a cipher
block size of 512 bits. The motivation is that by generalizing the API,
Chacha is not penalized by the limitations of AES.
In READ_RANDOM_UIO, first attempt to NOWAIT allocate a large enough buffer
for the entire user request, or the maximal input we'll accept between
signal checking, whichever is smaller. The idea is that the implementation
of any randomdev algorithm is then free to divide up large requests in
whatever fashion it sees fit.
As part of this, two responsibilities from the "algorithm-generic" randomdev
code are pushed down into the Fortuna ra_read implementation (and any other
future or out-of-tree ra_read implementations):
1. If an algorithm needs to rekey every N bytes, it is responsible for
handling that in ra_read(). (I.e., Fortuna's 1MB rekey interval for AES
block generation.)
2. If an algorithm uses a block cipher that doesn't tolerate partial-block
requests (again, e.g., AES), it is also responsible for handling that in
ra_read().
Several APIs are changed from u_int buffer length to the more canonical
size_t. Several APIs are changed from taking a blockcount to a bytecount,
to permit PRFs like Chacha20 to directly generate quantities of output that
are not multiples of RANDOM_BLOCKSIZE (AES block size).
The Fortuna algorithm is changed to NOT rekey every 1MiB when in Chacha20
mode (kern.random.use_chacha20_cipher="1"). This is explicitly supported by
the math in FS&K §9.4 (Ferguson, Schneier, and Kohno; "Cryptography
Engineering"), as well as by their conclusion: "If we had a block cipher
with a 256-bit [or greater] block size, then the collisions would not
have been an issue at all."
For now, continue to break up reads into PAGE_SIZE chunks, as they were
before. So, no functional change, mostly.
Reviewed by: markm
Approved by: secteam(delphij)
Differential Revision: https://reviews.freebsd.org/D20312
2019-06-17 11:09:12 -04:00
|
|
|
randomdev_keystream(&context, &a, trash, sizeof(trash));
|
2019-06-17 10:59:45 -04:00
|
|
|
u128_check_equality(le128dec(expectedle), a, tests[i].descr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ATF_TP_ADD_TCS(tp)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
ATF_TP_ADD_TC(tp, uint128_inc);
|
random(4): Fortuna: allow increased concurrency
Add experimental feature to increase concurrency in Fortuna. As this
diverges slightly from canonical Fortuna, and due to the security
sensitivity of random(4), it is off by default. To enable it, set the
tunable kern.random.fortuna.concurrent_read="1". The rest of this commit
message describes the behavior when enabled.
Readers continue to update shared Fortuna state under global mutex, as they
do in the status quo implementation of the algorithm, but shift the actual
PRF generation out from under the global lock. This massively reduces the
CPU time readers spend holding the global lock, allowing for increased
concurrency on SMP systems and less bullying of the harvestq kthread.
It is somewhat of a deviation from FS&K. I think the primary difference is
that the specific sequence of AES keys will differ if READ_RANDOM_UIO is
accessed concurrently (as the 2nd thread to take the mutex will no longer
receive a key derived from rekeying the first thread). However, I believe
the goals of rekeying AES are maintained: trivially, we continue to rekey
every 1MB for the statistical property; and each consumer gets a
forward-secret, independent AES key for their PRF.
Since Chacha doesn't need to rekey for sequences of any length, this change
makes no difference to the sequence of Chacha keys and PRF generated when
Chacha is used in place of AES.
On a GENERIC 4-thread VM (so, INVARIANTS/WITNESS, numbers not necessarily
representative), 3x concurrent AES performance jumped from ~55 MiB/s per
thread to ~197 MB/s per thread. Concurrent Chacha20 at 3 threads went from
roughly ~113 MB/s per thread to ~430 MB/s per thread.
Prior to this change, the system was extremely unresponsive with 3-4
concurrent random readers; each thread had high variance in latency and
throughput, depending on who got lucky and won the lock. "rand_harvestq"
thread CPU use was high (double digits), seemingly due to spinning on the
global lock.
After the change, concurrent random readers and the system in general are
much more responsive, and rand_harvestq CPU use dropped to basically zero.
Tests are added to the devrandom suite to ensure the uint128_add64 primitive
utilized by unlocked read functions to specification.
Reviewed by: markm
Approved by: secteam(delphij)
Relnotes: yes
Differential Revision: https://reviews.freebsd.org/D20313
2019-06-17 16:29:13 -04:00
|
|
|
ATF_TP_ADD_TC(tp, uint128_add64);
|
2019-06-17 10:59:45 -04:00
|
|
|
ATF_TP_ADD_TC(tp, uint128_chacha_ctr);
|
|
|
|
|
return (atf_no_error());
|
|
|
|
|
}
|