opnsense-src/sys/dev/nvmf/host/nvmf.c
John Baldwin 365b89e8ea nvmf: Switch several ioctls to using nvlists
For requests that handoff queues from userspace to the kernel as well
as the request to fetch reconnect parameters from the kernel, switch
from using flat structures to nvlists.  In particular, this will
permit adding support for additional transports in the future without
breaking the ABI of the structures.

Note that this is an ABI break for the ioctls used by nvmf(4) and
nvmft(4).  Since this is only present in main I did not bother
implementing compatibility shims.

Inspired by:	imp (suggestion on a different review)
Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D48230
2024-12-30 13:52:21 -05:00

1132 lines
26 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/dnv.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/nv.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>
/* Forward declaration of the character device switch; initialized further down. */
static struct cdevsw nvmf_cdevsw;
/*
 * Boolean knob registered as "fail_on_disconnection" under the
 * _kern_nvmf sysctl node; read/write and settable as a tunable.
 */
bool nvmf_fail_disconnect = false;
SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
&nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure");
/* Malloc type used for the allocations made in this file. */
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
/* Prototypes for handlers that are referenced before they are defined. */
static void nvmf_disconnect_task(void *arg, int pending);
static void nvmf_shutdown_pre_sync(void *arg, int howto);
static void nvmf_shutdown_post_sync(void *arg, int howto);
/*
 * Command completion callback.  Saves a copy of the completion queue
 * entry in the caller's status object, flags the command as done, and
 * wakes any thread sleeping on the status object.
 *
 * arg is the struct nvmf_completion_status supplied when the command
 * was issued.
 */
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_completion_status *st;
	struct mtx *lock;

	st = arg;
	st->cqe = *cqe;

	/* Same pool mutex (keyed on the status) that the waiter sleeps on. */
	lock = mtx_pool_find(mtxpool_sleep, st);
	mtx_lock(lock);
	st->done = true;
	mtx_unlock(lock);

	wakeup(st);
}
void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
struct nvmf_completion_status *status = arg;
struct mtx *mtx;
status->io_error = error;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
status->io_done = true;
mtx_unlock(mtx);
wakeup(status);
}
/*
 * Sleep until both the command completion (done) and the data transfer
 * completion (io_done) have been flagged on the status object.
 */
void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
	struct mtx *lock;

	lock = mtx_pool_find(mtxpool_sleep, status);
	mtx_lock(lock);
	while (!(status->done && status->io_done))
		mtx_sleep(status, lock, 0, "nvmfcmd", 0);
	mtx_unlock(lock);
}
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t *value)
{
const struct nvmf_fabric_prop_get_rsp *rsp;
struct nvmf_completion_status status;
nvmf_status_init(&status);
if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
M_WAITOK))
return (ECONNABORTED);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
if (size == 8)
*value = le64toh(rsp->value.u64);
else
*value = le32toh(rsp->value.u32.low);
return (0);
}
static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t value)
{
struct nvmf_completion_status status;
nvmf_status_init(&status);
if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
M_WAITOK))
return (ECONNABORTED);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
return (0);
}
/*
 * Request a normal controller shutdown: read CC, set the SHN field to
 * NVME_SHN_NORMAL, and write CC back.  Failures are logged and
 * otherwise ignored.
 */
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
	uint64_t cc;

	if (nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc) != 0) {
		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
		return;
	}

	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);

	if (nvmf_write_property(sc, NVMF_PROP_CC, 4, cc) != 0)
		device_printf(sc->dev,
		    "Failed to set CC to trigger shutdown\n");
}
/*
 * Receive-side KeepAlive timer.  If the rx-traffic flag was set since
 * the last check, clear it and re-arm the timer; otherwise treat the
 * association as dead and schedule a disconnect.
 */
static void
nvmf_check_keep_alive(void *arg)
{
	struct nvmf_softc *sc;

	sc = arg;
	if (atomic_readandclear_int(&sc->ka_active_rx_traffic) != 0) {
		callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    C_HARDCLOCK);
		return;
	}

	device_printf(sc->dev, "disconnecting due to KeepAlive timeout\n");
	nvmf_disconnect(sc);
}
/*
 * Completion callback for a KeepAlive command.  Any response counts as
 * received traffic; a non-zero status is only logged.
 */
static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
	struct nvmf_softc *sc;

	sc = arg;
	atomic_store_int(&sc->ka_active_rx_traffic, 1);

	if (cqe->status == 0)
		return;
	device_printf(sc->dev, "KeepAlive response reported status %#x\n",
	    le16toh(cqe->status));
}
/*
 * Transmit-side KeepAlive timer.  Sends a KeepAlive command unless the
 * tx-traffic flag shows that another command was already sent during
 * this interval, then clears the flag and re-arms the timer.
 */
static void
nvmf_send_keep_alive(void *arg)
{
	struct nvmf_softc *sc;

	sc = arg;

	/* Skip the explicit KeepAlive when other traffic was sent. */
	if (atomic_load_int(&sc->ka_active_tx_traffic) == 0) {
		if (!nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc,
		    M_NOWAIT))
			device_printf(sc->dev,
			    "Failed to allocate KeepAlive command\n");
	}

	/* Reset the flag only after the KeepAlive attempt above. */
	atomic_store_int(&sc->ka_active_tx_traffic, 0);

	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}
/*
 * Unpack and validate a queue handoff nvlist supplied through an
 * ioctl.
 *
 * The nvlist must contain a "trtype" number, an "admin" nvlist that
 * validates as a queue pair and has its "admin" flag set, a non-empty
 * "io" nvlist array whose entries all validate and share one "qsize",
 * and a "cdata" binary of exactly sizeof(struct nvme_controller_data).
 *
 * On success the nvlist is returned via *nvlp and the caller becomes
 * responsible for destroying it.  Returns the unpack error, or EINVAL
 * (after destroying the nvlist) if validation fails.
 */
int
nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp)
{
	const nvlist_t *const *ioq;
	const nvlist_t *adminq;
	nvlist_t *nvl;
	size_t cdata_len, idx, nioq;
	uint32_t qsize;
	int error;

	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	/* All four top-level keys are mandatory. */
	if (!(nvlist_exists_number(nvl, "trtype") &&
	    nvlist_exists_nvlist(nvl, "admin") &&
	    nvlist_exists_nvlist_array(nvl, "io") &&
	    nvlist_exists_binary(nvl, "cdata")))
		goto invalid;

	adminq = nvlist_get_nvlist(nvl, "admin");
	if (!nvmf_validate_qpair_nvlist(adminq, false))
		goto invalid;
	if (!nvlist_get_bool(adminq, "admin"))
		goto invalid;

	ioq = nvlist_get_nvlist_array(nvl, "io", &nioq);
	if (nioq < 1)
		goto invalid;
	for (idx = 0; idx < nioq; idx++) {
		if (!nvmf_validate_qpair_nvlist(ioq[idx], false))
			goto invalid;
	}

	/* Every I/O queue must match the size of the first one. */
	qsize = nvlist_get_number(ioq[0], "qsize");
	for (idx = 1; idx < nioq; idx++) {
		if (nvlist_get_number(ioq[idx], "qsize") != qsize)
			goto invalid;
	}

	/* Only the length of the controller data is checked here. */
	nvlist_get_binary(nvl, "cdata", &cdata_len);
	if (cdata_len != sizeof(struct nvme_controller_data))
		goto invalid;

	*nvlp = nvl;
	return (0);

invalid:
	nvlist_destroy(nvl);
	return (EINVAL);
}
/*
 * Device probe method.  Devices without a handoff nvlist in their
 * ivars are rejected; otherwise the description is set from the
 * subsystem NQN in the handoff's controller data.
 */
static int
nvmf_probe(device_t dev)
{
	const struct nvme_controller_data *cdata;
	const nvlist_t *nvl;

	nvl = device_get_ivars(dev);
	if (nvl == NULL)
		return (ENXIO);

	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn);

	return (BUS_PROBE_DEFAULT);
}
/*
 * Build the kernel side of an association from a handoff nvlist: set
 * up the admin queue and every I/O queue, adopt the controller data
 * carried in the handoff, and start the KeepAlive timers if a "kato"
 * value was supplied.
 *
 * The nvlist is read with the non-defaulting getters for "trtype",
 * "admin", "io" and "cdata", so it is expected to have passed
 * nvmf_copyin_handoff()-style validation already.
 *
 * Returns 0 on success or ENXIO if a queue pair could not be created.
 * On failure any queue pairs created so far are left in sc->admin /
 * sc->io for the caller to tear down, and sc->cdata is not modified.
 */
static int
nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl)
{
	const nvlist_t *const *io;
	const nvlist_t *admin;
	uint64_t kato;
	size_t num_io_queues;
	enum nvmf_trtype trtype;
	char name[16];

	trtype = nvlist_get_number(nvl, "trtype");
	admin = nvlist_get_nvlist(nvl, "admin");
	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
	kato = dnvlist_get_number(nvl, "kato", 0);

	/* Setup the admin queue. */
	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
	if (sc->admin == NULL) {
		device_printf(sc->dev, "Failed to setup admin queue\n");
		return (ENXIO);
	}

	/* Setup I/O queues. */
	sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF,
	    M_WAITOK | M_ZERO);
	sc->num_io_queues = num_io_queues;
	for (u_int i = 0; i < sc->num_io_queues; i++) {
		snprintf(name, sizeof(name), "I/O queue %u", i);
		sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i);
		if (sc->io[i] == NULL) {
			device_printf(sc->dev, "Failed to setup I/O queue %u\n",
			    i + 1);
			return (ENXIO);
		}
	}

	/*
	 * Adopt the controller data from the handoff before anything
	 * below consults sc->cdata.  During the initial attach the
	 * buffer is freshly allocated and not yet initialized, and on
	 * reconnect it still describes the previous association, so
	 * the TBKAS check must not run ahead of this copy.
	 */
	memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL),
	    sizeof(*sc->cdata));

	/* Start KeepAlive timers. */
	if (kato != 0) {
		sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
		    sc->cdata->ctratt) != 0;
		sc->ka_rx_sbt = mstosbt(kato);
		/* Send at twice the rate at which the receive side checks. */
		sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
		callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
		    nvmf_check_keep_alive, sc, C_HARDCLOCK);
		callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
		    nvmf_send_keep_alive, sc, C_HARDCLOCK);
	}

	return (0);
}
/*
 * Callback type used by the active namespace scan.  It is invoked once
 * per active namespace with the softc, the namespace ID, the
 * (byte-swapped) IDENTIFY namespace data, and the caller's opaque
 * argument.  Returning false aborts the scan.
 */
typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t,
const struct nvme_namespace_data *, void *);
/*
 * Fetch one page of the active namespace list, using *nsidp as the
 * cursor, and run IDENTIFY namespace plus the callback on every entry.
 *
 * nslist and data are caller-provided scratch buffers.  On success
 * *nsidp is updated: 0 means the scan is complete (a zero entry ended
 * the page, or the last NSID is at the top of the NSID space);
 * otherwise it holds the last NSID seen, to be used as the cursor for
 * the next page.
 *
 * Returns false if a command could not be sent, a command failed, or
 * the callback asked to stop.
 */
static bool
nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
    struct nvme_namespace_data *data, uint32_t *nsidp,
    nvmf_scan_active_ns_cb *cb, void *cb_arg)
{
	struct nvmf_completion_status st;
	uint32_t nsid;
	u_int slot;

	nvmf_status_init(&st);
	nvmf_status_wait_io(&st);
	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
	    nvmf_complete, &st, nvmf_io_complete, &st, M_WAITOK)) {
		device_printf(sc->dev,
		    "failed to send IDENTIFY active namespaces command\n");
		return (false);
	}
	nvmf_wait_for_reply(&st);

	if (st.cqe.status != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed, status %#x\n",
		    le16toh(st.cqe.status));
		return (false);
	}
	if (st.io_error != 0) {
		device_printf(sc->dev,
		    "IDENTIFY active namespaces failed with I/O error %d\n",
		    st.io_error);
		return (false);
	}

	for (slot = 0; slot < nitems(nslist->ns); slot++) {
		nsid = nslist->ns[slot];

		/* A zero entry terminates the list. */
		if (nsid == 0) {
			*nsidp = 0;
			return (true);
		}

		nvmf_status_init(&st);
		nvmf_status_wait_io(&st);
		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
		    &st, nvmf_io_complete, &st, M_WAITOK)) {
			device_printf(sc->dev,
			    "failed to send IDENTIFY namespace %u command\n",
			    nsid);
			return (false);
		}
		nvmf_wait_for_reply(&st);

		if (st.cqe.status != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
			    le16toh(st.cqe.status));
			return (false);
		}
		if (st.io_error != 0) {
			device_printf(sc->dev,
			    "IDENTIFY namespace %u failed with I/O error %d\n",
			    nsid, st.io_error);
			return (false);
		}

		nvme_namespace_data_swapbytes(data);

		if (!cb(sc, nsid, data, cb_arg))
			return (false);
	}

	/* The page was completely full; nsid is its final entry. */
	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);

	*nsidp = (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1) ? 0 : nsid;
	return (true);
}
/*
 * Walk the controller's entire active namespace list, page by page,
 * invoking cb on every active namespace.  Scratch buffers for the list
 * page and the namespace data are allocated here and released before
 * returning.
 *
 * Returns true if the whole list was scanned, false if any page failed
 * or the callback aborted the scan.
 */
static bool
nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb,
    void *cb_arg)
{
	struct nvme_namespace_data *nsdata;
	struct nvme_ns_list *page;
	uint32_t cursor;
	bool ok;

	page = malloc(sizeof(*page), M_NVMF, M_WAITOK);
	nsdata = malloc(sizeof(*nsdata), M_NVMF, M_WAITOK);

	/* A cursor of 0 after a successful page means the scan is done. */
	cursor = 0;
	do {
		ok = nvmf_scan_active_nslist(sc, page, nsdata, &cursor, cb,
		    cb_arg);
	} while (ok && cursor != 0);

	free(nsdata, M_NVMF);
	free(page, M_NVMF);
	return (ok);
}
/*
 * Active-namespace scan callback used during attach: create the
 * namespace object for 'nsid' and ask the SIM to rescan it.
 *
 * Returns true to continue the scan and false to abort it.  A
 * duplicate NSID aborts the scan.  An NSID outside 1..cdata->nn or a
 * namespace with zero size is logged and skipped.
 */
static bool
nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg __unused)
{
	/*
	 * sc->ns[] holds cdata->nn entries indexed by nsid - 1.  The
	 * NSID comes from the remote controller, so refuse to index
	 * outside the array rather than trusting it.
	 */
	if (nsid == 0 || nsid > sc->cdata->nn) {
		device_printf(sc->dev,
		    "ignoring active namespace %u outside valid range\n",
		    nsid);
		return (true);
	}

	if (sc->ns[nsid - 1] != NULL) {
		device_printf(sc->dev,
		    "duplicate namespace %u in active namespace list\n",
		    nsid);
		return (false);
	}

	/*
	 * As in nvme_ns_construct, a size of zero indicates an
	 * invalid namespace.
	 */
	if (data->nsze == 0) {
		device_printf(sc->dev,
		    "ignoring active namespace %u with zero size\n", nsid);
		return (true);
	}

	sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
	nvmf_sim_rescan_ns(sc, nsid);
	return (true);
}
/*
 * Allocate the zeroed namespace table (one slot per NSID up to
 * cdata->nn) and populate it from the controller's active namespace
 * list.  Returns false if the scan failed.
 */
static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
	bool scanned;

	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
	    M_WAITOK | M_ZERO);
	scanned = nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL);
	return (scanned);
}
/*
 * Device attach method.  Initializes the softc, establishes the
 * association described by the handoff nvlist in the device ivars,
 * reads the CAP and VS properties, sets up the SIM, starts AER
 * handling, discovers namespaces, creates the control device node,
 * and registers the shutdown event handlers.
 *
 * Returns 0 on success.  On any failure everything set up so far is
 * torn down under the "out" label and a non-zero errno is returned.
 */
static int
nvmf_attach(device_t dev)
{
	struct make_dev_args mda;
	struct nvmf_softc *sc = device_get_softc(dev);
	const nvlist_t *nvl = device_get_ivars(dev);
	const nvlist_t * const *io;
	struct sysctl_oid *oid;
	uint64_t val;
	u_int i;
	int error;

	if (nvl == NULL)
		return (ENXIO);

	sc->dev = dev;
	sc->trtype = nvlist_get_number(nvl, "trtype");
	callout_init(&sc->ka_rx_timer, 1);
	callout_init(&sc->ka_tx_timer, 1);
	sx_init(&sc->connection_lock, "nvmf connection");
	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);

	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues");
	sc->ioq_oid_list = SYSCTL_CHILDREN(oid);

	/* Filled in from the handoff by nvmf_establish_connection(). */
	sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK);

	nvmf_init_aer(sc);

	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch CAP\n");
		error = ENXIO;
		goto out;
	}

	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
	if (error != 0) {
		device_printf(sc->dev, "Failed to fetch VS\n");
		error = ENXIO;
		goto out;
	}
	sc->vs = val;

	/*
	 * Honor MDTS if it is set.
	 *
	 * NOTE(review): the shift is evaluated as an int; a very large
	 * MDTS from the controller could exceed its width -- confirm
	 * whether the value needs clamping.
	 */
	sc->max_xfer_size = maxphys;
	if (sc->cdata->mdts != 0) {
		sc->max_xfer_size = ulmin(sc->max_xfer_size,
		    1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
		    NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
	}

	/* All I/O queues share one size, so the first one is enough. */
	io = nvlist_get_nvlist_array(nvl, "io", NULL);
	sc->max_pending_io = nvlist_get_number(io[0], "qsize") *
	    sc->num_io_queues;

	error = nvmf_init_sim(sc);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	if (!nvmf_add_namespaces(sc)) {
		/*
		 * nvmf_add_namespaces() only reports success/failure;
		 * 'error' is still 0 from nvmf_start_aer(), so set a
		 * real errno or attach would report success after
		 * tearing everything down.
		 */
		error = ENXIO;
		nvmf_destroy_sim(sc);
		goto out;
	}

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = sc;
	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
	if (error != 0) {
		nvmf_destroy_sim(sc);
		goto out;
	}

	sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST);
	sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST);
	return (0);

out:
	/* Namespaces first (only present if the scan got that far). */
	if (sc->ns != NULL) {
		for (i = 0; i < sc->cdata->nn; i++) {
			if (sc->ns[i] != NULL)
				nvmf_destroy_ns(sc->ns[i]);
		}
		free(sc->ns, M_NVMF);
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	/* Ask the controller to shut down while the admin queue exists. */
	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	/* I/O queue slots may be NULL if queue setup failed part way. */
	for (i = 0; i < sc->num_io_queues; i++) {
		if (sc->io[i] != NULL)
			nvmf_destroy_qp(sc->io[i]);
	}
	free(sc->io, M_NVMF);
	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (error);
}
/*
 * Request an asynchronous disconnect by queueing the softc's
 * disconnect task on the shared thread taskqueue.
 */
void
nvmf_disconnect(struct nvmf_softc *sc)
{
	struct task *disc;

	disc = &sc->disconnect_task;
	taskqueue_enqueue(taskqueue_thread, disc);
}
/*
 * Taskqueue handler that tears down the current association.
 *
 * With no active association it does nothing.  While the device is
 * detaching, or before the control device exists (i.e. during attach),
 * it only shuts down the admin queue and leaves the rest to the
 * detach/attach code.  Otherwise it stops the KeepAlive timers,
 * quiesces the SIM and namespaces, and destroys all queue pairs,
 * leaving the softc in the disconnected state (admin == NULL).
 */
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
	struct nvmf_softc *sc;
	u_int idx;

	sc = arg;
	sx_xlock(&sc->connection_lock);

	/* Ignore transport errors if there is no active association. */
	if (sc->admin == NULL)
		goto done;

	/*
	 * A transport error during detach, or during attach (e.g. in
	 * nvmf_add_namespaces, before the cdev is created): shutting
	 * down the admin queue unsticks whichever of those is running.
	 */
	if (sc->detaching || sc->cdev == NULL) {
		nvmf_shutdown_qp(sc->admin);
		goto done;
	}

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);
	sc->ka_traffic = false;

	/* Quiesce namespace consumers. */
	nvmf_disconnect_sim(sc);
	for (idx = 0; idx < sc->cdata->nn; idx++) {
		if (sc->ns[idx] != NULL)
			nvmf_disconnect_ns(sc->ns[idx]);
	}

	/* Shutdown the existing qpairs. */
	for (idx = 0; idx < sc->num_io_queues; idx++)
		nvmf_destroy_qp(sc->io[idx]);
	free(sc->io, M_NVMF);
	sc->io = NULL;
	sc->num_io_queues = 0;

	nvmf_destroy_qp(sc->admin);
	sc->admin = NULL;

done:
	sx_xunlock(&sc->connection_lock);
}
/*
 * NVMF_RECONNECT_HOST ioctl handler: attach a new association, handed
 * off from userspace, to a controller that is currently disconnected.
 *
 * Returns 0 on success; the error from nvmf_copyin_handoff() if the
 * handoff is malformed; EINVAL if the transport type or subsystem NQN
 * does not match the existing controller; EBUSY if an association is
 * still active or the device is detaching; or the error from
 * establishing the connection or restarting AER handling.
 *
 * The handoff nvlist is always destroyed before returning.
 */
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
	const struct nvme_controller_data *cdata;
	nvlist_t *nvl;
	u_int i;
	int error;

	error = nvmf_copyin_handoff(nv, &nvl);
	if (error != 0)
		return (error);

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != nvlist_get_number(nvl, "trtype")) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		/* We own nvl from here on; don't leak it on early exit. */
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	cdata = nvlist_get_binary(nvl, "cdata", NULL);
	if (memcmp(sc->cdata->subnqn, cdata->subnqn,
	    sizeof(cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 *
	 * NOTE(review): if either call below fails, the queue pairs
	 * created by nvmf_establish_connection() appear to stay in
	 * sc->admin / sc->io -- confirm whether they need to be torn
	 * down here.
	 */
	error = nvmf_establish_connection(sc, nvl);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);

	nvmf_rescan_all_ns(sc);
out:
	sx_xunlock(&sc->connection_lock);
	nvlist_destroy(nvl);
	return (error);
}
/*
 * shutdown_pre_sync event handler.  If the association is currently
 * disconnected (and the device is not detaching), shut down the
 * namespaces and the SIM so that pending requests are aborted with an
 * error and filesystems can unmount without hanging.
 *
 * Does nothing for RB_NOSYNC shutdowns or once the scheduler is
 * stopped.
 */
static void
nvmf_shutdown_pre_sync(void *arg, int howto)
{
	struct nvmf_softc *sc;
	u_int idx;

	sc = arg;
	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	sx_xlock(&sc->connection_lock);
	if (sc->admin == NULL && !sc->detaching) {
		for (idx = 0; idx < sc->cdata->nn; idx++) {
			if (sc->ns[idx] != NULL)
				nvmf_shutdown_ns(sc->ns[idx]);
		}
		nvmf_shutdown_sim(sc);
	}
	sx_xunlock(&sc->connection_lock);
}
/*
 * shutdown_post_sync event handler.  If the association is currently
 * connected (and the device is not detaching), disconnect gracefully:
 * stop the KeepAlive timers, request a controller shutdown, and
 * destroy all queue pairs.
 *
 * Does nothing for RB_NOSYNC shutdowns or once the scheduler is
 * stopped.
 *
 * NOTE(review): unlike nvmf_disconnect_task(), sc->io and
 * sc->num_io_queues are left untouched here, so they keep referring
 * to destroyed queue pairs -- confirm nothing can use them afterwards.
 */
static void
nvmf_shutdown_post_sync(void *arg, int howto)
{
	struct nvmf_softc *sc;
	u_int idx;

	sc = arg;
	if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED())
		return;

	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL && !sc->detaching) {
		callout_drain(&sc->ka_tx_timer);
		callout_drain(&sc->ka_rx_timer);

		nvmf_shutdown_controller(sc);

		for (idx = 0; idx < sc->num_io_queues; idx++)
			nvmf_destroy_qp(sc->io[idx]);
		nvmf_destroy_qp(sc->admin);
		sc->admin = NULL;
	}
	sx_xunlock(&sc->connection_lock);
}
/*
 * Device detach method.  Removes the control device, marks the softc
 * as detaching, deregisters the shutdown handlers, and then tears down
 * the SIM, namespaces, KeepAlive timers, queue pairs, AER state and
 * remaining softc resources.  Always returns 0.
 */
static int
nvmf_detach(device_t dev)
{
	struct nvmf_softc *sc;
	u_int idx;

	sc = device_get_softc(dev);

	destroy_dev(sc->cdev);

	/* From here on the disconnect task only shuts down the admin qp. */
	sx_xlock(&sc->connection_lock);
	sc->detaching = true;
	sx_xunlock(&sc->connection_lock);

	EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh);
	EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh);

	nvmf_destroy_sim(sc);
	for (idx = 0; idx < sc->cdata->nn; idx++) {
		if (sc->ns[idx] != NULL)
			nvmf_destroy_ns(sc->ns[idx]);
	}
	free(sc->ns, M_NVMF);

	callout_drain(&sc->ka_tx_timer);
	callout_drain(&sc->ka_rx_timer);

	/* Only a connected controller can be asked to shut down. */
	if (sc->admin != NULL)
		nvmf_shutdown_controller(sc);

	for (idx = 0; idx < sc->num_io_queues; idx++)
		nvmf_destroy_qp(sc->io[idx]);
	free(sc->io, M_NVMF);

	/* Let any queued disconnect task finish before the admin qp goes. */
	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);

	if (sc->admin != NULL)
		nvmf_destroy_qp(sc->admin);

	nvmf_destroy_aer(sc);

	sx_destroy(&sc->connection_lock);
	free(sc->cdata, M_NVMF);
	return (0);
}
/*
 * Reconcile the namespace table entry for 'nsid' with freshly fetched
 * IDENTIFY data, then ask the SIM to rescan that namespace:
 *  - zero size: destroy any existing namespace object;
 *  - non-zero size, no object yet: create one;
 *  - non-zero size, object exists: update it, destroying it if the
 *    update is rejected.
 */
static void
nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data)
{
	struct nvmf_namespace *cur;

	/* XXX: Needs locking around sc->ns[]. */
	cur = sc->ns[nsid - 1];

	if (data->nsze == 0) {
		/* XXX: Needs locking */
		if (cur != NULL) {
			nvmf_destroy_ns(cur);
			sc->ns[nsid - 1] = NULL;
		}
	} else if (cur == NULL) {
		/* XXX: Needs locking */
		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
	} else if (!nvmf_update_ns(cur, data)) {
		/* XXX: Needs locking */
		nvmf_destroy_ns(cur);
		sc->ns[nsid - 1] = NULL;
	}

	nvmf_sim_rescan_ns(sc, nsid);
}
void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
struct nvmf_completion_status status;
struct nvme_namespace_data *data;
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
&status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY namespace %u command\n", nsid);
free(data, M_NVMF);
return;
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed, status %#x\n", nsid,
le16toh(status.cqe.status));
free(data, M_NVMF);
return;
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed with I/O error %d\n",
nsid, status.io_error);
free(data, M_NVMF);
return;
}
nvme_namespace_data_swapbytes(data);
nvmf_rescan_ns_1(sc, nsid, data);
free(data, M_NVMF);
}
/*
 * Destroy every namespace object whose NSID lies in the half-open
 * range [first_nsid, next_valid_nsid) and ask the SIM to rescan each
 * one that was removed.
 */
static void
nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid,
    uint32_t next_valid_nsid)
{
	struct nvmf_namespace *stale;
	uint32_t id;

	for (id = first_nsid; id < next_valid_nsid; id++) {
		/* XXX: Needs locking around sc->ns[]. */
		stale = sc->ns[id - 1];
		if (stale == NULL)
			continue;

		nvmf_destroy_ns(stale);
		sc->ns[id - 1] = NULL;
		nvmf_sim_rescan_ns(sc, id);
	}
}
/*
 * Active-namespace scan callback used for a full rescan.  arg points
 * at the last NSID processed; any namespace objects in the gap between
 * it and this NSID are purged, then this namespace is reconciled.
 * Always returns true so the scan continues.
 */
static bool
nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid,
    const struct nvme_namespace_data *data, void *arg)
{
	uint32_t *prev;

	prev = arg;

	/* Anything between the previous active NSID and this one is gone. */
	nvmf_purge_namespaces(sc, *prev + 1, nsid);
	*prev = nsid;

	nvmf_rescan_ns_1(sc, nsid, data);
	return (true);
}
/*
 * Rescan every namespace: reconcile each active namespace reported by
 * the controller and purge namespace objects that are no longer in the
 * active list.  If the scan fails, the trailing purge is skipped.
 */
void
nvmf_rescan_all_ns(struct nvmf_softc *sc)
{
	uint32_t highest;

	highest = 0;
	if (nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &highest)) {
		/* Drop namespace devices beyond the last active NSID. */
		nvmf_purge_namespaces(sc, highest + 1, sc->cdata->nn + 1);
	}
}
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
bool admin)
{
struct nvmf_completion_status status;
struct nvme_command cmd;
struct memdesc mem;
struct nvmf_host_qpair *qp;
struct nvmf_request *req;
void *buf;
int error;
if (pt->len > sc->max_xfer_size)
return (EINVAL);
buf = NULL;
if (pt->len != 0) {
/*
* XXX: Depending on the size we may want to pin the
* user pages and use a memdesc with vm_page_t's
* instead.
*/
buf = malloc(pt->len, M_NVMF, M_WAITOK);
if (pt->is_read == 0) {
error = copyin(pt->buf, buf, pt->len);
if (error != 0) {
free(buf, M_NVMF);
return (error);
}
} else {
/* Ensure no kernel data is leaked to userland. */
memset(buf, 0, pt->len);
}
}
memset(&cmd, 0, sizeof(cmd));
cmd.opc = pt->cmd.opc;
cmd.fuse = pt->cmd.fuse;
cmd.nsid = pt->cmd.nsid;
cmd.cdw10 = pt->cmd.cdw10;
cmd.cdw11 = pt->cmd.cdw11;
cmd.cdw12 = pt->cmd.cdw12;
cmd.cdw13 = pt->cmd.cdw13;
cmd.cdw14 = pt->cmd.cdw14;
cmd.cdw15 = pt->cmd.cdw15;
sx_slock(&sc->connection_lock);
if (sc->admin == NULL || sc->detaching) {
device_printf(sc->dev,
"failed to send passthrough command\n");
error = ECONNABORTED;
sx_sunlock(&sc->connection_lock);
goto error;
}
if (admin)
qp = sc->admin;
else
qp = nvmf_select_io_queue(sc);
nvmf_status_init(&status);
req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
sx_sunlock(&sc->connection_lock);
if (req == NULL) {
device_printf(sc->dev, "failed to send passthrough command\n");
error = ECONNABORTED;
goto error;
}
if (pt->len != 0) {
mem = memdesc_vaddr(buf, pt->len);
nvmf_capsule_append_data(req->nc, &mem, pt->len,
pt->is_read == 0, nvmf_io_complete, &status);
nvmf_status_wait_io(&status);
}
nvmf_submit_request(req);
nvmf_wait_for_reply(&status);
memset(&pt->cpl, 0, sizeof(pt->cpl));
pt->cpl.cdw0 = status.cqe.cdw0;
pt->cpl.status = status.cqe.status;
error = status.io_error;
if (error == 0 && pt->len != 0 && pt->is_read != 0)
error = copyout(buf, pt->buf, pt->len);
error:
free(buf, M_NVMF);
return (error);
}
static int
nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv)
{
nvlist_t *nvl;
int error;
nvl = nvlist_create(0);
sx_slock(&sc->connection_lock);
if ((sc->cdata->fcatt & 1) == 0)
nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC);
else
nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id);
nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn);
sx_sunlock(&sc->connection_lock);
error = nvmf_pack_ioc_nvlist(nvl, nv);
nvlist_destroy(nvl);
return (error);
}
/*
 * ioctl entry point for the per-controller control device.
 *
 * Handles admin passthrough commands, NSID and maximum transfer size
 * queries, and the reconnect parameter/handoff requests.  Unknown
 * commands return ENOTTY.
 */
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_softc *sc;
	struct nvme_get_nsid *nsid_req;

	sc = cdev->si_drv1;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		/* This node only issues admin commands. */
		return (nvmf_passthrough_cmd(sc,
		    (struct nvme_pt_command *)arg, true));
	case NVME_GET_NSID:
		/* The controller node reports its own name and NSID 0. */
		nsid_req = (struct nvme_get_nsid *)arg;
		strlcpy(nsid_req->cdev, device_get_nameunit(sc->dev),
		    sizeof(nsid_req->cdev));
		nsid_req->nsid = 0;
		return (0);
	case NVME_GET_MAX_XFER_SIZE:
		*(uint64_t *)arg = sc->max_xfer_size;
		return (0);
	case NVMF_RECONNECT_PARAMS:
		return (nvmf_reconnect_params(sc, (struct nvmf_ioc_nv *)arg));
	case NVMF_RECONNECT_HOST:
		return (nvmf_reconnect_host(sc, (struct nvmf_ioc_nv *)arg));
	default:
		return (ENOTTY);
	}
}
/*
 * Character device switch for the per-controller control device.
 * Only ioctl is implemented.
 */
static struct cdevsw nvmf_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = nvmf_ioctl
};
/*
 * Module event handler.  Load and unload are forwarded to the nvmf
 * control code; on unload, outstanding cdev destruction for this
 * driver's cdevsw is drained as well.  Quiesce always succeeds and any
 * other event is rejected with EOPNOTSUPP.
 */
static int
nvmf_modevent(module_t mod, int what, void *arg)
{
	if (what == MOD_LOAD)
		return (nvmf_ctl_load());

	if (what == MOD_QUIESCE)
		return (0);

	if (what == MOD_UNLOAD) {
		nvmf_ctl_unload();
		destroy_dev_drain(&nvmf_cdevsw);
		return (0);
	}

	return (EOPNOTSUPP);
}
/* Device method table: probe, attach and detach entry points. */
static device_method_t nvmf_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, nvmf_probe),
DEVMETHOD(device_attach, nvmf_attach),
DEVMETHOD(device_detach, nvmf_detach),
DEVMETHOD_END
};
/*
 * Driver description: devices are named "nvme" and each carries a
 * struct nvmf_softc.
 */
driver_t nvme_nvmf_driver = {
"nvme",
nvmf_methods,
sizeof(struct nvmf_softc),
};
/* Register the driver on the root bus with nvmf_modevent as its event handler. */
DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
/* Declare the dependency on the nvmf_transport module (version 1). */
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);