opnsense-src/sys/dev/nvmf/controller/nvmft_controller.c
John Baldwin a15f7c96a2 nvmft: The in-kernel NVMe over Fabrics controller
This is the server (target in SCSI terms) for NVMe over Fabrics.
Userland is responsible for accepting a new queue pair and receiving
the initial Connect command before handing the queue pair off via an
ioctl to this CTL frontend.

This frontend exposes CTL LUNs as NVMe namespaces to remote hosts.
Users can ask LUNS to CTL that can be shared via either iSCSI or
NVMeoF.

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44726
2024-05-02 16:38:30 -07:00

1130 lines
28 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/controller/nvmft_subr.h>
#include <dev/nvmf/controller/nvmft_var.h>
static void nvmft_controller_shutdown(void *arg, int pending);
static void nvmft_controller_terminate(void *arg, int pending);
int
nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
{
char buf[128];
struct sbuf sb;
va_list ap;
size_t retval;
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_printf_drain, &retval);
sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid);
va_start(ap, fmt);
sbuf_vprintf(&sb, fmt, ap);
va_end(ap);
sbuf_finish(&sb);
sbuf_delete(&sb);
return (retval);
}
static struct nvmft_controller *
nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid,
const struct nvmf_fabric_connect_data *data)
{
struct nvmft_controller *ctrlr;
ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO);
ctrlr->cntlid = cntlid;
nvmft_port_ref(np);
TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link);
ctrlr->np = np;
mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF);
callout_init(&ctrlr->ka_timer, 1);
TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr);
TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0,
nvmft_controller_terminate, ctrlr);
ctrlr->cdata = np->cdata;
ctrlr->cdata.ctrlr_id = htole16(cntlid);
memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid));
memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn));
ctrlr->hip.power_cycles[0] = 1;
ctrlr->create_time = sbinuptime();
ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT,
M_WAITOK | M_ZERO);
return (ctrlr);
}
static void
nvmft_controller_free(struct nvmft_controller *ctrlr)
{
mtx_destroy(&ctrlr->lock);
MPASS(ctrlr->io_qpairs == NULL);
free(ctrlr->changed_ns, M_NVMFT);
free(ctrlr, M_NVMFT);
}
static void
nvmft_keep_alive_timer(void *arg)
{
struct nvmft_controller *ctrlr = arg;
int traffic;
if (ctrlr->shutdown)
return;
traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic);
if (traffic == 0) {
nvmft_printf(ctrlr,
"disconnecting due to KeepAlive timeout\n");
nvmft_controller_error(ctrlr, NULL, ETIMEDOUT);
return;
}
callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK);
}
int
nvmft_handoff_admin_queue(struct nvmft_port *np,
const struct nvmf_handoff_controller_qpair *handoff,
const struct nvmf_fabric_connect_cmd *cmd,
const struct nvmf_fabric_connect_data *data)
{
struct nvmft_controller *ctrlr;
struct nvmft_qpair *qp;
uint32_t kato;
int cntlid;
if (cmd->qid != htole16(0))
return (EINVAL);
qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0,
"admin queue");
sx_xlock(&np->lock);
cntlid = alloc_unr(np->ids);
if (cntlid == -1) {
sx_xunlock(&np->lock);
printf("NVMFT: Unable to allocate controller for %.*s\n",
(int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC,
NVMF_FABRIC_SC_INVALID_HOST);
nvmft_qpair_destroy(qp);
return (ENOMEM);
}
#ifdef INVARIANTS
TAILQ_FOREACH(ctrlr, &np->controllers, link) {
KASSERT(ctrlr->cntlid != cntlid,
("%s: duplicate controllers with id %d", __func__, cntlid));
}
#endif
ctrlr = nvmft_controller_alloc(np, cntlid, data);
nvmft_printf(ctrlr, "associated with %.*s\n",
(int)sizeof(data->hostnqn), data->hostnqn);
ctrlr->admin = qp;
ctrlr->trtype = handoff->trtype;
/*
* The spec requires a non-zero KeepAlive timer, but allow a
* zero KATO value to match Linux.
*/
kato = le32toh(cmd->kato);
if (kato != 0) {
/*
* Round up to 1 second matching granularity
* advertised in cdata.
*/
ctrlr->ka_sbt = mstosbt(roundup(kato, 1000));
callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK);
}
nvmft_finish_accept(qp, cmd, ctrlr);
sx_xunlock(&np->lock);
return (0);
}
int
nvmft_handoff_io_queue(struct nvmft_port *np,
const struct nvmf_handoff_controller_qpair *handoff,
const struct nvmf_fabric_connect_cmd *cmd,
const struct nvmf_fabric_connect_data *data)
{
struct nvmft_controller *ctrlr;
struct nvmft_qpair *qp;
char name[16];
uint16_t cntlid, qid;
qid = le16toh(cmd->qid);
if (qid == 0)
return (EINVAL);
cntlid = le16toh(data->cntlid);
snprintf(name, sizeof(name), "I/O queue %u", qid);
qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name);
sx_slock(&np->lock);
TAILQ_FOREACH(ctrlr, &np->controllers, link) {
if (ctrlr->cntlid == cntlid)
break;
}
if (ctrlr == NULL) {
sx_sunlock(&np->lock);
printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n",
ctrlr->cntlid, qid, (int)sizeof(data->hostnqn),
data->hostnqn);
nvmft_connect_invalid_parameters(qp, cmd, true,
offsetof(struct nvmf_fabric_connect_data, cntlid));
nvmft_qpair_destroy(qp);
return (ENOENT);
}
if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) {
sx_sunlock(&np->lock);
nvmft_printf(ctrlr,
"hostid mismatch for I/O queue %u from %.*s\n", qid,
(int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_invalid_parameters(qp, cmd, true,
offsetof(struct nvmf_fabric_connect_data, hostid));
nvmft_qpair_destroy(qp);
return (EINVAL);
}
if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) {
sx_sunlock(&np->lock);
nvmft_printf(ctrlr,
"hostnqn mismatch for I/O queue %u from %.*s\n", qid,
(int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_invalid_parameters(qp, cmd, true,
offsetof(struct nvmf_fabric_connect_data, hostnqn));
nvmft_qpair_destroy(qp);
return (EINVAL);
}
/* XXX: Require handoff->trtype == ctrlr->trtype? */
mtx_lock(&ctrlr->lock);
if (ctrlr->shutdown) {
mtx_unlock(&ctrlr->lock);
sx_sunlock(&np->lock);
nvmft_printf(ctrlr,
"attempt to create I/O queue %u on disabled controller from %.*s\n",
qid, (int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_invalid_parameters(qp, cmd, true,
offsetof(struct nvmf_fabric_connect_data, cntlid));
nvmft_qpair_destroy(qp);
return (EINVAL);
}
if (ctrlr->num_io_queues == 0) {
mtx_unlock(&ctrlr->lock);
sx_sunlock(&np->lock);
nvmft_printf(ctrlr,
"attempt to create I/O queue %u without enabled queues from %.*s\n",
qid, (int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
NVME_SC_COMMAND_SEQUENCE_ERROR);
nvmft_qpair_destroy(qp);
return (EINVAL);
}
if (cmd->qid > ctrlr->num_io_queues) {
mtx_unlock(&ctrlr->lock);
sx_sunlock(&np->lock);
nvmft_printf(ctrlr,
"attempt to create invalid I/O queue %u from %.*s\n", qid,
(int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_invalid_parameters(qp, cmd, false,
offsetof(struct nvmf_fabric_connect_cmd, qid));
nvmft_qpair_destroy(qp);
return (EINVAL);
}
if (ctrlr->io_qpairs[qid - 1].qp != NULL) {
mtx_unlock(&ctrlr->lock);
sx_sunlock(&np->lock);
nvmft_printf(ctrlr,
"attempt to re-create I/O queue %u from %.*s\n", qid,
(int)sizeof(data->hostnqn), data->hostnqn);
nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC,
NVME_SC_COMMAND_SEQUENCE_ERROR);
nvmft_qpair_destroy(qp);
return (EINVAL);
}
ctrlr->io_qpairs[qid - 1].qp = qp;
mtx_unlock(&ctrlr->lock);
nvmft_finish_accept(qp, cmd, ctrlr);
sx_sunlock(&np->lock);
return (0);
}
static void
nvmft_controller_shutdown(void *arg, int pending)
{
struct nvmft_controller *ctrlr = arg;
MPASS(pending == 1);
/*
* Shutdown all I/O queues to terminate pending datamoves and
* stop receiving new commands.
*/
mtx_lock(&ctrlr->lock);
for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
if (ctrlr->io_qpairs[i].qp != NULL) {
ctrlr->io_qpairs[i].shutdown = true;
mtx_unlock(&ctrlr->lock);
nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp);
mtx_lock(&ctrlr->lock);
}
}
mtx_unlock(&ctrlr->lock);
/* Terminate active CTL commands. */
nvmft_terminate_commands(ctrlr);
/* Wait for all pending CTL commands to complete. */
mtx_lock(&ctrlr->lock);
while (ctrlr->pending_commands != 0)
mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh",
hz / 100);
mtx_unlock(&ctrlr->lock);
/* Delete all of the I/O queues. */
for (u_int i = 0; i < ctrlr->num_io_queues; i++) {
if (ctrlr->io_qpairs[i].qp != NULL)
nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp);
}
free(ctrlr->io_qpairs, M_NVMFT);
ctrlr->io_qpairs = NULL;
mtx_lock(&ctrlr->lock);
ctrlr->num_io_queues = 0;
/* Mark shutdown complete. */
if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) {
ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
}
if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) {
ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY);
ctrlr->shutdown = false;
}
mtx_unlock(&ctrlr->lock);
/*
* If the admin queue was closed while shutting down or a
* fatal controller error has occurred, terminate the
* association immediately, otherwise wait up to 2 minutes
* (NVMe-over-Fabrics 1.1 4.6).
*/
if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0)
nvmft_controller_terminate(ctrlr, 0);
else
taskqueue_enqueue_timeout(taskqueue_thread,
&ctrlr->terminate_task, hz * 60 * 2);
}
static void
nvmft_controller_terminate(void *arg, int pending)
{
struct nvmft_controller *ctrlr = arg;
struct nvmft_port *np;
bool wakeup_np;
/* If the controller has been re-enabled, nothing to do. */
mtx_lock(&ctrlr->lock);
if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) {
mtx_unlock(&ctrlr->lock);
if (ctrlr->ka_sbt != 0)
callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0,
C_HARDCLOCK);
return;
}
/* Disable updates to CC while destroying admin qpair. */
ctrlr->shutdown = true;
mtx_unlock(&ctrlr->lock);
nvmft_qpair_destroy(ctrlr->admin);
/* Remove association (CNTLID). */
np = ctrlr->np;
sx_xlock(&np->lock);
TAILQ_REMOVE(&np->controllers, ctrlr, link);
free_unr(np->ids, ctrlr->cntlid);
wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers));
sx_xunlock(&np->lock);
if (wakeup_np)
wakeup(np);
callout_drain(&ctrlr->ka_timer);
nvmft_printf(ctrlr, "association terminated\n");
nvmft_controller_free(ctrlr);
nvmft_port_rele(np);
}
void
nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp,
int error)
{
/*
* If a queue pair is closed, that isn't an error per se.
* That just means additional commands cannot be received on
* that queue pair.
*
* If the admin queue pair is closed while idle or while
* shutting down, terminate the association immediately.
*
* If an I/O queue pair is closed, just ignore it.
*/
if (error == 0) {
if (qp != ctrlr->admin)
return;
mtx_lock(&ctrlr->lock);
if (ctrlr->shutdown) {
ctrlr->admin_closed = true;
mtx_unlock(&ctrlr->lock);
return;
}
if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) {
MPASS(ctrlr->num_io_queues == 0);
mtx_unlock(&ctrlr->lock);
/*
* Ok to drop lock here since ctrlr->cc can't
* change if the admin queue pair has closed.
* This also means no new queues can be handed
* off, etc. Note that since there are no I/O
* queues, only the admin queue needs to be
* destroyed, so it is safe to skip
* nvmft_controller_shutdown and just schedule
* nvmft_controller_terminate. Note that we
* cannot call nvmft_controller_terminate from
* here directly as this is called from the
* transport layer and freeing the admin qpair
* might deadlock waiting for the current
* thread to exit.
*/
if (taskqueue_cancel_timeout(taskqueue_thread,
&ctrlr->terminate_task, NULL) == 0)
taskqueue_enqueue_timeout(taskqueue_thread,
&ctrlr->terminate_task, 0);
return;
}
/*
* Treat closing of the admin queue pair while enabled
* as a transport error. Note that the admin queue
* pair has been closed.
*/
ctrlr->admin_closed = true;
} else
mtx_lock(&ctrlr->lock);
/* Ignore transport errors if we are already shutting down. */
if (ctrlr->shutdown) {
mtx_unlock(&ctrlr->lock);
return;
}
ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1);
ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
ctrlr->shutdown = true;
mtx_unlock(&ctrlr->lock);
callout_stop(&ctrlr->ka_timer);
taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
}
/* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. */
static struct mbuf *
m_getml(size_t len, int how)
{
struct mbuf *m, *n;
m = m_getm2(NULL, len, how, MT_DATA, 0);
if (m == NULL)
return (NULL);
for (n = m; len > 0; n = n->m_next) {
n->m_len = M_SIZE(n);
if (n->m_len >= len) {
n->m_len = len;
MPASS(n->m_next == NULL);
}
len -= n->m_len;
}
return (m);
}
static void
m_zero(struct mbuf *m, u_int offset, u_int len)
{
u_int todo;
if (len == 0)
return;
while (m->m_len <= offset) {
offset -= m->m_len;
m = m->m_next;
}
todo = m->m_len - offset;
if (todo > len)
todo = len;
memset(mtodo(m, offset), 0, todo);
m = m->m_next;
len -= todo;
while (len > 0) {
todo = m->m_len;
if (todo > len)
todo = len;
memset(mtod(m, void *), 0, todo);
m = m->m_next;
len -= todo;
}
}
static void
handle_get_log_page(struct nvmft_controller *ctrlr,
struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
struct mbuf *m;
uint64_t offset;
uint32_t numd;
size_t len, todo;
u_int status;
uint8_t lid;
bool rae;
lid = le32toh(cmd->cdw10) & 0xff;
rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0;
numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;
if (offset % 3 != 0) {
status = NVME_SC_INVALID_FIELD;
goto done;
}
len = (numd + 1) * 4;
switch (lid) {
case NVME_LOG_ERROR:
todo = 0;
m = m_getml(len, M_WAITOK);
if (todo != len)
m_zero(m, todo, len - todo);
status = nvmf_send_controller_data(nc, 0, m, len);
MPASS(status != NVMF_MORE);
break;
case NVME_LOG_HEALTH_INFORMATION:
{
struct nvme_health_information_page hip;
if (offset >= sizeof(hip)) {
status = NVME_SC_INVALID_FIELD;
goto done;
}
todo = sizeof(hip) - offset;
if (todo > len)
todo = len;
mtx_lock(&ctrlr->lock);
hip = ctrlr->hip;
hip.controller_busy_time[0] =
sbintime_getsec(ctrlr->busy_total) / 60;
hip.power_on_hours[0] =
sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600;
mtx_unlock(&ctrlr->lock);
m = m_getml(len, M_WAITOK);
m_copyback(m, 0, todo, (char *)&hip + offset);
if (todo != len)
m_zero(m, todo, len - todo);
status = nvmf_send_controller_data(nc, 0, m, len);
MPASS(status != NVMF_MORE);
break;
}
case NVME_LOG_FIRMWARE_SLOT:
if (offset >= sizeof(ctrlr->np->fp)) {
status = NVME_SC_INVALID_FIELD;
goto done;
}
todo = sizeof(ctrlr->np->fp) - offset;
if (todo > len)
todo = len;
m = m_getml(len, M_WAITOK);
m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset);
if (todo != len)
m_zero(m, todo, len - todo);
status = nvmf_send_controller_data(nc, 0, m, len);
MPASS(status != NVMF_MORE);
break;
case NVME_LOG_CHANGED_NAMESPACE:
if (offset >= sizeof(*ctrlr->changed_ns)) {
status = NVME_SC_INVALID_FIELD;
goto done;
}
todo = sizeof(*ctrlr->changed_ns) - offset;
if (todo > len)
todo = len;
m = m_getml(len, M_WAITOK);
mtx_lock(&ctrlr->lock);
m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset);
if (offset == 0 && len == sizeof(*ctrlr->changed_ns))
memset(ctrlr->changed_ns, 0,
sizeof(*ctrlr->changed_ns));
if (!rae)
ctrlr->changed_ns_reported = false;
mtx_unlock(&ctrlr->lock);
if (todo != len)
m_zero(m, todo, len - todo);
status = nvmf_send_controller_data(nc, 0, m, len);
MPASS(status != NVMF_MORE);
break;
default:
nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n",
lid);
status = NVME_SC_INVALID_FIELD;
break;
}
done:
if (status == NVMF_SUCCESS_SENT)
nvmft_command_completed(ctrlr->admin, nc);
else
nvmft_send_generic_error(ctrlr->admin, nc, status);
nvmf_free_capsule(nc);
}
static void
m_free_nslist(struct mbuf *m)
{
free(m->m_ext.ext_arg1, M_NVMFT);
}
static void
handle_identify_command(struct nvmft_controller *ctrlr,
struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
struct mbuf *m;
size_t data_len;
u_int status;
uint8_t cns;
cns = le32toh(cmd->cdw10) & 0xFF;
data_len = nvmf_capsule_data_len(nc);
if (data_len != sizeof(ctrlr->cdata)) {
nvmft_printf(ctrlr,
"Invalid length %zu for IDENTIFY with CNS %#x\n", data_len,
cns);
nvmft_send_generic_error(ctrlr->admin, nc,
NVME_SC_INVALID_OPCODE);
nvmf_free_capsule(nc);
return;
}
switch (cns) {
case 0: /* Namespace data. */
case 3: /* Namespace Identification Descriptor list. */
nvmft_dispatch_command(ctrlr->admin, nc, true);
return;
case 1:
/* Controller data. */
m = m_getml(sizeof(ctrlr->cdata), M_WAITOK);
m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata);
status = nvmf_send_controller_data(nc, 0, m,
sizeof(ctrlr->cdata));
MPASS(status != NVMF_MORE);
break;
case 2:
{
/* Active namespace list. */
struct nvme_ns_list *nslist;
uint32_t nsid;
nsid = le32toh(cmd->nsid);
if (nsid >= 0xfffffffe) {
status = NVME_SC_INVALID_FIELD;
break;
}
nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO);
nvmft_populate_active_nslist(ctrlr->np, nsid, nslist);
m = m_get(M_WAITOK, MT_DATA);
m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist,
nslist, NULL, 0, EXT_CTL);
m->m_len = sizeof(*nslist);
status = nvmf_send_controller_data(nc, 0, m, m->m_len);
MPASS(status != NVMF_MORE);
break;
}
default:
nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns);
status = NVME_SC_INVALID_FIELD;
break;
}
if (status == NVMF_SUCCESS_SENT)
nvmft_command_completed(ctrlr->admin, nc);
else
nvmft_send_generic_error(ctrlr->admin, nc, status);
nvmf_free_capsule(nc);
}
static void
handle_set_features(struct nvmft_controller *ctrlr,
struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
struct nvme_completion cqe;
uint8_t fid;
fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
switch (fid) {
case NVME_FEAT_NUMBER_OF_QUEUES:
{
uint32_t num_queues;
struct nvmft_io_qpair *io_qpairs;
num_queues = le32toh(cmd->cdw11) & 0xffff;
/* 5.12.1.7: 65535 is invalid. */
if (num_queues == 65535)
goto error;
/* Fabrics requires the same number of SQs and CQs. */
if (le32toh(cmd->cdw11) >> 16 != num_queues)
goto error;
/* Convert to 1's based */
num_queues++;
io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs),
M_NVMFT, M_WAITOK | M_ZERO);
mtx_lock(&ctrlr->lock);
if (ctrlr->num_io_queues != 0) {
mtx_unlock(&ctrlr->lock);
free(io_qpairs, M_NVMFT);
nvmft_send_generic_error(ctrlr->admin, nc,
NVME_SC_COMMAND_SEQUENCE_ERROR);
nvmf_free_capsule(nc);
return;
}
ctrlr->num_io_queues = num_queues;
ctrlr->io_qpairs = io_qpairs;
mtx_unlock(&ctrlr->lock);
nvmft_init_cqe(&cqe, nc, 0);
cqe.cdw0 = cmd->cdw11;
nvmft_send_response(ctrlr->admin, &cqe);
nvmf_free_capsule(nc);
return;
}
case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
{
uint32_t aer_mask;
aer_mask = le32toh(cmd->cdw11);
/* Check for any reserved or unimplemented feature bits. */
if ((aer_mask & 0xffffc000) != 0)
goto error;
mtx_lock(&ctrlr->lock);
ctrlr->aer_mask = aer_mask;
mtx_unlock(&ctrlr->lock);
nvmft_send_success(ctrlr->admin, nc);
return;
}
default:
nvmft_printf(ctrlr,
"Unsupported feature ID %u for SET_FEATURES\n", fid);
goto error;
}
error:
nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
nvmf_free_capsule(nc);
}
static bool
update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown)
{
struct nvmft_port *np = ctrlr->np;
uint32_t changes;
*need_shutdown = false;
mtx_lock(&ctrlr->lock);
/* Don't allow any changes while shutting down. */
if (ctrlr->shutdown) {
mtx_unlock(&ctrlr->lock);
return (false);
}
if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) {
mtx_unlock(&ctrlr->lock);
return (false);
}
changes = ctrlr->cc ^ new_cc;
ctrlr->cc = new_cc;
/* Handle shutdown requests. */
if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING);
ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN);
ctrlr->shutdown = true;
*need_shutdown = true;
nvmft_printf(ctrlr, "shutdown requested\n");
}
if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
/* Controller reset. */
nvmft_printf(ctrlr, "reset requested\n");
ctrlr->shutdown = true;
*need_shutdown = true;
} else
ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
}
mtx_unlock(&ctrlr->lock);
return (true);
}
static void
handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
const struct nvmf_fabric_prop_get_cmd *pget)
{
struct nvmf_fabric_prop_get_rsp rsp;
nvmft_init_cqe(&rsp, nc, 0);
switch (le32toh(pget->ofst)) {
case NVMF_PROP_CAP:
if (pget->attrib.size != NVMF_PROP_SIZE_8)
goto error;
rsp.value.u64 = htole64(ctrlr->np->cap);
break;
case NVMF_PROP_VS:
if (pget->attrib.size != NVMF_PROP_SIZE_4)
goto error;
rsp.value.u32.low = ctrlr->cdata.ver;
break;
case NVMF_PROP_CC:
if (pget->attrib.size != NVMF_PROP_SIZE_4)
goto error;
rsp.value.u32.low = htole32(ctrlr->cc);
break;
case NVMF_PROP_CSTS:
if (pget->attrib.size != NVMF_PROP_SIZE_4)
goto error;
rsp.value.u32.low = htole32(ctrlr->csts);
break;
default:
goto error;
}
nvmft_send_response(ctrlr->admin, &rsp);
return;
error:
nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}
static void
handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc,
const struct nvmf_fabric_prop_set_cmd *pset)
{
bool need_shutdown;
need_shutdown = false;
switch (le32toh(pset->ofst)) {
case NVMF_PROP_CC:
if (pset->attrib.size != NVMF_PROP_SIZE_4)
goto error;
if (!update_cc(ctrlr, le32toh(pset->value.u32.low),
&need_shutdown))
goto error;
break;
default:
goto error;
}
nvmft_send_success(ctrlr->admin, nc);
if (need_shutdown) {
callout_stop(&ctrlr->ka_timer);
taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task);
}
return;
error:
nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD);
}
static void
handle_admin_fabrics_command(struct nvmft_controller *ctrlr,
struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
{
switch (fc->fctype) {
case NVMF_FABRIC_COMMAND_PROPERTY_GET:
handle_property_get(ctrlr, nc,
(const struct nvmf_fabric_prop_get_cmd *)fc);
break;
case NVMF_FABRIC_COMMAND_PROPERTY_SET:
handle_property_set(ctrlr, nc,
(const struct nvmf_fabric_prop_set_cmd *)fc);
break;
case NVMF_FABRIC_COMMAND_CONNECT:
nvmft_printf(ctrlr,
"CONNECT command on connected admin queue\n");
nvmft_send_generic_error(ctrlr->admin, nc,
NVME_SC_COMMAND_SEQUENCE_ERROR);
break;
case NVMF_FABRIC_COMMAND_DISCONNECT:
nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n");
nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC,
NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
break;
default:
nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n",
fc->fctype);
nvmft_send_generic_error(ctrlr->admin, nc,
NVME_SC_INVALID_OPCODE);
break;
}
nvmf_free_capsule(nc);
}
void
nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
struct nvmf_capsule *nc)
{
const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
/* Only permit Fabrics commands while a controller is disabled. */
if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 &&
cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
nvmft_printf(ctrlr,
"Unsupported admin opcode %#x whiled disabled\n", cmd->opc);
nvmft_send_generic_error(ctrlr->admin, nc,
NVME_SC_COMMAND_SEQUENCE_ERROR);
nvmf_free_capsule(nc);
return;
}
atomic_store_int(&ctrlr->ka_active_traffic, 1);
switch (cmd->opc) {
case NVME_OPC_GET_LOG_PAGE:
handle_get_log_page(ctrlr, nc, cmd);
break;
case NVME_OPC_IDENTIFY:
handle_identify_command(ctrlr, nc, cmd);
break;
case NVME_OPC_SET_FEATURES:
handle_set_features(ctrlr, nc, cmd);
break;
case NVME_OPC_ASYNC_EVENT_REQUEST:
mtx_lock(&ctrlr->lock);
if (ctrlr->aer_pending == NVMFT_NUM_AER) {
mtx_unlock(&ctrlr->lock);
nvmft_send_error(ctrlr->admin, nc,
NVME_SCT_COMMAND_SPECIFIC,
NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
} else {
/* NB: Store the CID without byte-swapping. */
ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid;
ctrlr->aer_pending++;
ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER;
mtx_unlock(&ctrlr->lock);
}
nvmf_free_capsule(nc);
break;
case NVME_OPC_KEEP_ALIVE:
nvmft_send_success(ctrlr->admin, nc);
nvmf_free_capsule(nc);
break;
case NVME_OPC_FABRICS_COMMANDS:
handle_admin_fabrics_command(ctrlr, nc,
(const struct nvmf_fabric_cmd *)cmd);
break;
default:
nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc);
nvmft_send_generic_error(ctrlr->admin, nc,
NVME_SC_INVALID_OPCODE);
nvmf_free_capsule(nc);
break;
}
}
void
nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
struct nvmf_capsule *nc)
{
struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
atomic_store_int(&ctrlr->ka_active_traffic, 1);
switch (cmd->opc) {
case NVME_OPC_FLUSH:
if (cmd->nsid == htole32(0xffffffff)) {
nvmft_send_generic_error(qp, nc,
NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
nvmf_free_capsule(nc);
break;
}
/* FALLTHROUGH */
case NVME_OPC_WRITE:
case NVME_OPC_READ:
case NVME_OPC_WRITE_UNCORRECTABLE:
case NVME_OPC_COMPARE:
case NVME_OPC_WRITE_ZEROES:
case NVME_OPC_DATASET_MANAGEMENT:
case NVME_OPC_VERIFY:
nvmft_dispatch_command(qp, nc, false);
break;
default:
nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc);
nvmft_send_generic_error(qp, nc,
NVME_SC_INVALID_OPCODE);
nvmf_free_capsule(nc);
break;
}
}
static void
nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask,
u_int type, uint8_t info, uint8_t log_page_id)
{
struct nvme_completion cpl;
MPASS(type <= 7);
/* Drop events that are not enabled. */
mtx_lock(&ctrlr->lock);
if ((ctrlr->aer_mask & aer_mask) == 0) {
mtx_unlock(&ctrlr->lock);
return;
}
/*
* If there is no pending AER command, drop it.
* XXX: Should we queue these?
*/
if (ctrlr->aer_pending == 0) {
mtx_unlock(&ctrlr->lock);
nvmft_printf(ctrlr,
"dropping AER type %u, info %#x, page %#x\n",
type, info, log_page_id);
return;
}
memset(&cpl, 0, sizeof(cpl));
cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx];
ctrlr->aer_pending--;
ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER;
mtx_unlock(&ctrlr->lock);
cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) |
NVMEF(NVME_ASYNC_EVENT_INFO, info) |
NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id));
nvmft_send_response(ctrlr->admin, &cpl);
}
void
nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id)
{
struct nvme_ns_list *nslist;
uint32_t new_nsid, nsid;
u_int i;
new_nsid = lun_id + 1;
mtx_lock(&ctrlr->lock);
nslist = ctrlr->changed_ns;
/* If the first entry is 0xffffffff, the list is already full. */
if (nslist->ns[0] != 0xffffffff) {
/* Find the insertion point for this namespace ID. */
for (i = 0; i < nitems(nslist->ns); i++) {
nsid = le32toh(nslist->ns[i]);
if (nsid == new_nsid) {
/* Already reported, nothing to do. */
mtx_unlock(&ctrlr->lock);
return;
}
if (nsid == 0 || nsid > new_nsid)
break;
}
if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) {
/* List is full. */
memset(ctrlr->changed_ns, 0,
sizeof(*ctrlr->changed_ns));
ctrlr->changed_ns->ns[0] = 0xffffffff;
} else if (nslist->ns[i] == htole32(0)) {
/*
* Optimize case where this ID is appended to
* the end.
*/
nslist->ns[i] = htole32(new_nsid);
} else {
memmove(&nslist->ns[i + 1], &nslist->ns[i],
(nitems(nslist->ns) - i - 1) *
sizeof(nslist->ns[0]));
nslist->ns[i] = htole32(new_nsid);
}
}
if (ctrlr->changed_ns_reported) {
mtx_unlock(&ctrlr->lock);
return;
}
ctrlr->changed_ns_reported = true;
mtx_unlock(&ctrlr->lock);
nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0,
NVME_LOG_CHANGED_NAMESPACE);
}