Discussion: Xen virtual network (Netfront) driver
Mike Belopuhov
2016-01-06 15:37:36 UTC
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.

OK?

diff --git sys/arch/amd64/conf/GENERIC sys/arch/amd64/conf/GENERIC
index fca4459..77e07cc 100644
--- sys/arch/amd64/conf/GENERIC
+++ sys/arch/amd64/conf/GENERIC
@@ -67,10 +67,11 @@ mpbios0 at bios0
ipmi0 at mainbus? disable # IPMI

vmt0 at pvbus? # VMware Tools

#xen0 at pvbus? # Xen HVM domU
+#xnf* at xen? # Xen Netfront

option PCIVERBOSE
option USBVERBOSE

pchb* at pci? # PCI-Host bridges
diff --git sys/dev/pv/files.pv sys/dev/pv/files.pv
index d0e3b8c..e1272b2 100644
--- sys/dev/pv/files.pv
+++ sys/dev/pv/files.pv
@@ -16,5 +16,9 @@ file dev/pv/vmt.c vmt needs-flag
# Xen
device xen {}
attach xen at pvbus
file dev/pv/xen.c xen needs-flag
file dev/pv/xenstore.c xen
+
+device xnf: ether, ifnet, ifmedia
+attach xnf at xen
+file dev/pv/if_xnf.c xnf
diff --git sys/dev/pv/if_xnf.c sys/dev/pv/if_xnf.c
new file mode 100644
index 0000000..7f8b08e
--- /dev/null
+++ sys/dev/pv/if_xnf.c
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2015 Mike Belopuhov
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "bpfilter.h"
+#include "vlan.h"
+#include "xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/atomic.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/device.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/queue.h>
+#include <sys/timeout.h>
+#include <sys/pool.h>
+
+#include <machine/bus.h>
+
+#include <dev/pv/xenreg.h>
+#include <dev/pv/xenvar.h>
+
+#include <net/if.h>
+#include <net/if_media.h>
+
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+
+#if NBPFILTER > 0
+#include <net/bpf.h>
+#endif
+
+
+/*
+ * Rx ring
+ */
+
+struct xnf_rx_req {
+ uint16_t rxq_id;
+ uint16_t rxq_pad;
+ uint32_t rxq_ref;
+} __packed;
+
+struct xnf_rx_rsp {
+ uint16_t rxp_id;
+ uint16_t rxp_offset;
+ uint16_t rxp_flags;
+#define XNF_RXF_CSUM 0x0001
+#define XNF_RXF_BLANK 0x0002
+#define XNF_RXF_CHUNK 0x0004
+#define XNF_RXF_EXTRA 0x0008
+ int16_t rxp_status;
+} __packed;
+
+union xnf_rx_desc {
+ struct xnf_rx_req rxd_req;
+ struct xnf_rx_rsp rxd_rsp;
+} __packed;
+
+#define XNF_RX_DESC 256
+#define XNF_MCLEN PAGE_SIZE
+#define XNF_RX_MIN 32
+
+struct xnf_rx_ring {
+ uint32_t rxr_prod;
+ uint32_t rxr_req_evt;
+ uint32_t rxr_cons;
+ uint32_t rxr_rsp_evt;
+ uint32_t rxr_reserved[12];
+ union xnf_rx_desc rxr_desc[XNF_RX_DESC];
+} __packed;
+
+
+/*
+ * Tx ring
+ */
+
+struct xnf_tx_req {
+ uint32_t txq_ref;
+ uint16_t txq_offset;
+ uint16_t txq_flags;
+#define XNF_TXF_CSUM 0x0001
+#define XNF_TXF_VALID 0x0002
+#define XNF_TXF_CHUNK 0x0004
+#define XNF_TXF_ETXRA 0x0008
+ uint16_t txq_id;
+ uint16_t txq_size;
+} __packed;
+
+struct xnf_tx_rsp {
+ uint16_t txp_id;
+ int16_t txp_status;
+} __packed;
+
+union xnf_tx_desc {
+ struct xnf_tx_req txd_req;
+ struct xnf_tx_rsp txd_rsp;
+} __packed;
+
+#define XNF_TX_DESC 256
+#define XNF_TX_FRAG 8 /* down from 18 */
+
+struct xnf_tx_ring {
+ uint32_t txr_prod;
+ uint32_t txr_req_evt;
+ uint32_t txr_cons;
+ uint32_t txr_rsp_evt;
+ uint32_t txr_reserved[12];
+ union xnf_tx_desc txr_desc[XNF_TX_DESC];
+} __packed;
+
+
+/* Management frame, "extra info" in Xen parlance */
+struct xnf_mgmt {
+ uint8_t mg_type;
+#define XNF_MGMT_MCAST_ADD 2
+#define XNF_MGMT_MCAST_DEL 3
+ uint8_t mg_flags;
+ union {
+ uint8_t mgu_mcaddr[ETHER_ADDR_LEN];
+ uint16_t mgu_pad[3];
+ } u;
+#define mg_mcaddr u.mgu_mcaddr
+} __packed;
+
+
+struct xnf_softc {
+ struct device sc_dev;
+ struct xen_attach_args sc_xa;
+ struct xen_softc *sc_xen;
+ bus_dma_tag_t sc_dmat;
+
+ struct arpcom sc_ac;
+ struct ifmedia sc_media;
+
+ xen_intr_handle_t sc_xih;
+
+ /* Rx ring */
+ struct xnf_rx_ring *sc_rx_ring;
+ int sc_rx_cons;
+ bus_dmamap_t sc_rx_rmap; /* map for the ring */
+ bus_dma_segment_t sc_rx_seg;
+ uint32_t sc_rx_ref; /* grant table ref */
+ struct mbuf *sc_rx_buf[XNF_RX_DESC];
+ bus_dmamap_t sc_rx_dmap[XNF_RX_DESC]; /* maps for packets */
+ struct mbuf *sc_rx_cbuf[2]; /* chain handling */
+ struct if_rxring sc_rx_slots;
+ struct timeout sc_rx_fill;
+
+ /* Tx ring */
+ struct xnf_tx_ring *sc_tx_ring;
+ int sc_tx_cons;
+ bus_dmamap_t sc_tx_rmap; /* map for the ring */
+ bus_dma_segment_t sc_tx_seg;
+ uint32_t sc_tx_ref; /* grant table ref */
+ struct mbuf *sc_tx_buf[XNF_TX_DESC];
+ bus_dmamap_t sc_tx_dmap[XNF_TX_DESC]; /* maps for packets */
+};
+
+int xnf_match(struct device *, void *, void *);
+void xnf_attach(struct device *, struct device *, void *);
+int xnf_lladdr(struct xnf_softc *);
+int xnf_ioctl(struct ifnet *, u_long, caddr_t);
+int xnf_media_change(struct ifnet *);
+void xnf_media_status(struct ifnet *, struct ifmediareq *);
+int xnf_iff(struct xnf_softc *);
+void xnf_init(struct xnf_softc *);
+void xnf_stop(struct xnf_softc *);
+void xnf_start(struct ifnet *);
+int xnf_encap(struct xnf_softc *, struct mbuf *, uint32_t *);
+void xnf_intr(void *);
+int xnf_txeof(struct xnf_softc *);
+int xnf_rxeof(struct xnf_softc *);
+void xnf_rx_ring_fill(void *);
+int xnf_rx_ring_create(struct xnf_softc *);
+void xnf_rx_ring_drain(struct xnf_softc *);
+void xnf_rx_ring_destroy(struct xnf_softc *);
+int xnf_tx_ring_create(struct xnf_softc *);
+void xnf_tx_ring_drain(struct xnf_softc *);
+void xnf_tx_ring_destroy(struct xnf_softc *);
+int xnf_init_backend(struct xnf_softc *);
+int xnf_stop_backend(struct xnf_softc *);
+
+struct cfdriver xnf_cd = {
+ NULL, "xnf", DV_IFNET
+};
+
+const struct cfattach xnf_ca = {
+ sizeof(struct xnf_softc), xnf_match, xnf_attach
+};
+
+int
+xnf_match(struct device *parent, void *match, void *aux)
+{
+ struct xen_attach_args *xa = aux;
+ char type[64];
+
+ if (strcmp("vif", xa->xa_name))
+ return (0);
+
+ if (xs_getprop(xa, "type", type, sizeof(type)) == 0 &&
+ ((strcmp("vif", type) == 0) || (strcmp("front", type) == 0)))
+ return (1);
+
+ return (0);
+}
+
+void
+xnf_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct xen_attach_args *xa = aux;
+ struct xnf_softc *sc = (struct xnf_softc *)self;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ sc->sc_xa = *xa;
+ sc->sc_xen = xa->xa_parent;
+ sc->sc_dmat = xa->xa_dmat;
+
+ strlcpy(ifp->if_xname, sc->sc_dev.dv_xname, IFNAMSIZ);
+
+ if (xnf_lladdr(sc)) {
+ printf(": failed to obtain MAC address\n");
+ return;
+ }
+
+ if (xen_intr_establish(0, &sc->sc_xih, xnf_intr, sc, ifp->if_xname)) {
+ printf("%s: failed to establish an interrupt\n", ifp->if_xname);
+ return;
+ }
+
+ printf(": event channel %u, address %s\n", sc->sc_xih,
+ ether_sprintf(sc->sc_ac.ac_enaddr));
+
+ if (xnf_rx_ring_create(sc)) {
+ xen_intr_disestablish(sc->sc_xih);
+ return;
+ }
+ if (xnf_tx_ring_create(sc)) {
+ xen_intr_disestablish(sc->sc_xih);
+ xnf_rx_ring_destroy(sc);
+ return;
+ }
+ if (xnf_init_backend(sc)) {
+ xen_intr_disestablish(sc->sc_xih);
+ xnf_rx_ring_destroy(sc);
+ xnf_tx_ring_destroy(sc);
+ return;
+ }
+
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_xflags = IFXF_MPSAFE;
+ ifp->if_ioctl = xnf_ioctl;
+ ifp->if_start = xnf_start;
+ ifp->if_softc = sc;
+
+ ifp->if_capabilities = IFCAP_VLAN_MTU;
+
+ IFQ_SET_MAXLEN(&ifp->if_snd, XNF_TX_DESC - 1);
+ IFQ_SET_READY(&ifp->if_snd);
+
+ ifmedia_init(&sc->sc_media, IFM_IMASK, xnf_media_change,
+ xnf_media_status);
+ ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_MANUAL, 0, NULL);
+ ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_MANUAL);
+
+ if_attach(ifp);
+ ether_ifattach(ifp);
+
+ timeout_set(&sc->sc_rx_fill, xnf_rx_ring_fill, sc);
+}
+
+static int
+nibble(int ch)
+{
+ if (ch >= '0' && ch <= '9')
+ return (ch - '0');
+ if (ch >= 'A' && ch <= 'F')
+ return (10 + ch - 'A');
+ if (ch >= 'a' && ch <= 'f')
+ return (10 + ch - 'a');
+ return (-1);
+}
+
+int
+xnf_lladdr(struct xnf_softc *sc)
+{
+ char enaddr[ETHER_ADDR_LEN];
+ char mac[32];
+ int i, j, lo, hi;
+
+ if (xs_getprop(&sc->sc_xa, "mac", mac, sizeof(mac)))
+ return (-1);
+
+ for (i = 0, j = 0; j < ETHER_ADDR_LEN; i += 3) {
+ if ((hi = nibble(mac[i])) == -1 ||
+ (lo = nibble(mac[i+1])) == -1)
+ return (-1);
+ enaddr[j++] = hi << 4 | lo;
+ }
+
+ memcpy(sc->sc_ac.ac_enaddr, enaddr, ETHER_ADDR_LEN);
+ return (0);
+}
+
+int
+xnf_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
+{
+ struct xnf_softc *sc = ifp->if_softc;
+ struct ifreq *ifr = (struct ifreq *)data;
+ int s, error = 0;
+
+ s = splnet();
+
+ switch (command) {
+ case SIOCSIFADDR:
+ ifp->if_flags |= IFF_UP;
+ if (!(ifp->if_flags & IFF_RUNNING))
+ xnf_init(sc);
+ break;
+ case SIOCSIFFLAGS:
+ if (ifp->if_flags & IFF_UP) {
+ if (ifp->if_flags & IFF_RUNNING)
+ error = ENETRESET;
+ else
+ xnf_init(sc);
+ } else {
+ if (ifp->if_flags & IFF_RUNNING)
+ xnf_stop(sc);
+ }
+ break;
+ case SIOCGIFMEDIA:
+ case SIOCSIFMEDIA:
+ error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, command);
+ break;
+ case SIOCGIFRXR:
+ error = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data,
+ NULL, XNF_MCLEN, &sc->sc_rx_slots);
+ break;
+ default:
+ error = ether_ioctl(ifp, &sc->sc_ac, command, data);
+ break;
+ }
+
+ if (error == ENETRESET) {
+ if (ifp->if_flags & IFF_RUNNING)
+ xnf_iff(sc);
+ error = 0;
+ }
+
+ splx(s);
+
+ return (error);
+}
+
+int
+xnf_media_change(struct ifnet *ifp)
+{
+ return (0);
+}
+
+void
+xnf_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
+ ifmr->ifm_active = IFM_ETHER | IFM_MANUAL;
+}
+
+int
+xnf_iff(struct xnf_softc *sc)
+{
+ return (0);
+}
+
+void
+xnf_init(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ xnf_stop(sc);
+
+ xnf_iff(sc);
+
+ if (xen_intr_unmask(sc->sc_xih)) {
+ printf("%s: failed to enable interrupts\n", ifp->if_xname);
+ xnf_stop(sc);
+ return;
+ }
+
+ ifp->if_flags |= IFF_RUNNING;
+ ifq_clr_oactive(&ifp->if_snd);
+}
+
+void
+xnf_stop(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ ifp->if_flags &= ~IFF_RUNNING;
+
+ xen_intr_mask(sc->sc_xih);
+
+ timeout_del(&sc->sc_rx_fill);
+
+ ifq_barrier(&ifp->if_snd);
+ intr_barrier(&sc->sc_xih);
+
+ ifq_clr_oactive(&ifp->if_snd);
+
+ if (sc->sc_tx_ring)
+ xnf_tx_ring_drain(sc);
+ if (sc->sc_rx_ring)
+ xnf_rx_ring_drain(sc);
+}
+
+void
+xnf_start(struct ifnet *ifp)
+{
+ struct xnf_softc *sc = ifp->if_softc;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ struct mbuf *m;
+ int error, pkts = 0;
+ uint32_t prod;
+
+ if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(&ifp->if_snd))
+ return;
+
+ prod = txr->txr_prod;
+ membar_consumer();
+
+ for (;;) {
+ m = ifq_deq_begin(&ifp->if_snd);
+ if (m == NULL)
+ break;
+
+ error = xnf_encap(sc, m, &prod);
+ if (error == ENOENT) {
+ /* transient */
+ ifq_deq_rollback(&ifp->if_snd, m);
+ ifq_set_oactive(&ifp->if_snd);
+ break;
+ } else if (error) {
+ /* the chain is too large */
+ ifq_deq_commit(&ifp->if_snd, m);
+ m_freem(m);
+ continue;
+ }
+ ifq_deq_commit(&ifp->if_snd, m);
+
+#if NBPFILTER > 0
+ if (ifp->if_bpf)
+ bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT);
+#endif
+ pkts++;
+ }
+ if (pkts > 0) {
+ txr->txr_prod = prod;
+ xen_intr_signal(sc->sc_xih);
+ }
+}
+
+int
+xnf_encap(struct xnf_softc *sc, struct mbuf *m, uint32_t *prod)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ union xnf_tx_desc *txd;
+ bus_dmamap_t dmap;
+ int error, i, n = 0;
+
+ if (((txr->txr_cons - *prod - 1) & (XNF_TX_DESC - 1)) < XNF_TX_FRAG) {
+ error = ENOENT;
+ goto errout;
+ }
+
+ i = *prod & (XNF_TX_DESC - 1);
+ dmap = sc->sc_tx_dmap[i];
+
+ error = bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
+ BUS_DMA_NOWAIT);
+ if (error == EFBIG) {
+ if (m_defrag(m, M_DONTWAIT) ||
+ bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
+ BUS_DMA_NOWAIT))
+ goto errout;
+ } else if (error)
+ goto errout;
+
+ for (n = 0; n < dmap->dm_nsegs; n++, (*prod)++) {
+ i = *prod & (XNF_TX_DESC - 1);
+ if (sc->sc_tx_buf[i])
+ panic("%s: save vs spell: %d\n", ifp->if_xname, i);
+ txd = &txr->txr_desc[i];
+ if (n == 0) {
+ sc->sc_tx_buf[i] = m;
+ if (0 && m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
+ txd->txd_req.txq_flags = XNF_TXF_CSUM |
+ XNF_TXF_VALID;
+ txd->txd_req.txq_size = m->m_pkthdr.len;
+ } else
+ txd->txd_req.txq_size = dmap->dm_segs[n].ds_len;
+ if (n != dmap->dm_nsegs - 1)
+ txd->txd_req.txq_flags |= XNF_TXF_CHUNK;
+ txd->txd_req.txq_ref = dmap->dm_segs[n].ds_addr;
+ txd->txd_req.txq_offset = dmap->dm_segs[n].ds_offset;
+ }
+
+ ifp->if_opackets++;
+ return (0);
+
+ errout:
+ ifp->if_oerrors++;
+ return (error);
+}
+
+void
+xnf_intr(void *arg)
+{
+ struct xnf_softc *sc = arg;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ if (ifp->if_flags & IFF_RUNNING) {
+ xnf_rxeof(sc);
+ xnf_txeof(sc);
+ }
+}
+
+int
+xnf_txeof(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ union xnf_tx_desc *txd;
+ struct mbuf *m;
+ bus_dmamap_t dmap;
+ volatile uint32_t r;
+ uint32_t cons;
+ int i, id, pkts = 0;
+
+ do {
+ for (cons = sc->sc_tx_cons; cons != txr->txr_cons; cons++) {
+ membar_consumer();
+ i = cons & (XNF_TX_DESC - 1);
+ txd = &txr->txr_desc[i];
+ id = txd->txd_rsp.txp_id;
+ memset(txd, 0, sizeof(*txd));
+ txd->txd_req.txq_id = id;
+ membar_producer();
+ if (sc->sc_tx_buf[i]) {
+ dmap = sc->sc_tx_dmap[i];
+ bus_dmamap_unload(sc->sc_dmat, dmap);
+ m = sc->sc_tx_buf[i];
+ sc->sc_tx_buf[i] = NULL;
+ m_freem(m);
+ }
+ pkts++;
+ }
+
+ if (pkts > 0) {
+ sc->sc_tx_cons = cons;
+ membar_producer();
+ txr->txr_rsp_evt = cons + 1;
+ pkts = 0;
+ }
+
+ r = txr->txr_cons - sc->sc_tx_cons;
+ membar_consumer();
+ } while (r > 0);
+
+ if (ifq_is_oactive(&ifp->if_snd))
+ ifq_restart(&ifp->if_snd);
+
+ return (0);
+}
+
+int
+xnf_rxeof(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_rx_ring *rxr = sc->sc_rx_ring;
+ union xnf_rx_desc *rxd;
+ struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+ struct mbuf *fmp = sc->sc_rx_cbuf[0];
+ struct mbuf *lmp = sc->sc_rx_cbuf[1];
+ struct mbuf *m;
+ bus_dmamap_t dmap;
+ volatile uint32_t r;
+ uint32_t cons;
+ int i, id, flags, len, offset, pkts = 0;
+
+ do {
+ for (cons = sc->sc_rx_cons; cons != rxr->rxr_cons; cons++) {
+ membar_consumer();
+ i = cons & (XNF_RX_DESC - 1);
+ rxd = &rxr->rxr_desc[i];
+ dmap = sc->sc_rx_dmap[i];
+
+ len = rxd->rxd_rsp.rxp_status;
+ flags = rxd->rxd_rsp.rxp_flags;
+ offset = rxd->rxd_rsp.rxp_offset;
+ id = rxd->rxd_rsp.rxp_id;
+ memset(rxd, 0, sizeof(*rxd));
+ rxd->rxd_req.rxq_id = id;
+ membar_producer();
+
+ bus_dmamap_unload(sc->sc_dmat, dmap);
+
+ m = sc->sc_rx_buf[i];
+ KASSERT(m != NULL);
+ sc->sc_rx_buf[i] = NULL;
+
+ if (flags & XNF_RXF_EXTRA)
+ printf("%s: management data present\n",
+ ifp->if_xname);
+
+ if (flags & XNF_RXF_CSUM)
+ m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK;
+
+ if_rxr_put(&sc->sc_rx_slots, 1);
+ pkts++;
+
+ if (len < 0 || (len + offset > PAGE_SIZE)) {
+ ifp->if_ierrors++;
+ m_freem(m);
+ continue;
+ }
+
+ m->m_len = len;
+ m->m_data += offset;
+
+ if (fmp == NULL) {
+ m->m_pkthdr.len = len;
+ fmp = m;
+ } else {
+ m->m_flags &= ~M_PKTHDR;
+ lmp->m_next = m;
+ fmp->m_pkthdr.len += m->m_len;
+ }
+ lmp = m;
+
+ if (flags & XNF_RXF_CHUNK) {
+ sc->sc_rx_cbuf[0] = fmp;
+ sc->sc_rx_cbuf[1] = lmp;
+ continue;
+ }
+
+ m = fmp;
+
+ ml_enqueue(&ml, m);
+ sc->sc_rx_cbuf[0] = sc->sc_rx_cbuf[1] =
+ fmp = lmp = NULL;
+ }
+
+ if (pkts > 0) {
+ sc->sc_rx_cons = cons;
+ membar_producer();
+ rxr->rxr_rsp_evt = cons + 1;
+ pkts = 0;
+ }
+
+ r = rxr->rxr_cons - sc->sc_rx_cons;
+ membar_consumer();
+ } while (r > 0);
+
+ if (!ml_empty(&ml)) {
+ if_input(ifp, &ml);
+
+ xnf_rx_ring_fill(sc);
+ }
+
+ return (0);
+}
+
+void
+xnf_rx_ring_fill(void *arg)
+{
+ struct xnf_softc *sc = arg;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_rx_ring *rxr = sc->sc_rx_ring;
+ bus_dmamap_t dmap;
+ struct mbuf *m;
+ uint32_t cons, prod;
+ static int timer = 0;
+ int i, n;
+
+ cons = rxr->rxr_cons;
+ prod = rxr->rxr_prod;
+
+ n = if_rxr_get(&sc->sc_rx_slots, XNF_RX_DESC);
+
+ /* Less than XNF_RX_MIN slots available? */
+ if (n == 0 && prod - cons < XNF_RX_MIN) {
+ if (ifp->if_flags & IFF_RUNNING)
+ timeout_add(&sc->sc_rx_fill, 1 << timer);
+ if (timer < 10)
+ timer++;
+ return;
+ }
+
+ for (; n > 0; prod++, n--) {
+ i = prod & (XNF_RX_DESC - 1);
+ if (sc->sc_rx_buf[i])
+ break;
+ m = MCLGETI(NULL, M_DONTWAIT, NULL, XNF_MCLEN);
+ if (m == NULL)
+ break;
+ m->m_len = m->m_pkthdr.len = XNF_MCLEN;
+ dmap = sc->sc_rx_dmap[i];
+ if (bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_READ |
+ BUS_DMA_NOWAIT)) {
+ m_freem(m);
+ break;
+ }
+ sc->sc_rx_buf[i] = m;
+ rxr->rxr_desc[i].rxd_req.rxq_ref = dmap->dm_segs[0].ds_addr;
+ }
+
+ if (n > 0)
+ if_rxr_put(&sc->sc_rx_slots, n);
+
+ membar_producer();
+ rxr->rxr_prod = prod;
+
+ xen_intr_signal(sc->sc_xih);
+}
+
+int
+xnf_rx_ring_create(struct xnf_softc *sc)
+{
+ int i, rsegs;
+
+ /* Allocate a page of memory for the ring */
+ if (bus_dmamem_alloc(sc->sc_dmat, PAGE_SIZE, PAGE_SIZE, 0,
+ &sc->sc_rx_seg, 1, &rsegs, BUS_DMA_ZERO | BUS_DMA_WAITOK)) {
+ printf("%s: failed to allocate memory for the rx ring\n",
+ sc->sc_dev.dv_xname);
+ return (-1);
+ }
+ /* Map in the allocated memory into the ring structure */
+ if (bus_dmamem_map(sc->sc_dmat, &sc->sc_rx_seg, 1, PAGE_SIZE,
+ (caddr_t *)(&sc->sc_rx_ring), BUS_DMA_WAITOK)) {
+ printf("%s: failed to map memory for the rx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Create a map to load the ring memory into */
+ if (bus_dmamap_create(sc->sc_dmat, PAGE_SIZE, 1, PAGE_SIZE, 0,
+ BUS_DMA_WAITOK, &sc->sc_rx_rmap)) {
+ printf("%s: failed to create a memory map for the rx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Load the ring into the ring map to extract the PA */
+ if (bus_dmamap_load(sc->sc_dmat, sc->sc_rx_rmap, sc->sc_rx_ring,
+ PAGE_SIZE, NULL, BUS_DMA_WAITOK)) {
+ printf("%s: failed to load the rx ring map\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ sc->sc_rx_ref = sc->sc_rx_rmap->dm_segs[0].ds_addr;
+
+ sc->sc_rx_ring->rxr_req_evt = sc->sc_rx_ring->rxr_rsp_evt = 1;
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (bus_dmamap_create(sc->sc_dmat, XNF_MCLEN, 1,
+ XNF_MCLEN, 0, BUS_DMA_WAITOK, &sc->sc_rx_dmap[i])) {
+ printf("%s: failed to create a memory map for the rx "
+ "slot %d/%d\n", sc->sc_dev.dv_xname, i,
+ XNF_RX_DESC);
+ goto errout;
+ }
+ sc->sc_rx_ring->rxr_desc[i].rxd_req.rxq_id = i;
+ }
+
+ if_rxr_init(&sc->sc_rx_slots, XNF_RX_MIN, XNF_RX_DESC);
+ xnf_rx_ring_fill(sc);
+
+ return (0);
+
+ errout:
+ xnf_rx_ring_destroy(sc);
+ return (-1);
+}
+
+void
+xnf_rx_ring_drain(struct xnf_softc *sc)
+{
+ struct xnf_rx_ring *rxr = sc->sc_rx_ring;
+
+ if (sc->sc_rx_cons != rxr->rxr_cons)
+ xnf_rxeof(sc);
+}
+
+void
+xnf_rx_ring_destroy(struct xnf_softc *sc)
+{
+ int i, slots = 0;
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (sc->sc_rx_buf[i] == NULL)
+ continue;
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_rx_dmap[i]);
+ m_freem(sc->sc_rx_buf[i]);
+ sc->sc_rx_buf[i] = NULL;
+ slots++;
+ }
+ printf("%s: unload done\n", __func__);
+ if_rxr_put(&sc->sc_rx_slots, slots);
+ printf("%s: rxr_put done\n", __func__);
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (sc->sc_rx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_rx_dmap[i]);
+ sc->sc_rx_dmap[i] = NULL;
+ }
+ printf("%s: desc map destroy done\n", __func__);
+ if (sc->sc_rx_rmap) {
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_rx_rmap);
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_rx_rmap);
+ }
+ printf("%s: ring map destroy done\n", __func__);
+ if (sc->sc_rx_ring) {
+ bus_dmamem_unmap(sc->sc_dmat, (caddr_t)sc->sc_rx_ring,
+ PAGE_SIZE);
+ bus_dmamem_free(sc->sc_dmat, &sc->sc_rx_seg, 1);
+ }
+ printf("%s: ring mem free done\n", __func__);
+ sc->sc_rx_ring = NULL;
+ sc->sc_rx_rmap = NULL;
+ sc->sc_rx_cons = 0;
+}
+
+int
+xnf_tx_ring_create(struct xnf_softc *sc)
+{
+ int i, rsegs;
+
+ /* Allocate a page of memory for the ring */
+ if (bus_dmamem_alloc(sc->sc_dmat, PAGE_SIZE, PAGE_SIZE, 0,
+ &sc->sc_tx_seg, 1, &rsegs, BUS_DMA_ZERO | BUS_DMA_WAITOK)) {
+ printf("%s: failed to allocate memory for the tx ring\n",
+ sc->sc_dev.dv_xname);
+ return (-1);
+ }
+ /* Map in the allocated memory into the ring structure */
+ if (bus_dmamem_map(sc->sc_dmat, &sc->sc_tx_seg, 1, PAGE_SIZE,
+ (caddr_t *)&sc->sc_tx_ring, BUS_DMA_WAITOK)) {
+ printf("%s: failed to map memory for the tx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Create a map to load the ring memory into */
+ if (bus_dmamap_create(sc->sc_dmat, PAGE_SIZE, 1, PAGE_SIZE, 0,
+ BUS_DMA_WAITOK, &sc->sc_tx_rmap)) {
+ printf("%s: failed to create a memory map for the tx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Load the ring into the ring map to extract the PA */
+ if (bus_dmamap_load(sc->sc_dmat, sc->sc_tx_rmap, sc->sc_tx_ring,
+ PAGE_SIZE, NULL, BUS_DMA_WAITOK)) {
+ printf("%s: failed to load the tx ring map\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ sc->sc_tx_ref = sc->sc_tx_rmap->dm_segs[0].ds_addr;
+
+ sc->sc_tx_ring->txr_req_evt = sc->sc_tx_ring->txr_rsp_evt = 1;
+
+ for (i = 0; i < XNF_TX_DESC; i++) {
+ if (bus_dmamap_create(sc->sc_dmat, XNF_MCLEN, XNF_TX_FRAG,
+ XNF_MCLEN, 0, BUS_DMA_WAITOK, &sc->sc_tx_dmap[i])) {
+ printf("%s: failed to create a memory map for the tx "
+ "slot %d/%d\n", sc->sc_dev.dv_xname, i,
+ XNF_TX_DESC);
+ goto errout;
+ }
+ sc->sc_tx_ring->txr_desc[i].txd_req.txq_id = i;
+ }
+
+ return (0);
+
+ errout:
+ xnf_tx_ring_destroy(sc);
+ return (-1);
+}
+
+void
+xnf_tx_ring_drain(struct xnf_softc *sc)
+{
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+
+ if (sc->sc_tx_cons != txr->txr_cons)
+ xnf_txeof(sc);
+}
+
+void
+xnf_tx_ring_destroy(struct xnf_softc *sc)
+{
+ int i;
+
+ for (i = 0; i < XNF_TX_DESC; i++) {
+ if (sc->sc_tx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_tx_dmap[i]);
+ if (sc->sc_tx_buf[i] == NULL)
+ continue;
+ m_freem(sc->sc_tx_buf[i]);
+ sc->sc_tx_buf[i] = NULL;
+ }
+ for (i = 0; i < XNF_TX_DESC; i++) {
+ if (sc->sc_tx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_tx_dmap[i]);
+ sc->sc_tx_dmap[i] = NULL;
+ }
+ if (sc->sc_tx_rmap) {
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_tx_rmap);
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_tx_rmap);
+ }
+ if (sc->sc_tx_ring) {
+ bus_dmamem_unmap(sc->sc_dmat, (caddr_t)sc->sc_tx_ring,
+ PAGE_SIZE);
+ bus_dmamem_free(sc->sc_dmat, &sc->sc_tx_seg, 1);
+ }
+ sc->sc_tx_ring = NULL;
+ sc->sc_tx_rmap = NULL;
+}
+
+int
+xnf_init_backend(struct xnf_softc *sc)
+{
+ const char *prop;
+ char val[32];
+
+ /* Plumb the Rx ring */
+ prop = "rx-ring-ref";
+ snprintf(val, sizeof(val), "%u", sc->sc_rx_ref);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Enable "copy" mode */
+ prop = "request-rx-copy";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Enable notify mode */
+ prop = "feature-rx-notify";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Request multicast filtering */
+ prop = "request-multicast-control";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Plumb the Tx ring */
+ prop = "tx-ring-ref";
+ snprintf(val, sizeof(val), "%u", sc->sc_tx_ref);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Enable transmit scatter-gather mode */
+ prop = "feature-sg";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Disable TCP/UDP checksum offload */
+ prop = "feature-csum-offload";
+ if (xs_setprop(&sc->sc_xa, prop, NULL, 0))
+ goto errout;
+ prop = "feature-no-csum-offload";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ prop = "feature-ipv6-csum-offload";
+ if (xs_setprop(&sc->sc_xa, prop, NULL, 0))
+ goto errout;
+ prop = "feature-no-ipv6-csum-offload";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Plumb the event channel port */
+ prop = "event-channel";
+ snprintf(val, sizeof(val), "%u", sc->sc_xih);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Connect the device */
+ prop = "state";
+ snprintf(val, sizeof(val), "%u", 4);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ return (0);
+
+ errout:
+ printf("%s: failed to set \"%s\" property to \"%s\"\n",
+ sc->sc_dev.dv_xname, prop, val);
+ return (-1);
+}
Mike Belopuhov
2016-01-06 16:08:43 UTC
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
OK?
Just noticed that a couple of debug printfs have sneaked in.
I'm not going to commit them.
Post by Mike Belopuhov
+void
+xnf_rx_ring_destroy(struct xnf_softc *sc)
+{
+ int i, slots = 0;
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (sc->sc_rx_buf[i] == NULL)
+ continue;
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_rx_dmap[i]);
+ m_freem(sc->sc_rx_buf[i]);
+ sc->sc_rx_buf[i] = NULL;
+ slots++;
+ }
+ printf("%s: unload done\n", __func__);
+ if_rxr_put(&sc->sc_rx_slots, slots);
+ printf("%s: rxr_put done\n", __func__);
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (sc->sc_rx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_rx_dmap[i]);
+ sc->sc_rx_dmap[i] = NULL;
+ }
+ printf("%s: desc map destroy done\n", __func__);
+ if (sc->sc_rx_rmap) {
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_rx_rmap);
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_rx_rmap);
+ }
+ printf("%s: ring map destroy done\n", __func__);
+ if (sc->sc_rx_ring) {
+ bus_dmamem_unmap(sc->sc_dmat, (caddr_t)sc->sc_rx_ring,
+ PAGE_SIZE);
+ bus_dmamem_free(sc->sc_dmat, &sc->sc_rx_seg, 1);
+ }
+ printf("%s: ring mem free done\n", __func__);
+ sc->sc_rx_ring = NULL;
+ sc->sc_rx_rmap = NULL;
+ sc->sc_rx_cons = 0;
+}
+
Stefan Fritsch
2016-01-06 16:58:16 UTC
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
I only had a quick glance at the code, but I have one comment about your
use of memory barriers. The membar_* macros are pure compiler barriers
when the OpenBSD kernel is compiled for UP. But since the host machine and
xen may use SMP even in this case, I suspect that you need hardware
memory barriers even if MULTIPROCESSOR is not defined. This does not seem
relevant for x86 because you don't use membar_sync, but it may become
relevant for arm, which is also supported by xen.

I had the same problem in virtio and introduced the virtio_membar_* macros
for this purpose. Maybe they should be renamed to a more generic name and
you should use them, too?
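
For illustration, a minimal sketch of the concern (hypothetical macro
definitions on a hypothetical weakly ordered CPU, not the actual
OpenBSD ones): a non-MULTIPROCESSOR build may reduce its barrier to a
pure compiler barrier, which does nothing to order the stores for a
backend running on another physical CPU, while a virtio_membar_*-style
primitive emits the fence unconditionally.

/* Sketch only: made-up names, made-up weakly ordered CPU. */
#ifdef MULTIPROCESSOR
#define membar_producer()     __asm volatile("dmb st" ::: "memory") /* store fence */
#else
#define membar_producer()     __asm volatile("" ::: "memory") /* compiler barrier */
#endif

/* A pv/virtio-style barrier stays a real fence even without MULTIPROCESSOR. */
#define pv_membar_producer()  __asm volatile("dmb st" ::: "memory")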
Post by Mike Belopuhov
+ if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(&ifp->if_snd))
+ return;
+
+ prod = txr->txr_prod;
+ membar_consumer();
+
+ for (;;) {
+ m = ifq_deq_begin(&ifp->if_snd);
+ if (m == NULL)
+ break;
+
+ error = xnf_encap(sc, m, &prod);
+ if (error == ENOENT) {
+ /* transient */
+ ifq_deq_rollback(&ifp->if_snd, m);
+ ifq_set_oactive(&ifp->if_snd);
+ break;
+ } else if (error) {
+ /* the chain is too large */
+ ifq_deq_commit(&ifp->if_snd, m);
+ m_freem(m);
+ continue;
+ }
+ ifq_deq_commit(&ifp->if_snd, m);
+
+#if NBPFILTER > 0
+ if (ifp->if_bpf)
+ bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT);
+#endif
+ pkts++;
+ }
+ if (pkts > 0) {
+ txr->txr_prod = prod;
+ xen_intr_signal(sc->sc_xih);
+ }
+}
+
+int
+xnf_encap(struct xnf_softc *sc, struct mbuf *m, uint32_t *prod)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ union xnf_tx_desc *txd;
+ bus_dmamap_t dmap;
+ int error, i, n = 0;
+
+ if (((txr->txr_cons - *prod - 1) & (XNF_TX_DESC - 1)) < XNF_TX_FRAG) {
+ error = ENOENT;
+ goto errout;
+ }
+
+ i = *prod & (XNF_TX_DESC - 1);
+ dmap = sc->sc_tx_dmap[i];
+
+ error = bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
+ BUS_DMA_NOWAIT);
+ if (error == EFBIG) {
+ if (m_defrag(m, M_DONTWAIT) ||
+ bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
+ BUS_DMA_NOWAIT))
+ goto errout;
+ } else if (error)
+ goto errout;
+
+ for (n = 0; n < dmap->dm_nsegs; n++, (*prod)++) {
+ i = *prod & (XNF_TX_DESC - 1);
+ if (sc->sc_tx_buf[i])
+ panic("%s: save vs spell: %d\n", ifp->if_xname, i);
+ txd = &txr->txr_desc[i];
+ if (n == 0) {
+ sc->sc_tx_buf[i] = m;
+ if (0 && m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
+ txd->txd_req.txq_flags = XNF_TXF_CSUM |
+ XNF_TXF_VALID;
+ txd->txd_req.txq_size = m->m_pkthdr.len;
+ } else
+ txd->txd_req.txq_size = dmap->dm_segs[n].ds_len;
+ if (n != dmap->dm_nsegs - 1)
+ txd->txd_req.txq_flags |= XNF_TXF_CHUNK;
+ txd->txd_req.txq_ref = dmap->dm_segs[n].ds_addr;
+ txd->txd_req.txq_offset = dmap->dm_segs[n].ds_offset;
+ }
+
+ ifp->if_opackets++;
+ return (0);
+
+ errout:
+ ifp->if_oerrors++;
+ return (error);
+}
+
+void
+xnf_intr(void *arg)
+{
+ struct xnf_softc *sc = arg;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ if (ifp->if_flags & IFF_RUNNING) {
+ xnf_rxeof(sc);
+ xnf_txeof(sc);
+ }
+}
+
+int
+xnf_txeof(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ union xnf_tx_desc *txd;
+ struct mbuf *m;
+ bus_dmamap_t dmap;
+ volatile uint32_t r;
+ uint32_t cons;
+ int i, id, pkts = 0;
+
+ do {
+ for (cons = sc->sc_tx_cons; cons != txr->txr_cons; cons++) {
+ membar_consumer();
+ i = cons & (XNF_TX_DESC - 1);
+ txd = &txr->txr_desc[i];
+ id = txd->txd_rsp.txp_id;
+ memset(txd, 0, sizeof(*txd));
+ txd->txd_req.txq_id = id;
+ membar_producer();
+ if (sc->sc_tx_buf[i]) {
+ dmap = sc->sc_tx_dmap[i];
+ bus_dmamap_unload(sc->sc_dmat, dmap);
+ m = sc->sc_tx_buf[i];
+ sc->sc_tx_buf[i] = NULL;
+ m_freem(m);
+ }
+ pkts++;
+ }
+
+ if (pkts > 0) {
+ sc->sc_tx_cons = cons;
+ membar_producer();
+ txr->txr_rsp_evt = cons + 1;
+ pkts = 0;
+ }
+
+ r = txr->txr_cons - sc->sc_tx_cons;
+ membar_consumer();
+ } while (r > 0);
+
+ if (ifq_is_oactive(&ifp->if_snd))
+ ifq_restart(&ifp->if_snd);
+
+ return (0);
+}
Mike Belopuhov
2016-01-07 11:02:23 UTC
Post by Stefan Fritsch
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
I only had a quick glance at the code, but I have one comment about your
use of memory barriers. The membar_* macros are pure compiler barriers
when the OpenBSD kernel is compiled for UP. But since the host machine and
xen may use SMP even in this case, I suspect that you need hardware
memory barriers even if MULTIPROCESSOR is not defined. This does not seem
relevant for x86 because you don't use membar_sync, but it may become
relevant for arm, which is also supported by xen.
membar_{producer,consumer} are defined on arm to perform store and
load memory barriers. Our arm code currently does not distinguish
between an MP case and non-MP case regarding the definition of these
macros, so I'm not entirely certain what you are trying to say.

However, I'm thankful that you brought this up, and I'll spend some time
figuring out if I need actual fences in my code. For instance, the
CAS loop in xen_grant_table_remove runs for more than 10000 iterations
in the normal case. I've changed the code to perform bus_dmamap_unload
after zeroing out the descriptor so that technically there won't be
any dangling grant table references, but I didn't remeasure the CAS
loop. Possibly due to caching and CPU migration on the host we lose
out and perhaps can get a boost in performance by putting an implicit
memory barrier.
Post by Stefan Fritsch
I had the same problem in virtio and introduced the virtio_membar_* macros
for this purpose. Maybe they should be renamed to a more generic name and
you should use them, too?
I'm not sure cause I don't think x86 needs any explicit membars, but
I'll do some tests and report on this.
Mark Kettenis
2016-01-07 12:17:45 UTC
Date: Thu, 7 Jan 2016 12:02:23 +0100
Post by Stefan Fritsch
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
I only had a quick glance at the code, but I have one comment about your
use of memory barriers. The membar_* macros are pure compiler barriers
when the OpenBSD kernel is compiled for UP. But since the host machine and
xen may use SMP even in this case, I suspect that you need hardware
memory barriers even if MULTIPROCESSOR is not defined. This does not seem
relevant for x86 because you don't use membar_sync, but it may become
relevant for arm, which is also supported by xen.
membar_{producer,consumer} are defined on arm to perform store and
load memory barriers. Our arm code currently does not distinguish
between an MP case and non-MP case regarding the definition of these
macros, so I'm not entirely certain what you are trying to say.
Not sure ARM is a good example to look at.

In principle I think that the membar_xxx() interfaces could be simple
compiler barriers on all our architectures, at least as long as the
CPU will observe its own stores in the same order as they were
emitted. But I think all sane CPU architectures make those
guarantees. At least for "normal" memory. However, we treat that as
an optimization. And we haven't done that for all our architectures.

The problem with virtualization is of course that even a non-MP kernel
is actually running in an MP environment. If data structures are
shared with the hypervisor or another domain running on a different
CPU, proper memory barriers must be used to guarantee the other side
sees our stores in the right order. The typical case would be
populating a descriptor with some sort of validity bit. There you
want to make sure the other side doesn't see the valid bit set until
all the other parts of the descriptor have been filled in and are
visible. In that case a simple compiler barrier may not be enough.
This is why the virtio_membar_xxx() primitives were introduced.
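
To make that concrete, a rough sketch of the valid-bit case (made-up
structure and field names; virtio_membar_producer() assumed to be a
real store fence here): the barrier has to sit between filling in the
descriptor and publishing it, so the other side can never observe the
valid bit without the rest of the fields.

struct desc {
        uint64_t d_addr;
        uint32_t d_len;
        uint32_t d_flags;               /* DESC_VALID is published last */
};
#define DESC_VALID 0x1

void
publish_desc(volatile struct desc *d, uint64_t addr, uint32_t len)
{
        d->d_addr = addr;
        d->d_len = len;
        virtio_membar_producer();       /* order the stores above... */
        d->d_flags = DESC_VALID;        /* ...before this one becomes visible */
}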

This is actually not all that different from handling DMA to real
hardware devices. There we must make sure that stores become visible
to the hardware device in the right order. That matters even on
non-MP kernels too and is handled by bus_dmamap_sync(9).

Since you have embraced bus_dma(9) for the xen stuff, it would make
sense to add a xen-specifc bus_dmamap_sync() implementation that
issues the appropriate memory barrier. I think it should be
virtio_membar_consumer() for BUS_DMASYNC_PREREAD and
virtio_membar_producer() for BUS_DMASYNC_POSTWRITE. But you'd better
double-check, because I always get confused!
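
Something along those lines, as a rough sketch (mirroring the
PREREAD/POSTWRITE mapping suggested above, which indeed deserves
double-checking; the virtio_membar_* names are assumed):

void
xen_bus_dmamap_sync(bus_dma_tag_t t, bus_dmamap_t map, bus_addr_t offset,
    bus_size_t size, int ops)
{
        if (ops & BUS_DMASYNC_PREREAD)
                virtio_membar_consumer();       /* as proposed above */
        if (ops & BUS_DMASYNC_POSTWRITE)
                virtio_membar_producer();       /* as proposed above */
}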

BTW, your xen bus_dma(9) implementation relies on the internals of the
MD bus_dma(9) implementation. Don't expect it to work on other
architectures. I'm not even sure I want to be held responsible if
changes in the MD code break it.
However I'm thankful for bringing this up and I'll spend some time
figuring out if I need actual fences in my code. for instance the
cas loop in xen_grant_table_remove runs for more than 10000 iterations
in the normal case. I've changed the code to perform bus_dma_unload
after zeroing the descriptor out so that there won't be technically
any dangling grant table references, but I didn't remeasure the cas
loop. Possibly due to caching and CPU migration on the host we lose
out and perhaps can get a boost in performance by putting an implicit
memory barrier.
Not sure what memory fences have to do with this. The hypervisor
should definitely issue any appropriate barriers as part of the
context switching.
Post by Stefan Fritsch
I had the same problem in virtio and introduced the virtio_membar_* macros
for this purpose. Maybe they should be renamed to a more generic name and
you should use them, too?
I'm not sure cause I don't think x86 needs any explicit membars, but
I'll do some test and report on this.
It's a grey area. The x86 memory model evolved over time and isn't
all that well specified. On top of that it seems the actual hardware
is a bit more strongly ordered than the specification. It is fairly
strongly ordered, which means that memory barriers can be omitted in
most cases that deal with "normal" memory. But our implementation
does issue memory barriers for membar_enter() and membar_sync(). I'm
not 100% certain it is correct.
Mike Belopuhov
2016-01-07 12:36:12 UTC
Post by Mark Kettenis
Date: Thu, 7 Jan 2016 12:02:23 +0100
Post by Stefan Fritsch
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
I only had a quick glance at the code, but I have one comment about your
use of memory barriers. The membar_* macros are pure compiler barriers
when the OpenBSD kernel is compiled for UP. But since the host machine and
xen may use SMP even in this case, I suspect that you need hardware
memory barriers even if MULTIPROCESSOR is not defined. This does not seem
relevant for x86 because you don't use membar_sync, but it may become
relevant for arm, which is also supported by xen.
membar_{producer,consumer} are defined on arm to perform store and
load memory barriers. Our arm code currently does not distinguish
between an MP case and non-MP case regarding the definition of these
macros, so I'm not entirely certain what you are trying to say.
Not sure ARM is a good example to look at.
The only architectures that Xen dom0 is implemented for are i386,
amd64 and arm, so there's no real need to look at anything else.
Post by Mark Kettenis
In principle I think that the membar_xxx() interfaces could be simple
compiler barriers on all our architectures, at least as long as the
CPU will observe its own stores in the same order as they were
emitted. But I think all sane CPU architectures make those
guarantees. At least for "normal" memory. However, we treat that as
an optimization. And we haven't done that for all our architectures.
The problem with virtualization is of course that even a non-MP kernel
is actually running in an MP environment. If data structures are
shared with the hypervisor or another domain running on a different
CPU, proper memory barriers must be used to guarantee the other side
sees our stores in the right order. The typical case would be
populating a descriptor with some sort of validity bit. There you
want to make sure the other side doesn't see the valid bit set until
all the other parts of the descriptor have been filled in and are
visible. In that case a simple compiler barrier may not be enough.
That's what I was referring to in my example below.
Post by Mark Kettenis
This is why the virtio_membar_xxx() primitives were introduced.
Any idea why store and load barriers weren't implemented separately?
Post by Mark Kettenis
This is actually not all that different from handling DMA to real
hardware devices. There we must make sure that stores become visible
to the hardware device in the right order. That matters even on
non-MP kernels too and is handled by bus_dmamap_sync(9).
Except that bus_dmamap_sync is not needed on amd64 and is in
fact empty.
Post by Mark Kettenis
Since you have embraced bus_dma(9) for the xen stuff, it would make
sense to add a xen-specifc bus_dmamap_sync() implementation that
issues the appropriate memory barrier. I think it should be
virtio_membar_consumer() for BUS_DMASYNC_PREREAD and
virtio_membar_producer() for BUS_DMASYNC_POSTWRITE. But you'd better
double-check, because I always get confused!
Will do.
Post by Mark Kettenis
BTW, your xen bus_dma(9) implementation relies on the internals of the
MD bus_dma(9) implementation. Don't expect it to work on other
architectures. I'm not even sure I want to be held responsible if
changes in the MD code break it.
If that's _ds_boundary you're referring to, it has been there for 15 years
so it's unlikely that it's going away. And in any case we can easily add
another member into this opaque data type so I don't think it's a big deal
at all.
Post by Mark Kettenis
However I'm thankful for bringing this up and I'll spend some time
figuring out if I need actual fences in my code. for instance the
cas loop in xen_grant_table_remove runs for more than 10000 iterations
in the normal case. I've changed the code to perform bus_dma_unload
after zeroing the descriptor out so that there won't be technically
any dangling grant table references, but I didn't remeasure the cas
loop. Possibly due to caching and CPU migration on the host we lose
out and perhaps can get a boost in performance by putting an implicit
memory barrier.
Not sure what memory fences have to do with this.
It's what you've described above. I possibly need to make sure that
the hypervisor sees one store before the other.
Post by Mark Kettenis
The hypervisor should definitely issue any appropriate barriers as part
of the context switching.
There's no context switching in between.
Post by Mark Kettenis
Post by Stefan Fritsch
I had the same problem in virtio and introduced the virtio_membar_* macros
for this purpose. Maybe they should be renamed to a more generic name and
you should use them, too?
I'm not sure cause I don't think x86 needs any explicit membars, but
I'll do some tests and report on this.
It's a grey area. The x86 memory model evolved over time and isn't
all that well specified. On top of that it seems the actual hardware
is a bit more strongly ordered than the specification. It is fairly
strongly ordered, which means that memory barriers can be omitted in
most cases that deal with "normal" memory. But our implementation
does issue memory barriers for membar_enter() and membar_sync(). I'm
not 100% certain it is correct.
Stefan Fritsch
2016-01-12 13:04:03 UTC
Post by Mike Belopuhov
Post by Mark Kettenis
Date: Thu, 7 Jan 2016 12:02:23 +0100
Post by Stefan Fritsch
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
I only had a quick glance at the code, but I have one comment about your
use of memory barriers. The membar_* macros are pure compiler barriers
when the OpenBSD kernel is compiled for UP. But since the host machine and
xen may use SMP even in this case, I suspect that you need hardware
memory barriers even if MULTIPROCESSOR is not defined. This does not seem
relevant for x86 because you don't use membar_sync, but it may become
relevant for arm, which is also supported by xen.
membar_{producer,consumer} are defined on arm to perform store and
load memory barriers. Our arm code currently does not distinguish
between an MP case and non-MP case regarding the definition of these
macros, so I'm not entirely certain what you are trying to say.
I didn't check arm's implementation but knew that it had non-empty
membar_{producer,consumer}. So, if it does not distinguish between an MP
case and non-MP case, then there is no problem there. But maybe you should
document somewhere which assumptions about the architecture you make, so
that they can be checked when adding a new architecture. I guess arm64
will come sooner or later and I don't know if it has exactly the same
memory model as 32bit arm.
Post by Mike Belopuhov
Post by Mark Kettenis
Not sure ARM is a good example to look at.
The only architectures that Xen dom0 is implemented for are i386,
amd64 and arm, so there's no real need to look at anything else.
Post by Mark Kettenis
In principle I think that the membar_xxx() interfaces could be simple
compiler barriers on all our architectures, at least as long as the
CPU will observe its own stores in the same order as they were
emitted. But I think all sane CPU architectures make those
guarantees. At least for "normal" memory. However, we treat that as
an optimization. And we haven't done that for all our architectures.
The problem with virtualization is of course that even a non-MP kernel
is actually running in an MP environment. If data structures are
shared with the hypervisor or another domain running on a different
CPU, proper memory barriers must be used to guarantee the other side
sees our stores in the right order. The typical case would be
populating a descriptor with some sort of validity bit. There you
want to make sure the other side doesn't see the valid bit set until
all the other parts of the descriptor have been filled in and are
visible. In that case a simple compiler barrier may not be enough.
Yes. With Intel it's the "Reads may be reordered with older writes to
different locations but not with older writes to the same location" bit
from the memory model that is causing problems. So you have to check if
xen hits this case. virtio does (and removing the memory barriers causes
observable hangs).
Post by Mike Belopuhov
That's what I was referring to in my example below.
Post by Mark Kettenis
This is why the virtio_membar_xxx() primitives were introduced.
Any idea why store and load barriers weren't implemented separately?
No idea. virtio_membar_xxx() was modeled after the existing membar_xxx().
But AIUI membar_consumer() plus membar_producer() is not equivalent to
membar_sync() (which also prevents read vs. write reordering).
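
A sketch of where that bites for a ring like the one above (field
names taken from the driver, function name made up,
virtio_membar_sync() assumed to be a full fence): after storing the
new producer index the frontend loads the event index the backend
asked to be notified at, and only a full barrier keeps that load from
being satisfied ahead of the store.

/* Decide whether the backend needs an event after producing requests. */
int
xnf_tx_needs_notify(struct xnf_tx_ring *txr, uint32_t oprod, uint32_t nprod)
{
        txr->txr_prod = nprod;          /* store: publish the new producer index */
        virtio_membar_sync();           /* full fence: order the load below after it */
        return ((uint32_t)(nprod - txr->txr_req_evt) <
            (uint32_t)(nprod - oprod));
}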
Reyk Floeter
2016-01-06 17:49:32 UTC
Post by Mike Belopuhov
There's still stuff to do, but it receives and transmits reliably
(at least on modern Xen) so I'd like to get it in. Man page will
follow.
OK?
I can see it works now and as mentioned in icb:
I just had the first contact with OpenBSD in an EC2 instance.
(once again, we need emoji in xterm to see the U+1F596)

Two bugs:
- It didn't work on m4.10xlarge (see cvs:~reyk/dmesg.m4.10xlarge).
- One time, xnf stopped while copying a large file to a remote machine.

I think it is good enough to go in and be tweaked in the tree.
Post by Mike Belopuhov
diff --git sys/arch/amd64/conf/GENERIC sys/arch/amd64/conf/GENERIC
index fca4459..77e07cc 100644
--- sys/arch/amd64/conf/GENERIC
+++ sys/arch/amd64/conf/GENERIC
@@ -67,10 +67,11 @@ mpbios0 at bios0
ipmi0 at mainbus? disable # IPMI
vmt0 at pvbus? # VMware Tools
#xen0 at pvbus? # Xen HVM domU
+#xnf* at xen? # Xen Netfront
option PCIVERBOSE
option USBVERBOSE
pchb* at pci? # PCI-Host bridges
diff --git sys/dev/pv/files.pv sys/dev/pv/files.pv
index d0e3b8c..e1272b2 100644
--- sys/dev/pv/files.pv
+++ sys/dev/pv/files.pv
@@ -16,5 +16,9 @@ file dev/pv/vmt.c vmt needs-flag
# Xen
device xen {}
attach xen at pvbus
file dev/pv/xen.c xen needs-flag
file dev/pv/xenstore.c xen
+
+device xnf: ether, ifnet, ifmedia
+attach xnf at xen
+file dev/pv/if_xnf.c xnf
diff --git sys/dev/pv/if_xnf.c sys/dev/pv/if_xnf.c
new file mode 100644
index 0000000..7f8b08e
--- /dev/null
+++ sys/dev/pv/if_xnf.c
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2015 Mike Belopuhov
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "bpfilter.h"
+#include "vlan.h"
+#include "xen.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/atomic.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/device.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/queue.h>
+#include <sys/timeout.h>
+#include <sys/pool.h>
+
+#include <machine/bus.h>
+
+#include <dev/pv/xenreg.h>
+#include <dev/pv/xenvar.h>
+
+#include <net/if.h>
+#include <net/if_media.h>
+
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+
+#if NBPFILTER > 0
+#include <net/bpf.h>
+#endif
+
+
+/*
+ * Rx ring
+ */
+
+struct xnf_rx_req {
+ uint16_t rxq_id;
+ uint16_t rxq_pad;
+ uint32_t rxq_ref;
+} __packed;
+
+struct xnf_rx_rsp {
+ uint16_t rxp_id;
+ uint16_t rxp_offset;
+ uint16_t rxp_flags;
+#define XNF_RXF_CSUM 0x0001
+#define XNF_RXF_BLANK 0x0002
+#define XNF_RXF_CHUNK 0x0004
+#define XNF_RXF_EXTRA 0x0008
+ int16_t rxp_status;
+} __packed;
+
+union xnf_rx_desc {
+ struct xnf_rx_req rxd_req;
+ struct xnf_rx_rsp rxd_rsp;
+} __packed;
+
+#define XNF_RX_DESC 256
+#define XNF_MCLEN PAGE_SIZE
+#define XNF_RX_MIN 32
+
+struct xnf_rx_ring {
+ uint32_t rxr_prod;
+ uint32_t rxr_req_evt;
+ uint32_t rxr_cons;
+ uint32_t rxr_rsp_evt;
+ uint32_t rxr_reserved[12];
+ union xnf_rx_desc rxr_desc[XNF_RX_DESC];
+} __packed;
+
+
+/*
+ * Tx ring
+ */
+
+struct xnf_tx_req {
+ uint32_t txq_ref;
+ uint16_t txq_offset;
+ uint16_t txq_flags;
+#define XNF_TXF_CSUM 0x0001
+#define XNF_TXF_VALID 0x0002
+#define XNF_TXF_CHUNK 0x0004
+#define XNF_TXF_ETXRA 0x0008
+ uint16_t txq_id;
+ uint16_t txq_size;
+} __packed;
+
+struct xnf_tx_rsp {
+ uint16_t txp_id;
+ int16_t txp_status;
+} __packed;
+
+union xnf_tx_desc {
+ struct xnf_tx_req txd_req;
+ struct xnf_tx_rsp txd_rsp;
+} __packed;
+
+#define XNF_TX_DESC 256
+#define XNF_TX_FRAG 8 /* down from 18 */
+
+struct xnf_tx_ring {
+ uint32_t txr_prod;
+ uint32_t txr_req_evt;
+ uint32_t txr_cons;
+ uint32_t txr_rsp_evt;
+ uint32_t txr_reserved[12];
+ union xnf_tx_desc txr_desc[XNF_TX_DESC];
+} __packed;
+
+
+/* Management frame, "extra info" in Xen parlance */
+struct xnf_mgmt {
+ uint8_t mg_type;
+#define XNF_MGMT_MCAST_ADD 2
+#define XNF_MGMT_MCAST_DEL 3
+ uint8_t mg_flags;
+ union {
+ uint8_t mgu_mcaddr[ETHER_ADDR_LEN];
+ uint16_t mgu_pad[3];
+ } u;
+#define mg_mcaddr u.mgu_mcaddr
+} __packed;
+
+
+struct xnf_softc {
+ struct device sc_dev;
+ struct xen_attach_args sc_xa;
+ struct xen_softc *sc_xen;
+ bus_dma_tag_t sc_dmat;
+
+ struct arpcom sc_ac;
+ struct ifmedia sc_media;
+
+ xen_intr_handle_t sc_xih;
+
+ /* Rx ring */
+ struct xnf_rx_ring *sc_rx_ring;
+ int sc_rx_cons;
+ bus_dmamap_t sc_rx_rmap; /* map for the ring */
+ bus_dma_segment_t sc_rx_seg;
+ uint32_t sc_rx_ref; /* grant table ref */
+ struct mbuf *sc_rx_buf[XNF_RX_DESC];
+ bus_dmamap_t sc_rx_dmap[XNF_RX_DESC]; /* maps for packets */
+ struct mbuf *sc_rx_cbuf[2]; /* chain handling */
+ struct if_rxring sc_rx_slots;
+ struct timeout sc_rx_fill;
+
+ /* Tx ring */
+ struct xnf_tx_ring *sc_tx_ring;
+ int sc_tx_cons;
+ bus_dmamap_t sc_tx_rmap; /* map for the ring */
+ bus_dma_segment_t sc_tx_seg;
+ uint32_t sc_tx_ref; /* grant table ref */
+ struct mbuf *sc_tx_buf[XNF_TX_DESC];
+ bus_dmamap_t sc_tx_dmap[XNF_TX_DESC]; /* maps for packets */
+};
+
+int xnf_match(struct device *, void *, void *);
+void xnf_attach(struct device *, struct device *, void *);
+int xnf_lladdr(struct xnf_softc *);
+int xnf_ioctl(struct ifnet *, u_long, caddr_t);
+int xnf_media_change(struct ifnet *);
+void xnf_media_status(struct ifnet *, struct ifmediareq *);
+int xnf_iff(struct xnf_softc *);
+void xnf_init(struct xnf_softc *);
+void xnf_stop(struct xnf_softc *);
+void xnf_start(struct ifnet *);
+int xnf_encap(struct xnf_softc *, struct mbuf *, uint32_t *);
+void xnf_intr(void *);
+int xnf_txeof(struct xnf_softc *);
+int xnf_rxeof(struct xnf_softc *);
+void xnf_rx_ring_fill(void *);
+int xnf_rx_ring_create(struct xnf_softc *);
+void xnf_rx_ring_drain(struct xnf_softc *);
+void xnf_rx_ring_destroy(struct xnf_softc *);
+int xnf_tx_ring_create(struct xnf_softc *);
+void xnf_tx_ring_drain(struct xnf_softc *);
+void xnf_tx_ring_destroy(struct xnf_softc *);
+int xnf_init_backend(struct xnf_softc *);
+int xnf_stop_backend(struct xnf_softc *);
+
+struct cfdriver xnf_cd = {
+ NULL, "xnf", DV_IFNET
+};
+
+const struct cfattach xnf_ca = {
+ sizeof(struct xnf_softc), xnf_match, xnf_attach
+};
+
+int
+xnf_match(struct device *parent, void *match, void *aux)
+{
+ struct xen_attach_args *xa = aux;
+ char type[64];
+
+ if (strcmp("vif", xa->xa_name))
+ return (0);
+
+ if (xs_getprop(xa, "type", type, sizeof(type)) == 0 &&
+ ((strcmp("vif", type) == 0) || (strcmp("front", type) == 0)))
+ return (1);
+
+ return (0);
+}
+
+void
+xnf_attach(struct device *parent, struct device *self, void *aux)
+{
+ struct xen_attach_args *xa = aux;
+ struct xnf_softc *sc = (struct xnf_softc *)self;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ sc->sc_xa = *xa;
+ sc->sc_xen = xa->xa_parent;
+ sc->sc_dmat = xa->xa_dmat;
+
+ strlcpy(ifp->if_xname, sc->sc_dev.dv_xname, IFNAMSIZ);
+
+ if (xnf_lladdr(sc)) {
+ printf(": failed to obtain MAC address\n");
+ return;
+ }
+
+ if (xen_intr_establish(0, &sc->sc_xih, xnf_intr, sc, ifp->if_xname)) {
+ printf("%s: failed to establish an interrupt\n", ifp->if_xname);
+ return;
+ }
+
+ printf(": event channel %u, address %s\n", sc->sc_xih,
+ ether_sprintf(sc->sc_ac.ac_enaddr));
+
+ if (xnf_rx_ring_create(sc)) {
+ xen_intr_disestablish(sc->sc_xih);
+ return;
+ }
+ if (xnf_tx_ring_create(sc)) {
+ xen_intr_disestablish(sc->sc_xih);
+ xnf_rx_ring_destroy(sc);
+ return;
+ }
+ if (xnf_init_backend(sc)) {
+ xen_intr_disestablish(sc->sc_xih);
+ xnf_rx_ring_destroy(sc);
+ xnf_tx_ring_destroy(sc);
+ return;
+ }
+
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_xflags = IFXF_MPSAFE;
+ ifp->if_ioctl = xnf_ioctl;
+ ifp->if_start = xnf_start;
+ ifp->if_softc = sc;
+
+ ifp->if_capabilities = IFCAP_VLAN_MTU;
+
+ IFQ_SET_MAXLEN(&ifp->if_snd, XNF_TX_DESC - 1);
+ IFQ_SET_READY(&ifp->if_snd);
+
+ ifmedia_init(&sc->sc_media, IFM_IMASK, xnf_media_change,
+ xnf_media_status);
+ ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_MANUAL, 0, NULL);
+ ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_MANUAL);
+
+ if_attach(ifp);
+ ether_ifattach(ifp);
+
+ timeout_set(&sc->sc_rx_fill, xnf_rx_ring_fill, sc);
+}
+
+static int
+nibble(int ch)
+{
+ if (ch >= '0' && ch <= '9')
+ return (ch - '0');
+ if (ch >= 'A' && ch <= 'F')
+ return (10 + ch - 'A');
+ if (ch >= 'a' && ch <= 'f')
+ return (10 + ch - 'a');
+ return (-1);
+}
+
+int
+xnf_lladdr(struct xnf_softc *sc)
+{
+ char enaddr[ETHER_ADDR_LEN];
+ char mac[32];
+ int i, j, lo, hi;
+
+ if (xs_getprop(&sc->sc_xa, "mac", mac, sizeof(mac)))
+ return (-1);
+
+ for (i = 0, j = 0; j < ETHER_ADDR_LEN; i += 3) {
+ if ((hi = nibble(mac[i])) == -1 ||
+ (lo = nibble(mac[i+1])) == -1)
+ return (-1);
+ enaddr[j++] = hi << 4 | lo;
+ }
+
+ memcpy(sc->sc_ac.ac_enaddr, enaddr, ETHER_ADDR_LEN);
+ return (0);
+}
+
+int
+xnf_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
+{
+ struct xnf_softc *sc = ifp->if_softc;
+ struct ifreq *ifr = (struct ifreq *)data;
+ int s, error = 0;
+
+ s = splnet();
+
+ switch (command) {
+ case SIOCSIFADDR:
+ ifp->if_flags |= IFF_UP;
+ if (!(ifp->if_flags & IFF_RUNNING))
+ xnf_init(sc);
+ break;
+ case SIOCSIFFLAGS:
+ if (ifp->if_flags & IFF_UP) {
+ if (ifp->if_flags & IFF_RUNNING)
+ error = ENETRESET;
+ else
+ xnf_init(sc);
+ } else {
+ if (ifp->if_flags & IFF_RUNNING)
+ xnf_stop(sc);
+ }
+ break;
+ case SIOCGIFMEDIA:
+ case SIOCSIFMEDIA:
+ error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, command);
+ break;
+ case SIOCGIFRXR:
+ error = if_rxr_ioctl((struct if_rxrinfo *)ifr->ifr_data,
+ NULL, XNF_MCLEN, &sc->sc_rx_slots);
+ break;
+ default:
+ error = ether_ioctl(ifp, &sc->sc_ac, command, data);
+ break;
+ }
+
+ if (error == ENETRESET) {
+ if (ifp->if_flags & IFF_RUNNING)
+ xnf_iff(sc);
+ error = 0;
+ }
+
+ splx(s);
+
+ return (error);
+}
+
+int
+xnf_media_change(struct ifnet *ifp)
+{
+ return (0);
+}
+
+void
+xnf_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ ifmr->ifm_status = IFM_ACTIVE | IFM_AVALID;
+ ifmr->ifm_active = IFM_ETHER | IFM_MANUAL;
+}
+
+int
+xnf_iff(struct xnf_softc *sc)
+{
+ return (0);
+}
+
+void
+xnf_init(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ xnf_stop(sc);
+
+ xnf_iff(sc);
+
+ if (xen_intr_unmask(sc->sc_xih)) {
+ printf("%s: failed to enable interrupts\n", ifp->if_xname);
+ xnf_stop(sc);
+ return;
+ }
+
+ ifp->if_flags |= IFF_RUNNING;
+ ifq_clr_oactive(&ifp->if_snd);
+}
+
+void
+xnf_stop(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ ifp->if_flags &= ~IFF_RUNNING;
+
+ xen_intr_mask(sc->sc_xih);
+
+ timeout_del(&sc->sc_rx_fill);
+
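+	/* Wait for the start routine and the interrupt handler to finish */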
+ ifq_barrier(&ifp->if_snd);
+ intr_barrier(&sc->sc_xih);
+
+ ifq_clr_oactive(&ifp->if_snd);
+
+ if (sc->sc_tx_ring)
+ xnf_tx_ring_drain(sc);
+ if (sc->sc_rx_ring)
+ xnf_rx_ring_drain(sc);
+}
+
+void
+xnf_start(struct ifnet *ifp)
+{
+ struct xnf_softc *sc = ifp->if_softc;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ struct mbuf *m;
+ int error, pkts = 0;
+ uint32_t prod;
+
+ if (!(ifp->if_flags & IFF_RUNNING) || ifq_is_oactive(&ifp->if_snd))
+ return;
+
+ prod = txr->txr_prod;
+ membar_consumer();
+
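+	/*
+	 * Dequeue packets onto the Tx ring; the shared producer index is
+	 * updated only once, after the loop, so the backend is signalled
+	 * at most once per call.
+	 */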
+ for (;;) {
+ m = ifq_deq_begin(&ifp->if_snd);
+ if (m == NULL)
+ break;
+
+ error = xnf_encap(sc, m, &prod);
+ if (error == ENOENT) {
+ /* transient */
+ ifq_deq_rollback(&ifp->if_snd, m);
+ ifq_set_oactive(&ifp->if_snd);
+ break;
+ } else if (error) {
+ /* the chain is too large */
+ ifq_deq_commit(&ifp->if_snd, m);
+ m_freem(m);
+ continue;
+ }
+ ifq_deq_commit(&ifp->if_snd, m);
+
+#if NBPFILTER > 0
+ if (ifp->if_bpf)
+ bpf_mtap_ether(ifp->if_bpf, m, BPF_DIRECTION_OUT);
+#endif
+ pkts++;
+ }
+ if (pkts > 0) {
+ txr->txr_prod = prod;
+ xen_intr_signal(sc->sc_xih);
+ }
+}
+
+int
+xnf_encap(struct xnf_softc *sc, struct mbuf *m, uint32_t *prod)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ union xnf_tx_desc *txd;
+ bus_dmamap_t dmap;
+ int error, i, n = 0;
+
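+	/*
+	 * Bail out if the ring cannot fit a maximally fragmented chain;
+	 * ENOENT is treated as a transient error by xnf_start().
+	 */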
+ if (((txr->txr_cons - *prod - 1) & (XNF_TX_DESC - 1)) < XNF_TX_FRAG) {
+ error = ENOENT;
+ goto errout;
+ }
+
+ i = *prod & (XNF_TX_DESC - 1);
+ dmap = sc->sc_tx_dmap[i];
+
+ error = bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
+ BUS_DMA_NOWAIT);
+ if (error == EFBIG) {
+ if (m_defrag(m, M_DONTWAIT) ||
+ bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_WRITE |
+ BUS_DMA_NOWAIT))
+ goto errout;
+ } else if (error)
+ goto errout;
+
+ for (n = 0; n < dmap->dm_nsegs; n++, (*prod)++) {
+ i = *prod & (XNF_TX_DESC - 1);
+ if (sc->sc_tx_buf[i])
+ panic("%s: save vs spell: %d\n", ifp->if_xname, i);
+ txd = &txr->txr_desc[i];
+ if (n == 0) {
+ sc->sc_tx_buf[i] = m;
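+			/* Checksum offload is disabled for now */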
+ if (0 && m->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
+ txd->txd_req.txq_flags = XNF_TXF_CSUM |
+ XNF_TXF_VALID;
+ txd->txd_req.txq_size = m->m_pkthdr.len;
+ } else
+ txd->txd_req.txq_size = dmap->dm_segs[n].ds_len;
+ if (n != dmap->dm_nsegs - 1)
+ txd->txd_req.txq_flags |= XNF_TXF_CHUNK;
+ txd->txd_req.txq_ref = dmap->dm_segs[n].ds_addr;
+ txd->txd_req.txq_offset = dmap->dm_segs[n].ds_offset;
+ }
+
+ ifp->if_opackets++;
+ return (0);
+
+ errout:
+	ifp->if_oerrors++;
+	return (error);
+}
+
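+/*
+ * Event channel interrupt: process both receive and transmit completions.
+ */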
+void
+xnf_intr(void *arg)
+{
+ struct xnf_softc *sc = arg;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+
+ if (ifp->if_flags & IFF_RUNNING) {
+ xnf_rxeof(sc);
+ xnf_txeof(sc);
+ }
+}
+
+int
+xnf_txeof(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+ union xnf_tx_desc *txd;
+ struct mbuf *m;
+ bus_dmamap_t dmap;
+ volatile uint32_t r;
+ uint32_t cons;
+ int i, id, pkts = 0;
+
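+	/*
+	 * Walk the completed responses, turn the descriptors back into
+	 * requests and free the transmitted mbufs.
+	 */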
+ do {
+ for (cons = sc->sc_tx_cons; cons != txr->txr_cons; cons++) {
+ membar_consumer();
+ i = cons & (XNF_TX_DESC - 1);
+ txd = &txr->txr_desc[i];
+ id = txd->txd_rsp.txp_id;
+ memset(txd, 0, sizeof(*txd));
+ txd->txd_req.txq_id = id;
+ membar_producer();
+ if (sc->sc_tx_buf[i]) {
+ dmap = sc->sc_tx_dmap[i];
+ bus_dmamap_unload(sc->sc_dmat, dmap);
+ m = sc->sc_tx_buf[i];
+ sc->sc_tx_buf[i] = NULL;
+ m_freem(m);
+ }
+ pkts++;
+ }
+
+ if (pkts > 0) {
+ sc->sc_tx_cons = cons;
+ membar_producer();
+ txr->txr_rsp_evt = cons + 1;
+ pkts = 0;
+ }
+
+ r = txr->txr_cons - sc->sc_tx_cons;
+ membar_consumer();
+ } while (r > 0);
+
+ if (ifq_is_oactive(&ifp->if_snd))
+ ifq_restart(&ifp->if_snd);
+
+ return (0);
+}
+
+int
+xnf_rxeof(struct xnf_softc *sc)
+{
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_rx_ring *rxr = sc->sc_rx_ring;
+ union xnf_rx_desc *rxd;
+ struct mbuf_list ml = MBUF_LIST_INITIALIZER();
+ struct mbuf *fmp = sc->sc_rx_cbuf[0];
+ struct mbuf *lmp = sc->sc_rx_cbuf[1];
+ struct mbuf *m;
+ bus_dmamap_t dmap;
+ volatile uint32_t r;
+ uint32_t cons;
+ int i, id, flags, len, offset, pkts = 0;
+
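+	/*
+	 * Collect receive responses; fragments of a chunked packet are
+	 * linked together until a descriptor without XNF_RXF_CHUNK
+	 * completes the packet and it is passed to the stack.
+	 */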
+ do {
+ for (cons = sc->sc_rx_cons; cons != rxr->rxr_cons; cons++) {
+ membar_consumer();
+ i = cons & (XNF_RX_DESC - 1);
+ rxd = &rxr->rxr_desc[i];
+ dmap = sc->sc_rx_dmap[i];
+
+ len = rxd->rxd_rsp.rxp_status;
+ flags = rxd->rxd_rsp.rxp_flags;
+ offset = rxd->rxd_rsp.rxp_offset;
+ id = rxd->rxd_rsp.rxp_id;
+ memset(rxd, 0, sizeof(*rxd));
+ rxd->rxd_req.rxq_id = id;
+ membar_producer();
+
+ bus_dmamap_unload(sc->sc_dmat, dmap);
+
+ m = sc->sc_rx_buf[i];
+ KASSERT(m != NULL);
+ sc->sc_rx_buf[i] = NULL;
+
+ if (flags & XNF_RXF_EXTRA)
+ printf("%s: management data present\n",
+ ifp->if_xname);
+
+ if (flags & XNF_RXF_CSUM)
+ m->m_pkthdr.csum_flags = M_IPV4_CSUM_IN_OK;
+
+ if_rxr_put(&sc->sc_rx_slots, 1);
+ pkts++;
+
+ if (len < 0 || (len + offset > PAGE_SIZE)) {
+ ifp->if_ierrors++;
+ m_freem(m);
+ continue;
+ }
+
+ m->m_len = len;
+ m->m_data += offset;
+
+ if (fmp == NULL) {
+ m->m_pkthdr.len = len;
+ fmp = m;
+ } else {
+ m->m_flags &= ~M_PKTHDR;
+ lmp->m_next = m;
+ fmp->m_pkthdr.len += m->m_len;
+ }
+ lmp = m;
+
+ if (flags & XNF_RXF_CHUNK) {
+ sc->sc_rx_cbuf[0] = fmp;
+ sc->sc_rx_cbuf[1] = lmp;
+ continue;
+ }
+
+ m = fmp;
+
+ ml_enqueue(&ml, m);
+ sc->sc_rx_cbuf[0] = sc->sc_rx_cbuf[1] =
+ fmp = lmp = NULL;
+ }
+
+ if (pkts > 0) {
+ sc->sc_rx_cons = cons;
+ membar_producer();
+ rxr->rxr_rsp_evt = cons + 1;
+ pkts = 0;
+ }
+
+ r = rxr->rxr_cons - sc->sc_rx_cons;
+ membar_consumer();
+ } while (r > 0);
+
+ if (!ml_empty(&ml)) {
+ if_input(ifp, &ml);
+
+ xnf_rx_ring_fill(sc);
+ }
+
+ return (0);
+}
+
+void
+xnf_rx_ring_fill(void *arg)
+{
+ struct xnf_softc *sc = arg;
+ struct ifnet *ifp = &sc->sc_ac.ac_if;
+ struct xnf_rx_ring *rxr = sc->sc_rx_ring;
+ bus_dmamap_t dmap;
+ struct mbuf *m;
+ uint32_t cons, prod;
+ static int timer = 0;
+ int i, n;
+
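+	/*
+	 * Replenish the Rx ring with mbuf clusters; if no new slots are
+	 * available and the ring is running low, re-arm the fill timeout
+	 * with exponential backoff.
+	 */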
+ cons = rxr->rxr_cons;
+ prod = rxr->rxr_prod;
+
+ n = if_rxr_get(&sc->sc_rx_slots, XNF_RX_DESC);
+
+ /* Less than XNF_RX_MIN slots available? */
+ if (n == 0 && prod - cons < XNF_RX_MIN) {
+ if (ifp->if_flags & IFF_RUNNING)
+ timeout_add(&sc->sc_rx_fill, 1 << timer);
+ if (timer < 10)
+ timer++;
+ return;
+ }
+
+ for (; n > 0; prod++, n--) {
+ i = prod & (XNF_RX_DESC - 1);
+ if (sc->sc_rx_buf[i])
+ break;
+ m = MCLGETI(NULL, M_DONTWAIT, NULL, XNF_MCLEN);
+ if (m == NULL)
+ break;
+ m->m_len = m->m_pkthdr.len = XNF_MCLEN;
+ dmap = sc->sc_rx_dmap[i];
+ if (bus_dmamap_load_mbuf(sc->sc_dmat, dmap, m, BUS_DMA_READ |
+ BUS_DMA_NOWAIT)) {
+ m_freem(m);
+ break;
+ }
+ sc->sc_rx_buf[i] = m;
+ rxr->rxr_desc[i].rxd_req.rxq_ref = dmap->dm_segs[0].ds_addr;
+ }
+
+ if (n > 0)
+ if_rxr_put(&sc->sc_rx_slots, n);
+
+ membar_producer();
+ rxr->rxr_prod = prod;
+
+ xen_intr_signal(sc->sc_xih);
+}
+
+int
+xnf_rx_ring_create(struct xnf_softc *sc)
+{
+ int i, rsegs;
+
+ /* Allocate a page of memory for the ring */
+ if (bus_dmamem_alloc(sc->sc_dmat, PAGE_SIZE, PAGE_SIZE, 0,
+ &sc->sc_rx_seg, 1, &rsegs, BUS_DMA_ZERO | BUS_DMA_WAITOK)) {
+ printf("%s: failed to allocate memory for the rx ring\n",
+ sc->sc_dev.dv_xname);
+ return (-1);
+ }
+ /* Map in the allocated memory into the ring structure */
+ if (bus_dmamem_map(sc->sc_dmat, &sc->sc_rx_seg, 1, PAGE_SIZE,
+ (caddr_t *)(&sc->sc_rx_ring), BUS_DMA_WAITOK)) {
+ printf("%s: failed to map memory for the rx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Create a map to load the ring memory into */
+ if (bus_dmamap_create(sc->sc_dmat, PAGE_SIZE, 1, PAGE_SIZE, 0,
+ BUS_DMA_WAITOK, &sc->sc_rx_rmap)) {
+ printf("%s: failed to create a memory map for the rx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Load the ring into the ring map to extract the PA */
+ if (bus_dmamap_load(sc->sc_dmat, sc->sc_rx_rmap, sc->sc_rx_ring,
+ PAGE_SIZE, NULL, BUS_DMA_WAITOK)) {
+ printf("%s: failed to load the rx ring map\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ sc->sc_rx_ref = sc->sc_rx_rmap->dm_segs[0].ds_addr;
+
+ sc->sc_rx_ring->rxr_req_evt = sc->sc_rx_ring->rxr_rsp_evt = 1;
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (bus_dmamap_create(sc->sc_dmat, XNF_MCLEN, 1,
+ XNF_MCLEN, 0, BUS_DMA_WAITOK, &sc->sc_rx_dmap[i])) {
+ printf("%s: failed to create a memory map for the rx "
+ "slot %d/%d\n", sc->sc_dev.dv_xname, i,
+ XNF_RX_DESC);
+ goto errout;
+ }
+ sc->sc_rx_ring->rxr_desc[i].rxd_req.rxq_id = i;
+ }
+
+ if_rxr_init(&sc->sc_rx_slots, XNF_RX_MIN, XNF_RX_DESC);
+ xnf_rx_ring_fill(sc);
+
+ return (0);
+
+ errout:
+	xnf_rx_ring_destroy(sc);
+	return (-1);
+}
+
+void
+xnf_rx_ring_drain(struct xnf_softc *sc)
+{
+ struct xnf_rx_ring *rxr = sc->sc_rx_ring;
+
+ if (sc->sc_rx_cons != rxr->rxr_cons)
+ xnf_rxeof(sc);
+}
+
+void
+xnf_rx_ring_destroy(struct xnf_softc *sc)
+{
+ int i, slots = 0;
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (sc->sc_rx_buf[i] == NULL)
+ continue;
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_rx_dmap[i]);
+ m_freem(sc->sc_rx_buf[i]);
+ sc->sc_rx_buf[i] = NULL;
+ slots++;
+ }
+ printf("%s: unload done\n", __func__);
+ if_rxr_put(&sc->sc_rx_slots, slots);
+ printf("%s: rxr_put done\n", __func__);
+
+ for (i = 0; i < XNF_RX_DESC; i++) {
+ if (sc->sc_rx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_rx_dmap[i]);
+ sc->sc_rx_dmap[i] = NULL;
+ }
+ printf("%s: desc map destroy done\n", __func__);
+ if (sc->sc_rx_rmap) {
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_rx_rmap);
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_rx_rmap);
+ }
+ printf("%s: ring map destroy done\n", __func__);
+ if (sc->sc_rx_ring) {
+ bus_dmamem_unmap(sc->sc_dmat, (caddr_t)sc->sc_rx_ring,
+ PAGE_SIZE);
+ bus_dmamem_free(sc->sc_dmat, &sc->sc_rx_seg, 1);
+ }
+ printf("%s: ring mem free done\n", __func__);
+ sc->sc_rx_ring = NULL;
+ sc->sc_rx_rmap = NULL;
+ sc->sc_rx_cons = 0;
+}
+
+int
+xnf_tx_ring_create(struct xnf_softc *sc)
+{
+ int i, rsegs;
+
+ /* Allocate a page of memory for the ring */
+ if (bus_dmamem_alloc(sc->sc_dmat, PAGE_SIZE, PAGE_SIZE, 0,
+ &sc->sc_tx_seg, 1, &rsegs, BUS_DMA_ZERO | BUS_DMA_WAITOK)) {
+ printf("%s: failed to allocate memory for the tx ring\n",
+ sc->sc_dev.dv_xname);
+ return (-1);
+ }
+ /* Map in the allocated memory into the ring structure */
+ if (bus_dmamem_map(sc->sc_dmat, &sc->sc_tx_seg, 1, PAGE_SIZE,
+ (caddr_t *)&sc->sc_tx_ring, BUS_DMA_WAITOK)) {
+ printf("%s: failed to map memory for the tx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Create a map to load the ring memory into */
+ if (bus_dmamap_create(sc->sc_dmat, PAGE_SIZE, 1, PAGE_SIZE, 0,
+ BUS_DMA_WAITOK, &sc->sc_tx_rmap)) {
+ printf("%s: failed to create a memory map for the tx ring\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ /* Load the ring into the ring map to extract the PA */
+ if (bus_dmamap_load(sc->sc_dmat, sc->sc_tx_rmap, sc->sc_tx_ring,
+ PAGE_SIZE, NULL, BUS_DMA_WAITOK)) {
+ printf("%s: failed to load the tx ring map\n",
+ sc->sc_dev.dv_xname);
+ goto errout;
+ }
+ sc->sc_tx_ref = sc->sc_tx_rmap->dm_segs[0].ds_addr;
+
+ sc->sc_tx_ring->txr_req_evt = sc->sc_tx_ring->txr_rsp_evt = 1;
+
+ for (i = 0; i < XNF_TX_DESC; i++) {
+ if (bus_dmamap_create(sc->sc_dmat, XNF_MCLEN, XNF_TX_FRAG,
+ XNF_MCLEN, 0, BUS_DMA_WAITOK, &sc->sc_tx_dmap[i])) {
+ printf("%s: failed to create a memory map for the tx "
+ "slot %d/%d\n", sc->sc_dev.dv_xname, i,
+ XNF_TX_DESC);
+ goto errout;
+ }
+ sc->sc_tx_ring->txr_desc[i].txd_req.txq_id = i;
+ }
+
+ return (0);
+
+ errout:
+	xnf_tx_ring_destroy(sc);
+	return (-1);
+}
+
+void
+xnf_tx_ring_drain(struct xnf_softc *sc)
+{
+ struct xnf_tx_ring *txr = sc->sc_tx_ring;
+
+ if (sc->sc_tx_cons != txr->txr_cons)
+ xnf_txeof(sc);
+}
+
+void
+xnf_tx_ring_destroy(struct xnf_softc *sc)
+{
+ int i;
+
+ for (i = 0; i < XNF_TX_DESC; i++) {
+ if (sc->sc_tx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_tx_dmap[i]);
+ if (sc->sc_tx_buf[i] == NULL)
+ continue;
+ m_freem(sc->sc_tx_buf[i]);
+ sc->sc_tx_buf[i] = NULL;
+ }
+ for (i = 0; i < XNF_TX_DESC; i++) {
+ if (sc->sc_tx_dmap[i] == NULL)
+ continue;
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_tx_dmap[i]);
+ sc->sc_tx_dmap[i] = NULL;
+ }
+ if (sc->sc_tx_rmap) {
+ bus_dmamap_unload(sc->sc_dmat, sc->sc_tx_rmap);
+ bus_dmamap_destroy(sc->sc_dmat, sc->sc_tx_rmap);
+ }
+ if (sc->sc_tx_ring) {
+ bus_dmamem_unmap(sc->sc_dmat, (caddr_t)sc->sc_tx_ring,
+ PAGE_SIZE);
+ bus_dmamem_free(sc->sc_dmat, &sc->sc_tx_seg, 1);
+ }
+ sc->sc_tx_ring = NULL;
+ sc->sc_tx_rmap = NULL;
+}
+
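+/*
+ * Advertise ring references, the event channel and feature flags to the
+ * backend via XenStore and switch the device into the connected state.
+ */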
+int
+xnf_init_backend(struct xnf_softc *sc)
+{
+ const char *prop;
+ char val[32];
+
+ /* Plumb the Rx ring */
+ prop = "rx-ring-ref";
+ snprintf(val, sizeof(val), "%u", sc->sc_rx_ref);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Enable "copy" mode */
+ prop = "request-rx-copy";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Enable notify mode */
+ prop = "feature-rx-notify";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Request multicast filtering */
+ prop = "request-multicast-control";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Plumb the Tx ring */
+ prop = "tx-ring-ref";
+ snprintf(val, sizeof(val), "%u", sc->sc_tx_ref);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ /* Enable transmit scatter-gather mode */
+ prop = "feature-sg";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Disable TCP/UDP checksum offload */
+ prop = "feature-csum-offload";
+ if (xs_setprop(&sc->sc_xa, prop, NULL, 0))
+ goto errout;
+ prop = "feature-no-csum-offload";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+ prop = "feature-ipv6-csum-offload";
+ if (xs_setprop(&sc->sc_xa, prop, NULL, 0))
+ goto errout;
+ prop = "feature-no-ipv6-csum-offload";
+ snprintf(val, sizeof(val), "%u", 1);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ /* Plumb the event channel port */
+ prop = "event-channel";
+ snprintf(val, sizeof(val), "%u", sc->sc_xih);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+	/* Connect the device (state 4 == XenbusStateConnected) */
+ prop = "state";
+ snprintf(val, sizeof(val), "%u", 4);
+ if (xs_setprop(&sc->sc_xa, prop, val, strlen(val)))
+ goto errout;
+
+ return (0);
+
+ errout:
+	printf("%s: failed to set \"%s\" property to \"%s\"\n",
+	    sc->sc_dev.dv_xname, prop, val);
+	return (-1);
+}
--
Anders Berggren
2016-01-23 11:12:52 UTC
Permalink
Post by Reyk Floeter
- I didn't work on m4.10xlarge (see cvs:~reyk/dmesg.m4.10xlarge).
I didn’t see any mentions of it in the dmesg https://gist.github.com/reyk/b372af303eb86bab3fee, but could it be that those machine classes (*x*large-ish) use Intel NICs with SR-IOV (ixgbe/ixv-ish) by default, per http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html? Last time I tried, sriovNetSupport couldn’t be disabled after the AMI/VM was created, and I had to use the "aws ec2 register-image …” commands, because the AWS web console didn’t offer any way to create a machine without it...
Reyk Floeter
2016-01-23 11:19:29 UTC
Permalink
Post by Anders Berggren
Post by Reyk Floeter
- I didn't work on m4.10xlarge (see cvs:~reyk/dmesg.m4.10xlarge).
I didn’t see any mentions of it in the dmesg https://gist.github.com/reyk/b372af303eb86bab3fee but could it be that those machine classes (*x*large-ish) uses Intel NICs with SR-IOV (ixgbe/ixv-ish) by default http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking.html. Last time I tried, sriovNetSupport couldn’t be disabled after the AMI/VM was created, and I had to use the "aws ec2 register-image …” commands, because the AWS web console didn’t offer any web to create a machine without it...
No, you have to *enable* SR-IOV in the image.

Machines with the Intel NIC will not show any netfront in the device list via XenStore (just try Ubuntu).

Reyk
Jonathon Sisson
2016-01-23 21:27:09 UTC
Permalink
Post by Reyk Floeter
No, you have to *enable* SR-IOV in the image.
Machines with the Intel NIC will not show any netfront in the device list via XenStore (just try Ubuntu).
Reyk
That's correct, but I think what was being pointed out is that
an instance with SRIOV enabled cannot have it *disabled* (i.e.
to switch back to xnf NICs). I was able to get xnf operational
on a c3.large (enhanced networking-capable) by creating an instance
with CentOS and swapping the root volume out. Any AMI constructed
on Amazon Linux or Ubuntu will have enhanced networking enabled
by default, whereas CentOS doesn't appear to have it enabled (unless
you manually enable it).
Reyk Floeter
2016-01-23 21:57:21 UTC
Permalink
Jonathon Sisson
2016-01-23 22:18:17 UTC
Permalink
Post by Reyk Floeter
Post by Jonathon Sisson
Post by Reyk Floeter
No, you have to *enable* SR-IOV in the image.
Machines with the Intel NIC will not show any netfront in the device list via XenStore (just try Ubuntu).
Reyk
That's correct, but I think what was being pointed out is that
an instance with SRIOV enabled cannot have it *disabled* (i.e.
to switch back to xnf NICs). I was able to get xnf operational
on a c3.large (enhanced networking-capable) by creating an instance
with CentOS and swapping the root volume out. Any AMI constructed
on Amazon Linux or Ubuntu will have enhanced networking enabled
by default, whereas CentOS doesn't appear to have it enabled (unless
you manually enable it).
Ah, OK.
I recommend uploading new images or using my public OpenBSD
images to bootstrap new AMIs.
The "dd from Linux" trick is just a hack if you don't want to install the
aws and ec2 cli tools - but we have ports now.
Reyk
Fair enough =)

I wasn't certain if the experimental images were considered ready
for testing. I'll switch to using them for any other testing I do.

Speaking of testing, is there any particular area non-devs could
assist with at this time? Gathering dmesgs for different instance
types?
Jonathon Sisson
2016-01-24 05:49:00 UTC
Permalink
Post by Jonathon Sisson
Speaking of testing, is there any particular area non-devs could
assist with at this time? Gathering dmesgs for different instance
types?
I decided to spin up one of each instance type and grab the console
output in case it would be beneficial to the on-going work:

http://update.j3z.org/dmesg/c3.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/c3.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/c3.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/c3.large_dmesg.txt
http://update.j3z.org/dmesg/c3.xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.large_dmesg.txt
http://update.j3z.org/dmesg/c4.xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.xlarge_dmesg.txt
http://update.j3z.org/dmesg/g2.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/g2.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.xlarge_dmesg.txt
http://update.j3z.org/dmesg/m3.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/m3.large_dmesg.txt
http://update.j3z.org/dmesg/m3.medium_dmesg.txt
http://update.j3z.org/dmesg/m3.xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.10xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.large_dmesg.txt
http://update.j3z.org/dmesg/m4.xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.large_dmesg.txt
http://update.j3z.org/dmesg/r3.xlarge_dmesg.txt
http://update.j3z.org/dmesg/t2.large_dmesg.txt
http://update.j3z.org/dmesg/t2.medium_dmesg.txt
http://update.j3z.org/dmesg/t2.micro_dmesg.txt
http://update.j3z.org/dmesg/t2.nano_dmesg.txt
http://update.j3z.org/dmesg/t2.small_dmesg.txt

If it is deemed helpful, I can keep them updated as
new AMIs come out.

Thanks!

-Jonathon
Mike Belopuhov
2016-01-24 13:16:37 UTC
Permalink
Hi Jonathon,

Thanks a lot for taking your time to test this.
Post by Jonathon Sisson
Post by Jonathon Sisson
Speaking of testing, is there any particular area non-devs could
assist with at this time? Gathering dmesgs for different instance
types?
Trying newer kernels would be the most helpful. I've just enabled tcp/udp
checksum offloading in the xnf on Friday and would welcome any network
tests.
Post by Jonathon Sisson
I decided to spin up one of each instance type and grab the console
http://update.j3z.org/dmesg/c3.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/c3.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/c3.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/c3.large_dmesg.txt
http://update.j3z.org/dmesg/c3.xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/c4.large_dmesg.txt
http://update.j3z.org/dmesg/c4.xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/d2.xlarge_dmesg.txt
http://update.j3z.org/dmesg/g2.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/g2.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/i2.xlarge_dmesg.txt
http://update.j3z.org/dmesg/m3.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/m3.large_dmesg.txt
http://update.j3z.org/dmesg/m3.medium_dmesg.txt
http://update.j3z.org/dmesg/m3.xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.10xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/m4.large_dmesg.txt
http://update.j3z.org/dmesg/m4.xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.2xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.4xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.8xlarge_dmesg.txt
http://update.j3z.org/dmesg/r3.large_dmesg.txt
http://update.j3z.org/dmesg/r3.xlarge_dmesg.txt
http://update.j3z.org/dmesg/t2.large_dmesg.txt
http://update.j3z.org/dmesg/t2.medium_dmesg.txt
http://update.j3z.org/dmesg/t2.micro_dmesg.txt
http://update.j3z.org/dmesg/t2.nano_dmesg.txt
http://update.j3z.org/dmesg/t2.small_dmesg.txt
If it is deemed helpful, I can keep them updated as
new AMIs come out.
It would be very interesting to see newer code run on these.
Post by Jonathon Sisson
Thanks!
-Jonathon
Cheers,
Mike
Jonathon Sisson
2016-01-24 19:55:23 UTC
Permalink
Post by Mike Belopuhov
Hi Jonathon,
Thanks a lot for taking your time to test this.
No, thank you guys for all of the work you're doing to get
this working. I'm just a user heh.
Post by Mike Belopuhov
Trying newer kernels would be the most helpful. I've just enabled tcp/udp
checksum offloading in the xnf on Friday and would welcome any network
tests.
I rebuilt with a source checkout earlier today, and after
rebooting to the new kernel I can't seem to get a dhcp lease.
I'm working on building userland to determine if there is
some issue with dhclient, but I haven't finished that step
yet. Has anyone else noted the dhcp issue?
Mike Belopuhov
2016-01-24 20:08:32 UTC
Permalink
Post by Jonathon Sisson
Post by Mike Belopuhov
Hi Jonathon,
Thanks a lot for taking your time to test this.
No, thank you guys for all of the work you're doing to get
this working. I'm just a user heh.
Post by Mike Belopuhov
Trying newer kernels would be the most helpful. I've just enabled tcp/udp
checksum offloading in the xnf on Friday and would welcome any network
tests.
I rebuilt with a source checkout earlier today, and after
rebooting to the new kernel I can't seem to get a dhcp lease.
I'm working on building userland to determine if there is
some issue with dhclient, but I haven't finished that step
yet. Has anyone else noted the dhcp issue?
I haven't seen that on my test box (not AWS), but maybe reverting
the minimum number of rx slots back to 32 can help?

http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/dev/pv/if_xnf.c.diff?r1=1.9&r2=1.10
Jonathon Sisson
2016-01-24 21:22:20 UTC
Permalink
Post by Mike Belopuhov
Post by Jonathon Sisson
Post by Mike Belopuhov
Hi Jonathon,
Thanks a lot for taking your time to test this.
No, thank you guys for all of the work you're doing to get
this working. I'm just a user heh.
Post by Mike Belopuhov
Trying newer kernels would be the most helpful. I've just enabled tcp/udp
checksum offloading in the xnf on Friday and would welcome any network
tests.
I rebuilt with a source checkout earlier today, and after
rebooting to the new kernel I can't seem to get a dhcp lease.
I'm working on building userland to determine if there is
some issue with dhclient, but I haven't finished that step
yet. Has anyone else noted the dhcp issue?
I haven't seen that on my test box (not AWS), but maybe reverting
the minimum number of rx slots back to 32 can help?
http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/dev/pv/if_xnf.c.diff?r1=1.9&r2=1.10
Reverting to 32 fixed the dhcp issue.

I'll go ahead and get those dmesgs for you now =)

Thanks again!
Jonathon Sisson
2016-01-24 22:14:36 UTC
Permalink
Post by Jonathon Sisson
Post by Mike Belopuhov
I haven't seen that on my test box (not AWS), but maybe reverting
the minimum number of rx slots back to 32 can help?
http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/sys/dev/pv/if_xnf.c.diff?r1=1.9&r2=1.10
Reverting to 32 fixed the dhcp issue.
I'll go ahead and get those dmesgs for you now =)
Thanks again!
Mike,

A series of quick iperf tests showed the following:

iperf server: iperf -s (Amazon Linux)
iperf client: iperf -c $SRVIP -dt 300 -i 30

Amazon Linux <-> Amazon Linux (same AZ/VPC subnet)
~690 Mbits. (M3.larges)

OpenBSD-CURRENT <-> Amazon Linux (same AZ/VPC subnet)
~400 Mbits. (M3.larges again)

Each test, I ran the same Amazon Linux instance as the server.

Since I was running bi-directional tests, I did notice that the
OpenBSD machine was slower receiving traffic than sending it:

[ 4] 0.0-30.0 sec 1.48 GBytes 422 Mbits/sec
[ 5] 0.0-30.0 sec 1.08 GBytes 310 Mbits/sec

I chose M3 due to the lack of support for SRIOV, so the Amazon
Linux instances wouldn't utilize it:

[***@ip-172-31-46-242 ~]# ethtool -i eth0 | grep driver
driver: vif

I'm gathering the dmesgs and will post links when I have them
uploaded.

-Jonathon
Jonathon Sisson
2016-01-25 00:38:30 UTC
Permalink
tech@,

I've uploaded a few of the dmesgs gathered to dmesgd.nycbug.org:

http://dmesgd.nycbug.org/index.cgi?action=dmesgd&do=index&fts=Jonathon

Currently I have m4.10xlarge, c4.8xlarge, m3.medium, and t2.nano
uploaded for perusal.

I noticed some new output in the m4.10xlarge console output here:

starting network
DHCPDISCOVER on xnf0 - interval 3
DHCPDISCOVER on xnf0 - interval 5
xnf0: tx prod 2 cons 2,0 evt 3,1
DHCPDISCOVER on xnf0 - interval 8
xnf0: tx prod 3 cons 3,0 evt 4,1
DHCPDISCOVER on xnf0 - interval 10
xnf0: tx prod 4 cons 4,0 evt 5,1
DHCPDISCOVER on xnf0 - interval 15
xnf0: tx prod 5 cons 5,0 evt 6,1
DHCPDISCOVER on xnf0 - interval 20
xnf0: tx prod 6 cons 6,0 evt 7,1
No acceptable DHCPOFFERS received.
No working leases in persistent database - sleeping.

Not certain if this is debug output put there intentionally or
if this is some error condition? At any rate, there it is =)

-Jonathon
Mike Belopuhov
2016-01-25 11:04:03 UTC
Permalink
Post by Jonathon Sisson
http://dmesgd.nycbug.org/index.cgi?action=dmesgd&do=index&fts=Jonathon
Currently I have m4.10xlarge, c4.8xlarge, m3.medium, and t2.nano
uploaded for perusal.
Thanks!
Post by Jonathon Sisson
starting network
DHCPDISCOVER on xnf0 - interval 3
DHCPDISCOVER on xnf0 - interval 5
xnf0: tx prod 2 cons 2,0 evt 3,1
DHCPDISCOVER on xnf0 - interval 8
xnf0: tx prod 3 cons 3,0 evt 4,1
DHCPDISCOVER on xnf0 - interval 10
xnf0: tx prod 4 cons 4,0 evt 5,1
DHCPDISCOVER on xnf0 - interval 15
xnf0: tx prod 5 cons 5,0 evt 6,1
DHCPDISCOVER on xnf0 - interval 20
xnf0: tx prod 6 cons 6,0 evt 7,1
No acceptable DHCPOFFERS received.
No working leases in persistent database - sleeping.
Not certain if this is debug output put there intentionally or
Yes.
Post by Jonathon Sisson
if this is some error condition?
It is. Transmission is stuck and these are watchdog timeouts.
Did it get a lease on c4.8xlarge? So far it looks like it happens
on m4.10xlarge instance only. No idea why.
Post by Jonathon Sisson
At any rate, there it is =)
-Jonathon
Jonathon Sisson
2016-01-25 16:17:37 UTC
Permalink
Post by Mike Belopuhov
Post by Jonathon Sisson
Not certain if this is debug output put there intentionally or
Yes.
Post by Jonathon Sisson
if this is some error condition?
It is. Transmission is stuck and these are watchdog timeouts.
Did it get a lease on c4.8xlarge? So far it looks like it happens
on m4.10xlarge instance only. No idea why.
Here's the console output:

***@host:~$ grep 'tx prod' dmesg/*
dmesg/c4.8xlarge_dmesg.txt:xnf0: tx prod 2 cons 2,0 evt 3,1
dmesg/c4.8xlarge_dmesg.txt:xnf0: tx prod 3 cons 3,0 evt 4,1
dmesg/c4.8xlarge_dmesg.txt:xnf0: tx prod 4 cons 4,0 evt 5,1
dmesg/c4.8xlarge_dmesg.txt:xnf0: tx prod 5 cons 5,0 evt 6,1
dmesg/c4.8xlarge_dmesg.txt:xnf0: tx prod 6 cons 6,0 evt 7,1
dmesg/c4.8xlarge_dmesg.txt:xnf0: tx prod 7 cons 7,0 evt 8,1
dmesg/d2.8xlarge_dmesg.txt:xnf0: tx prod 2 cons 2,0 evt 3,1
dmesg/d2.8xlarge_dmesg.txt:xnf0: tx prod 3 cons 3,0 evt 4,1
dmesg/d2.8xlarge_dmesg.txt:xnf0: tx prod 4 cons 4,0 evt 5,1
dmesg/d2.8xlarge_dmesg.txt:xnf0: tx prod 5 cons 5,0 evt 6,1
dmesg/d2.8xlarge_dmesg.txt:xnf0: tx prod 6 cons 6,0 evt 7,1
dmesg/g2.8xlarge_dmesg.txt:xnf0: tx prod 5 cons 5,0 evt 6,1
dmesg/g2.8xlarge_dmesg.txt:xnf0: tx prod 6 cons 6,0 evt 7,1
dmesg/i2.8xlarge_dmesg.txt:xnf0: tx prod 3 cons 3,0 evt 4,1
dmesg/i2.8xlarge_dmesg.txt:xnf0: tx prod 4 cons 4,0 evt 5,1
dmesg/i2.8xlarge_dmesg.txt:xnf0: tx prod 5 cons 5,0 evt 6,1
dmesg/i2.8xlarge_dmesg.txt:xnf0: tx prod 6 cons 6,0 evt 7,1
dmesg/i2.8xlarge_dmesg.txt:xnf0: tx prod 7 cons 7,0 evt 8,1
dmesg/m4.10xlarge_dmesg.txt:xnf0: tx prod 2 cons 2,0 evt 3,1
dmesg/m4.10xlarge_dmesg.txt:xnf0: tx prod 3 cons 3,0 evt 4,1
dmesg/m4.10xlarge_dmesg.txt:xnf0: tx prod 4 cons 4,0 evt 5,1
dmesg/m4.10xlarge_dmesg.txt:xnf0: tx prod 5 cons 5,0 evt 6,1
dmesg/m4.10xlarge_dmesg.txt:xnf0: tx prod 6 cons 6,0 evt 7,1
dmesg/r3.8xlarge_dmesg.txt:xnf0: tx prod 5 cons 5,0 evt 6,1
dmesg/r3.8xlarge_dmesg.txt:xnf0: tx prod 6 cons 6,0 evt 7,1
dmesg/r3.8xlarge_dmesg.txt:xnf0: tx prod 7 cons 7,0 evt 8,1
dmesg/r3.8xlarge_dmesg.txt:xnf0: tx prod 8 cons 8,0 evt 9,1

It happens on c4.8x, d2.8x, g2.8x, i2.8x, m4.10x, r3.8x.
Basically all of the largest instance sizes... on newer
gen instance types that support enhanced networking?

I can re-test these and see if it occurs frequently or
if it was just a fluke. I'll update in a bit.
